
Coupang Scraper - Extract Coupang Product Listings

RealdataAPI / coupang-scraper

At Real Data API, we provide powerful solutions to help businesses unlock e-commerce insights with our Coupang Scraper - Extract Coupang Product Listings service. By leveraging our advanced tools, you can access structured product information, pricing trends, and competitor analysis from South Korea’s largest online marketplace. Our specialized Coupang grocery scraper allows retailers, FMCG brands, and analytics firms to gather accurate grocery product data for monitoring demand and customer preferences. With our scalable Coupang API scraping technology, businesses can integrate live data directly into their systems, ensuring real-time visibility into changing market dynamics. Whether you need a comprehensive Grocery Dataset for research, pricing intelligence, or sales optimization, Real Data API delivers tailored datasets that empower smarter decision-making. Gain a competitive edge in the Korean e-commerce space with our reliable, accurate, and efficient Coupang scraping solutions.

What is Coupang Data Scraper, and How Does It Work?

A Coupang Data Scraper is a powerful tool designed to Scrape Coupang product data and deliver it in structured formats for business use. By automating data extraction, it collects details such as product titles, categories, descriptions, reviews, and prices directly from Coupang’s marketplace. With advanced Coupang price scraping technology, businesses can gain visibility into real-time pricing changes and competitor strategies. The scraper functions through automated crawlers that navigate Coupang’s product pages, identify relevant attributes, and store them in datasets for easy analysis. Retailers, brands, and market researchers rely on these insights to track trends, optimize inventory, and improve pricing strategies. Whether you are a retailer expanding into South Korea or an analyst studying e-commerce shifts, a Coupang Data Scraper provides fast, reliable, and scalable access to one of Asia’s largest online marketplaces, making smarter decisions possible.
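
As a simplified illustration of that workflow, the short sketch below shows how a fetched page might be turned into a structured record with BeautifulSoup. The CSS selectors are placeholders and would need to match Coupang's actual markup; a fuller, production-style example appears later on this page.

# Minimal sketch of the scraping mechanism described above.
# The CSS selectors below are illustrative placeholders, not Coupang's confirmed markup.
from bs4 import BeautifulSoup

def extract_listing_fields(html: str) -> dict:
    """Turn a fetched listing page into a structured record."""
    soup = BeautifulSoup(html, "lxml")
    title = soup.select_one("div.name")            # placeholder selector
    price = soup.select_one("strong.price-value")  # placeholder selector
    return {
        "title": title.get_text(strip=True) if title else None,
        "price": price.get_text(strip=True) if price else None,
    }

# Usage idea (assuming the page is publicly accessible and scraping it is permitted):
#   html = requests.get("https://www.coupang.com/np/search?q=milk", timeout=20).text
#   print(extract_listing_fields(html))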

Why Extract Data from Coupang?

Extracting data from Coupang helps businesses uncover powerful market insights in one of Asia’s fastest-growing e-commerce ecosystems. With Coupang grocery delivery data extractor solutions, companies can monitor demand for essential household and FMCG products, ensuring they align with customer shopping behaviors. Similarly, Coupang grocery product data extraction services allow businesses to access SKU-level details, product availability, and delivery trends across regions. By analyzing this information, brands can identify high-demand categories, optimize pricing, and manage inventory more effectively. Data extraction also empowers businesses to track competitor strategies and adapt to changing consumer preferences quickly. Whether you're in retail, FMCG, or logistics, gaining access to structured Coupang data ensures stronger decision-making, improved forecasting, and faster market adaptation. Ultimately, extracting data from Coupang provides a competitive advantage in a rapidly evolving e-commerce landscape where real-time insights drive growth.

Is It Legal to Extract Coupang Data?

The legality of extracting Coupang data depends on the method and purpose of use. Many businesses rely on compliant tools like Real-time Coupang delivery data API to access publicly available information in structured formats. This approach ensures data gathering stays ethical and avoids violating terms of service. Solutions such as Extract Coupang product listings allow organizations to capture product data transparently while respecting platform rules. Data scraping becomes legal when focused on publicly accessible data and used for market research, analytics, or price intelligence without breaching protected content or personal information. Companies often partner with professional providers who ensure data extraction processes follow legal and compliance standards. By using authorized scraping services, businesses can safely leverage Coupang’s vast e-commerce ecosystem to fuel growth, monitor competitors, and refine decision-making while staying within regulatory boundaries.

How Can I Extract Data from Coupang?

To extract data from Coupang efficiently, businesses rely on automated solutions like Coupang catalog scraper South Korea to gather product, price, and inventory details. These scrapers navigate Coupang’s platform, identify structured data, and deliver it in easy-to-analyze formats such as CSV or JSON. Advanced tools like Coupang Eats Grocery Scraping API provide real-time access to product and delivery insights, helping businesses track changing consumer preferences and market dynamics. Companies can use these tools for pricing intelligence, competitor benchmarking, sales forecasting, and demand planning. For enterprise use, data can be integrated directly into dashboards and ERP systems, ensuring seamless decision-making. By leveraging professional scraping solutions, businesses avoid manual data collection, reduce errors, and gain faster access to the information they need. Extracting data this way ensures consistent insights that strengthen strategy, boost efficiency, and create measurable growth opportunities.
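
As an illustration of how such extracted data can feed pricing intelligence, the sketch below loads a scraped JSON export into pandas and computes basic price statistics. The file name is hypothetical, and the field names follow the sample scraper shown further down this page; adjust both to your actual dataset.

# Illustrative analysis of a scraped Coupang export with pandas.
# "output/coupang_results.json" is a hypothetical file name; field names follow the sample scraper below.
import json
import pandas as pd

with open("output/coupang_results.json", encoding="utf-8") as f:
    items = json.load(f)

df = pd.DataFrame(items)
df["price"] = pd.to_numeric(df["price"], errors="coerce")  # prices were scraped as strings

print("Products scraped:", len(df))
print("Median price:", df["price"].median())
print("Cheapest listings:")
print(df.nsmallest(5, "price")[["title", "price", "product_url"]])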

Do You Want More Coupang Scraping Alternatives?

Yes, there are several alternatives for businesses seeking advanced Coupang data solutions. Tools that Scrape Coupang product data can be combined with custom-built scrapers for niche categories like electronics, groceries, or beauty. For those requiring dynamic insights, Coupang price scraping provides real-time competitor tracking and pricing updates. Beyond traditional scrapers, APIs offer scalable alternatives, enabling direct integration into company systems for continuous monitoring. For example, specialized grocery scraping APIs can capture delivery and inventory details, while catalog scrapers help track massive product datasets. Businesses may also explore multi-market scrapers that extract data from Coupang alongside platforms like Amazon or Gmarket for a broader competitive view. By selecting the right alternative—whether custom scrapers, APIs, or third-party solutions—organizations can tailor data strategies to meet their unique goals, ensuring maximum value from Coupang’s fast-growing e-commerce marketplace.

Input options

Input options define how data is collected, processed, and integrated into a system, ensuring flexibility for diverse business requirements. Companies often choose between manual entry, automated feeds, or API-driven integrations depending on their scale and objectives. For e-commerce and analytics, automated methods like APIs and crawlers streamline workflows by reducing errors and ensuring real-time accuracy. Manual inputs may still be useful for small datasets or one-time tasks but lack scalability. API integrations allow seamless data flow from multiple sources, while bulk upload tools help manage large datasets efficiently. Configurable input options also provide compatibility with different file formats such as CSV, JSON, or XML, giving teams the freedom to align inputs with existing systems. Ultimately, robust input options enhance usability, minimize inefficiencies, and ensure reliable access to structured data that drives smarter decision-making.
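
By way of illustration, the snippet below expresses a scraping job's input options as a Python dict and serializes it to JSON. The field names (query, pages, output_format, fields) are hypothetical and only show how configurable inputs might be structured; they are not the schema of any specific Real Data API actor.

# Illustrative only: a hypothetical input configuration serialized as JSON.
# Field names are assumptions, not a confirmed actor input schema.
import json

input_config = {
    "query": "라면",             # search term to scrape
    "pages": 3,                  # number of search pages to crawl
    "output_format": "json",     # "json", "csv", or "xml"
    "fields": ["title", "price", "rating", "review_count"],
}

with open("input.json", "w", encoding="utf-8") as f:
    json.dump(input_config, f, ensure_ascii=False, indent=2)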

Sample Result of Coupang Data Scraper

# Sample Result of Coupang Data Scraper
# Detailed Python code (async) to extract product listings from Coupang search result pages
# - Uses aiohttp + asyncio for concurrency
# - Parses HTML with BeautifulSoup
# - Includes polite rate limiting, retry/backoff, and rotating user-agents
# - Outputs JSON and CSV
#
# Requirements:
# pip install aiohttp aiodns cchardet async-timeout beautifulsoup4 lxml pandas tqdm
#
# NOTE: This is example code for educational and legitimate scraping (rate-limited, respectful).
# Adjust selectors if Coupang HTML layout changes.

import asyncio
import aiohttp
import async_timeout
import random
import re
import time
import json
from typing import List, Dict, Optional
from urllib.parse import quote
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm.asyncio import tqdm_asyncio
import pandas as pd

# --- Configuration ---
CONCURRENT_REQUESTS = 6
REQUEST_TIMEOUT = 20 # seconds
MAX_RETRIES = 3
BACKOFF_BASE = 1.5 # exponential backoff base multiplier
RATE_LIMIT_SECONDS = 0.5 # minimum delay between requests per worker
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/15.6 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/119.0.0.0 Safari/537.36",
]

# Coupang search URL template (category-free search)
# Example: https://www.coupang.com/np/search?q=milk&page=1
SEARCH_URL = "https://www.coupang.com/np/search?q={query}&page={page}"

# --- Utility functions ---

def random_headers() -> Dict[str, str]:
    ua = random.choice(USER_AGENTS)
    return {
        "User-Agent": ua,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.coupang.com/",
    }

async def fetch_html(session: aiohttp.ClientSession, url: str, retries: int = 0) -> Optional[str]:
    """Fetch HTML with retry and exponential backoff."""
    try:
        async with async_timeout.timeout(REQUEST_TIMEOUT):
            async with session.get(url, headers=random_headers(), allow_redirects=True) as resp:
                # Basic status check
                if resp.status == 200:
                    text = await resp.text()
                    return text
                # handle transient 429/5xx
                if resp.status in (429, 500, 502, 503, 504) and retries < MAX_RETRIES:
                    wait = (BACKOFF_BASE ** retries) + random.random()
                    await asyncio.sleep(wait)
                    return await fetch_html(session, url, retries + 1)
                return None
    except (asyncio.TimeoutError, aiohttp.ClientError):
        if retries < MAX_RETRIES:
            wait = (BACKOFF_BASE ** retries) + random.random()
            await asyncio.sleep(wait)
            return await fetch_html(session, url, retries + 1)
        return None

# --- Parsers: adjust selectors if Coupang updates its layout ---

def parse_search_listings(html: str) -> List[Dict]:
    """Parses Coupang search results HTML and returns a list of product summary dicts.
    Typical fields: product_id, title, price, original_price, rating, review_count, product_url, image"""
    soup = BeautifulSoup(html, "lxml")
    results = []
    # Coupang uses <li class="search-product"> for each product in many layouts.
    product_nodes = soup.select("li.search-product")
    if not product_nodes:
        # alternative: some layouts may use div.something - try a broader selector
        product_nodes = soup.select("li[class*='search-product']")

    for node in product_nodes:
        # Skip sponsored or ad blocks by CSS classes if necessary
        try:
            prod = {}
            a = node.select_one("a.search-product-link")
            # fallback to generic link
            if not a:
                a = node.select_one("a[href*='/vp/products/'], a[href*='/products/']")
            href = a["href"].strip() if a and a.has_attr("href") else None
            if href:
                # Normalize to full url (Coupang uses relative paths)
                if href.startswith("/"):
                    prod["product_url"] = f"https://www.coupang.com{href}"
                else:
                    prod["product_url"] = href
            else:
                prod["product_url"] = None

            title_node = node.select_one("div.name") or node.select_one("div.search-product__title") or node.select_one("strong")
            prod["title"] = title_node.get_text(strip=True) if title_node else None

            price_node = node.select_one("strong.price-value") or node.select_one("span.price")
            if price_node:
                price_text = price_node.get_text(strip=True).replace(",", "")
                # Remove non-digits
                prod["price"] = "".join(ch for ch in price_text if (ch.isdigit() or ch == "."))
            else:
                prod["price"] = None

            original_price_node = node.select_one("del.price-original") or node.select_one("span.price-original")
            prod["original_price"] = (
                "".join(ch for ch in original_price_node.get_text(strip=True) if (ch.isdigit() or ch == "."))
                if original_price_node
                else None
            )

            rating_node = node.select_one("em.rating") or node.select_one("span.rating") or node.select_one("span.star")
            prod["rating"] = rating_node.get_text(strip=True) if rating_node else None

            review_node = node.select_one("span.rating-total-count") or node.select_one("span.review-count")
            if review_node:
                # often like "(123)"
                rc = review_node.get_text(strip=True).replace("(", "").replace(")", "").replace(",", "")
                prod["review_count"] = rc
            else:
                prod["review_count"] = None

            img_node = node.select_one("img")
            prod["image_url"] = img_node["src"] if img_node and img_node.has_attr("src") else (img_node["data-src"] if img_node and img_node.has_attr("data-src") else None)

            # Product id extraction from URL if available (/vp/products/{id})
            pid = None
            if prod["product_url"]:
                m = re.search(r"/vp/products/(\d+)|/products/(\d+)", prod["product_url"])
                if m:
                    pid = m.group(1) or m.group(2)
            prod["product_id"] = pid

            results.append(prod)
        except Exception:
            # skip nodes that fail parsing
            continue
    return results

def parse_product_detail(html: str) -> Dict:
    """Parses product detail page for more fields: description, seller, detailed price, stock/delivery, features, etc.
    Adjust selectors to actual Coupang detail page structure."""
    soup = BeautifulSoup(html, "lxml")
    data = {}
    # Title
    t = soup.select_one("h2.prod-buy-header__title, .prod-buy-header__title, .prod-view-title__title, div.product-name")
    data["title"] = t.get_text(strip=True) if t else None

    # Price (detail)
    p = soup.select_one("span.total-price > strong, .price-original, .prod-price")
    if p:
        data["price_detail"] = "".join(ch for ch in p.get_text(strip=True) if (ch.isdigit() or ch == "."))
    else:
        data["price_detail"] = None

    # Seller / Brand
    brand = soup.select_one("a.prod-brand-name, .prod-brand-name, .product-brand")
    data["brand"] = brand.get_text(strip=True) if brand else None

    # Rating and review count
    rating = soup.select_one("span.total-star > em, .rating figure em")
    data["rating_detail"] = rating.get_text(strip=True) if rating else None
    rev = soup.select_one("span.count")
    data["review_count_detail"] = rev.get_text(strip=True).replace("(", "").replace(")", "") if rev else None

    # Description / bullets
    desc = soup.select_one("#productDetail")
    if desc:
        data["description"] = desc.get_text(separator=" ", strip=True)[:5000] # truncate long text
    else:
        data["description"] = None

    return data

# --- Orchestration / Workers ---

class Scraper:
    def __init__(self, concurrency: int = CONCURRENT_REQUESTS):
        self.semaphore = asyncio.Semaphore(concurrency)
        self.session: Optional[aiohttp.ClientSession] = None

    async def __aenter__(self):
        timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT + 10)
        self.session = aiohttp.ClientSession(timeout=timeout)
        return self

    async def __aexit__(self, exc_type, exc, tb):
        if self.session:
            await self.session.close()

    async def fetch_search_page(self, query: str, page: int) -> List[Dict]:
        url = SEARCH_URL.format(query=quote(query), page=page)
        async with self.semaphore:
            html = await fetch_html(self.session, url)
            await asyncio.sleep(RATE_LIMIT_SECONDS + random.random() * 0.5)
            if not html:
                return []
            items = parse_search_listings(html)
            return items

    async def fetch_product_details(self, product_url: str) -> Dict:
        async with self.semaphore:
            html = await fetch_html(self.session, product_url)
            await asyncio.sleep(RATE_LIMIT_SECONDS + random.random() * 0.5)
            if not html:
                return {}
            details = parse_product_detail(html)
            return details

async def scrape_query(query: str, pages: int = 2) -> List[Dict]:
    """Scrape N search pages for a query and enrich product details.
    Returns a list of combined product dicts."""
    async with Scraper() as s:
        # step 1: gather summaries from search pages
        tasks = [s.fetch_search_page(query, p) for p in range(1, pages + 1)]
        page_results = await asyncio.gather(*tasks)
        # flatten and unique by product_url
        summaries = {}
        for page_list in page_results:
            for item in page_list:
                key = item.get("product_url") or item.get("product_id") or item.get("title")
                if not key:
                    continue
                if key not in summaries:
                    summaries[key] = item

        summaries_list = list(summaries.values())

        # step 2: fetch product details concurrently (limit via Semaphore)
        detail_tasks = []
        for item in summaries_list:
            url = item.get("product_url")
            if url:
                detail_tasks.append(s.fetch_product_details(url))
            else:
                detail_tasks.append(asyncio.sleep(0, result={}))

        detailed_results = await tqdm_asyncio.gather(*detail_tasks)
        combined = []
        for base, details in zip(summaries_list, detailed_results):
            merged = {**base, **details}
            combined.append(merged)

        return combined

# --- I/O helpers ---

def save_json(items: List[Dict], filename: str):
    out = OUTPUT_DIR / filename
    with open(out, "w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)

def save_csv(items: List[Dict], filename: str):
    out = OUTPUT_DIR / filename
    if not items:
        return
    # normalize columns
    cols = sorted({k for it in items for k in it.keys()})
    df = pd.DataFrame(items, columns=cols)
    df.to_csv(out, index=False, encoding="utf-8-sig")

# --- Example usage ---

async def main():
    # parameters: query and number of pages to scan
    query = "라면" # example Korean search query, replace as needed (e.g., "milk powder", "diapers")
    pages = 3

    print(f"Scraping Coupang search for query={query!r} pages={pages}")
    items = await scrape_query(query, pages=pages)

    # Save outputs
    timestamp = int(time.time())
    json_file = f"coupang_{query}_results_{timestamp}.json".replace(" ", "_")
    csv_file = f"coupang_{query}_results_{timestamp}.csv".replace(" ", "_")

    save_json(items, json_file)
    save_csv(items, csv_file)
    print(f"Saved {len(items)} items to {OUTPUT_DIR / json_file} and {OUTPUT_DIR / csv_file}")

if __name__ == "__main__":
    asyncio.run(main())

Integrations with Coupang Data Scraper – Coupang Data Extraction

Coupang Data Scraper can be seamlessly integrated with multiple enterprise systems, enabling real-time insights for e-commerce, retail, and logistics. Through Coupang API scraping, businesses can connect extracted product listings, pricing trends, and inventory details directly into their analytics platforms, ERP systems, or CRM dashboards. This ensures decision-makers access structured, up-to-date information without manual intervention. For the grocery sector, the Coupang Eats Grocery Scraping API provides tailored integrations that deliver SKU-level grocery data, delivery availability, and regional demand patterns. These integrations empower FMCG brands, market researchers, and supply chain teams to optimize distribution and pricing strategies effectively. By linking Coupang data with BI dashboards, predictive analytics, or pricing intelligence tools, companies gain a holistic market view. The result is a scalable, automated solution that transforms Coupang’s vast datasets into actionable insights, driving growth and competitive advantage across multiple industries.
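
As a minimal sketch of such an integration, the snippet below forwards previously scraped records to an internal analytics endpoint over HTTP. The endpoint URL, token, and batch size are placeholders for whatever your BI, ERP, or pricing system actually exposes.

# Minimal integration sketch: pushing scraped Coupang records into an internal system.
# The endpoint URL and token are placeholders; replace them with your own integration details.
import json
import requests

ANALYTICS_ENDPOINT = "https://analytics.example.com/api/ingest"  # placeholder URL
API_TOKEN = "YOUR_INTERNAL_TOKEN"                                # placeholder token

def push_records(records: list, batch_size: int = 100) -> None:
    """Send scraped records to an analytics/BI endpoint in batches."""
    headers = {"Authorization": f"Bearer {API_TOKEN}", "Content-Type": "application/json"}
    for i in range(0, len(records), batch_size):
        batch = records[i:i + batch_size]
        resp = requests.post(ANALYTICS_ENDPOINT, headers=headers, data=json.dumps(batch), timeout=30)
        resp.raise_for_status()  # surface ingestion failures early

# Example usage with a previously saved export:
#   with open("output/coupang_results.json", encoding="utf-8") as f:
#       push_records(json.load(f))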

Executing Coupang Data Scraping Actor with Real Data API

Executing a Coupang data scraping actor with Real Data API allows businesses to collect structured product and pricing information efficiently from Coupang. By automating data extraction, companies can access real-time updates on product listings, stock availability, and promotions. The extracted information feeds directly into a Grocery Dataset, enabling analytics teams to monitor trends, track competitor pricing, and optimize inventory management. This structured data supports forecasting, dynamic pricing strategies, and data-driven decision-making. Leveraging Coupang API scraping ensures that data collection is accurate, consistent, and scalable, eliminating manual effort and reducing errors. Integration with a Grocery Dataset allows businesses to consolidate insights across product categories, regions, and timeframes, providing actionable intelligence for operational efficiency and strategic planning in the grocery retail sector.

You need a Real Data API account to run the program examples. Replace the empty token placeholder in the code with your own API token. See the Real Data API docs for more details on the live APIs.

import { RealdataAPIClient } from 'RealDataAPI-client';

// Initialize the RealdataAPIClient with API token
const client = new RealdataAPIClient({
    token: '',
});

// Prepare actor input
const input = {
    "categoryOrProductUrls": [
        {
            "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
        }
    ],
    "maxItems": 100,
    "proxyConfiguration": {
        "useRealDataAPIProxy": true
    }
};

(async () => {
    // Run the actor and wait for it to finish
    const run = await client.actor("junglee/amazon-crawler").call(input);

    // Fetch and print actor results from the run's dataset (if any)
    console.log('Results from dataset');
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    items.forEach((item) => {
        console.dir(item);
    });
})();
from realdataapi_client import RealdataAPIClient

# Initialize the RealdataAPIClient with your API token
client = RealdataAPIClient("")

# Prepare the actor input
run_input = {
    "categoryOrProductUrls": [{ "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5" }],
    "maxItems": 100,
    "proxyConfiguration": { "useRealDataAPIProxy": True },
}

# Run the actor and wait for it to finish
run = client.actor("junglee/amazon-crawler").call(run_input=run_input)

# Fetch and print actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)
# Set API token
API_TOKEN=<YOUR_API_TOKEN>

# Prepare actor input
cat > input.json <<'EOF'
{
  "categoryOrProductUrls": [
    {
      "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
    }
  ],
  "maxItems": 100,
  "proxyConfiguration": {
    "useRealDataAPIProxy": true
  }
}
EOF

# Run the actor
curl "https://api.realdataapi.com/v2/acts/junglee~amazon-crawler/runs?token=$API_TOKEN" \
  -X POST \
  -d @input.json \
  -H 'Content-Type: application/json'

Place the Amazon product URLs

productUrls Required Array

Put one or more URLs of products from Amazon you wish to extract.

Max reviews

maxReviews Optional Integer

Set the maximum number of reviews to scrape. To scrape all reviews, leave this blank.

Link selector

linkSelector Optional String

A CSS selector specifying which links on the page (<a> elements with an href attribute) should be followed and added to the request queue. To filter the links added to the queue, use the Pseudo-URLs and/or Glob patterns settings. If the link selector is empty, page links are ignored. For details, see Link selector in the README.
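
For illustration, the snippet below shows how a link selector might be added to the actor input when using the Python client; the selector value is only an example and must be adapted to the links you actually want the crawler to follow.

# Illustrative only: adding a link selector to the actor input shown earlier (Python client).
run_input = {
    # ...other input fields as in the example above...
    "linkSelector": "a[href*='/dp/']",  # example value: follow product-detail links only
    "maxItems": 100,
}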

Mention personal data

includeGdprSensitive Optional Array

Personal information such as names, IDs, or profile pictures is protected by the GDPR in European countries and by other regulations worldwide. You must not extract personal information without a legitimate legal reason.

Reviews sort

sort Optional String

Choose the sorting criterion for scraped reviews. The default is Amazon's HELPFUL.

Options:

RECENT, HELPFUL

Proxy configuration

proxyConfiguration Required Object

You can select proxy groups from specific countries. Amazon displays products deliverable to the location implied by your proxy. If globally shipped products are sufficient for your needs, this is not a concern.

Extended output function

extendedOutputFunction Optional String

Enter a function that receives a jQuery handle as its argument and returns customized scraped data. The returned data is merged with the default result.

{
  "categoryOrProductUrls": [
    {
      "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
    }
  ],
  "maxItems": 100,
  "detailedInformation": false,
  "useCaptchaSolver": false,
  "proxyConfiguration": {
    "useRealDataAPIProxy": true
  }
}