
Homeplus Mall Scraper - Extract Homeplus Mall Product Listings

RealdataAPI / homeplus-scraper

The Homeplus Mall grocery scraper enables businesses to efficiently capture product listings, categories, pricing, and availability from Homeplus Mall’s online platform. With Homeplus Mall API scraping, companies can automate data collection at scale, eliminating manual effort and ensuring real-time accuracy. This data can be structured into a reliable Grocery Dataset, which supports competitive price tracking, inventory monitoring, and market trend analysis. Retailers can use the scraper to enrich their product catalogs, researchers can build datasets for consumer insights, and eCommerce platforms can integrate fresh information for better user experiences. By automating extraction, businesses gain a faster, smarter way to track promotions, compare pricing strategies, and analyze demand. Real Data API provides a robust and scalable solution for transforming raw Homeplus Mall data into actionable intelligence, helping enterprises stay ahead in the digital grocery market.

What is Homeplus Mall Data Scraper, and How Does It Work?

A Homeplus Mall delivery data scraper is a specialized tool designed to scrape Homeplus Mall product data in real time, collecting product names, categories, prices, images, and availability. It works by automating requests to Homeplus Mall’s website or API endpoints, parsing HTML or JSON responses, and structuring the information into usable datasets. Businesses use it to monitor inventory, track competitor pricing, and gather insights into product performance. The scraper can handle pagination, category filtering, and frequent updates, ensuring accurate and scalable data collection. Companies can integrate the data with analytics tools, dashboards, or eCommerce platforms. By transforming unstructured information into organized records, the Homeplus Mall delivery data scraper provides actionable intelligence that supports market analysis, pricing strategies, and data-driven business decisions.
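
To make that workflow concrete, here is a minimal Python sketch of the request, parse, and structure cycle described above. The endpoint, query parameters, and JSON field names are placeholders for illustration only, not actual Homeplus Mall paths:

import requests

def fetch_products(query, page=1):
    # Request a page of search results (placeholder endpoint)
    resp = requests.get(
        "https://www.homeplusmall.example/search",
        params={"q": query, "page": page},
        timeout=15,
    )
    resp.raise_for_status()
    payload = resp.json()
    # Structure each raw item into a flat, usable record
    return [
        {
            "name": item.get("name", ""),
            "category": item.get("category", ""),
            "price": item.get("price"),
            "availability": item.get("availability", "unknown"),
        }
        for item in payload.get("products", [])
    ]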

Why Extract Data from Homeplus Mall?

Extracting data from Homeplus Mall allows businesses to gain visibility into products, pricing, and promotions. Using a Homeplus Mall grocery delivery data extractor, companies can monitor competitor strategies, track inventory levels, and identify high-demand items. In addition, Homeplus Mall grocery product data extraction provides structured datasets that help with analytics, forecasting, and marketing campaigns. By extracting product information in real time, retailers can update catalogs, benchmark prices, and optimize delivery strategies. Researchers and analysts also benefit from this data for consumer trend analysis and market research. With accurate insights, businesses can enhance decision-making, improve customer experience, and maintain a competitive edge in South Korea’s growing online grocery sector. Extracting Homeplus Mall data ensures timely, relevant, and actionable intelligence for diverse business applications.

Is It Legal to Extract Homeplus Mall Data?

The legality of data extraction depends on method and use. Using a Real-time Homeplus Mall delivery data API ensures compliance by accessing structured data responsibly without violating platform rules. Companies should follow ethical scraping practices, including respecting robots.txt, avoiding personal user data, and limiting request frequency. For business intelligence purposes, it is generally legal to extract Homeplus Mall product listings for analytics, price comparison, or inventory monitoring. Reviewing Homeplus Mall’s terms of service and South Korea’s data privacy regulations is essential. Partnering with reliable providers like Real Data API ensures lawful and scalable data collection. Responsible scraping empowers businesses with market insights while maintaining legal compliance and platform stability, allowing organizations to leverage accurate Homeplus Mall data safely for strategic decision-making.
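
One of those ethical practices, respecting robots.txt, can be automated with the Python standard library. The sketch below checks a site's robots.txt before fetching; the URLs and user-agent string are placeholders:

from urllib.robotparser import RobotFileParser

# Load and parse the site's robots.txt (placeholder domain)
rp = RobotFileParser()
rp.set_url("https://www.homeplusmall.example/robots.txt")
rp.read()

url = "https://www.homeplusmall.example/search?q=milk"
if rp.can_fetch("MyScraperBot/1.0", url):
    print("Allowed to fetch:", url)
else:
    print("Disallowed by robots.txt; skipping:", url)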

How Can I Extract Data from Homeplus Mall?

Data extraction from Homeplus Mall can be done using a Homeplus Mall catalog scraper South Korea, which automates the collection of product listings, pricing, and availability. Alternatively, a Grocery Data Scraping API allows companies to receive structured datasets directly, reducing manual effort and parsing. Businesses can filter by categories, stores, price ranges, or promotions to collect targeted data efficiently. Automated scraping supports real-time updates, historical data collection, and integration with analytics dashboards or inventory management systems. By capturing Homeplus Mall product data accurately, companies can enhance competitive intelligence, optimize pricing strategies, and track market trends. Startups, retailers, and analysts benefit from this approach, gaining actionable insights into South Korea’s grocery delivery ecosystem while saving time and improving operational efficiency.
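
The sketch below illustrates this kind of targeted collection with category and price filters. The endpoint and parameter names (category, min_price, max_price) are assumptions for demonstration; the actual Grocery Data Scraping API parameters may differ:

import requests

# Hypothetical filtered query for dairy products between 1,000 and 10,000 KRW
params = {
    "category": "dairy",
    "min_price": 1000,
    "max_price": 10000,
    "page": 1,
    "per_page": 48,
}
resp = requests.get("https://www.homeplusmall.example/search", params=params, timeout=15)
resp.raise_for_status()
for item in resp.json().get("products", []):
    print(item.get("name"), item.get("price"))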

Do You Want More Homeplus Mall Scraping Alternatives?

If you’re exploring additional options beyond Homeplus Mall, several Homeplus Mall grocery product data extraction alternatives exist, including Coupang, Lotte Mart, and Market Kurly. Using a Homeplus Mall grocery delivery data extractor across multiple platforms provides comprehensive market visibility, allowing businesses to compare pricing, promotions, and product availability. Multi-source scraping ensures richer datasets for analytics, forecasting, and catalog management. Real Data API offers scalable solutions to integrate these sources into a unified Grocery Dataset, supporting strategic decision-making and operational efficiency. By leveraging alternative scraping options, companies can diversify data inputs, reduce reliance on a single platform, and gain deeper insights into consumer demand, regional trends, and competitive dynamics within South Korea’s online grocery delivery market.

Input options

When extracting grocery data from Homeplus Mall, businesses can choose from flexible input options to meet their specific needs. Using a Homeplus Mall catalog scraper South Korea, companies can target specific product categories, brands, or stores, ensuring precise and relevant data collection. For larger operations, a Grocery Data Scraping API allows automated bulk requests, where filters like price range, availability, or delivery options can be applied to retrieve structured datasets. These input options provide the ability to capture both real-time and historical data for analysis, competitor tracking, and inventory monitoring. Customizable parameters help reduce noise, improve accuracy, and save time by focusing only on relevant products. By offering scalable and configurable input methods, Homeplus Mall scraping tools support startups, researchers, and enterprise retailers in building a reliable Grocery Dataset for analytics, price optimization, and market intelligence.
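
As an illustration, a scraping job's input configuration might look like the following Python dictionary; every key shown here (categories, stores, priceRange, and so on) is hypothetical rather than a documented schema:

run_input = {
    "categories": ["fresh-food", "beverages"],   # limit to specific categories
    "stores": ["gangnam", "mapo"],               # limit to specific store branches
    "priceRange": {"min": 1000, "max": 50000},   # KRW
    "inStockOnly": True,                         # skip unavailable items
    "maxItems": 500,                             # cap the dataset size
}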

Sample Result of Homeplus Mall Data Scraper
#!/usr/bin/env python3
"""
Sample Result of Homeplus Mall Data Scraper - Detailed Example Code

This script demonstrates a robust approach for scraping Homeplus Mall grocery 
product listings using Python. It captures product details such as name, 
category, price, availability, and images, normalizes the data, and outputs 
to JSONL and CSV formats.

NOTE: Replace URLs, selectors, and API paths with actual Homeplus Mall endpoints 
or permitted data sources. This is a template for demonstration purposes.
"""

import requests
from requests.adapters import HTTPAdapter, Retry
from urllib.parse import urljoin, urlencode
import json
import csv
import re
import time
import random
from datetime import datetime
from bs4 import BeautifulSoup
import os

# -------- CONFIGURATION --------
BASE_URL = "https://www.homeplusmall.example/"  # replace with actual base URL
SEARCH_PATH = "/search"
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
]
HEADERS_COMMON = {
    "Accept": "application/json, text/html, */*",
    "Accept-Language": "en-US,en;q=0.9",
}

MAX_WORKERS = 6
MIN_DELAY = 0.3
MAX_DELAY = 1.0
REQUEST_TIMEOUT = 15

OUTPUT_JSONL = "homeplus_products.jsonl"
OUTPUT_CSV = "homeplus_products.csv"

CSV_FIELDS = [
    "scraped_at",
    "source",
    "product_id",
    "name",
    "brand",
    "category",
    "subcategory",
    "price",
    "currency",
    "discounted_price",
    "availability",
    "rating",
    "rating_count",
    "image_url",
    "product_url",
    "description",
    "store_id",
    "store_name",
]

# -------- HTTP SESSION WITH RETRIES --------
def build_session():
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET", "POST"])
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

def polite_sleep():
    time.sleep(random.uniform(MIN_DELAY, MAX_DELAY))

# -------- PARSERS --------
def parse_json_listing(payload):
    products = []
    items = payload.get("products") or payload.get("items") or []
    for it in items:
        p = {
            "product_id": str(it.get("id", "")),
            "name": it.get("name", ""),
            "brand": it.get("brand", ""),
            "category": it.get("category", ""),
            "subcategory": it.get("subcategory", ""),
            "price": float(it.get("price") or 0.0),
            "currency": it.get("currency") or "KRW",
            "discounted_price": float(it.get("discount_price") or 0.0),
            "availability": it.get("availability") or "unknown",
            "rating": float(it.get("rating") or 0.0),
            "rating_count": int(it.get("rating_count") or 0),
            "image_url": it.get("image_url") or "",
            "product_url": it.get("product_url") or "",
            "description": it.get("description") or "",
            "store_id": str(it.get("store_id") or ""),
            "store_name": it.get("store_name") or "",
        }
        products.append(p)
    return products

def parse_html_listing(html_text, base_page_url=""):
    soup = BeautifulSoup(html_text, "html.parser")
    products = []
    for card in soup.select(".product-card, .menu-item"):
        try:
            prod_id = card.get("data-id") or ""
            name_el = card.select_one(".product-title")
            name = name_el.get_text(strip=True) if name_el else ""
            price_el = card.select_one(".price")
            price_txt = price_el.get_text(strip=True) if price_el else "0"
            price = float(re.sub(r"[^\d\.]", "", price_txt) or 0)  # re imported at module top
            image_el = card.select_one("img")
            image_url = urljoin(base_page_url, image_el["src"]) if image_el else ""
            product_url_el = card.select_one("a")
            product_url = urljoin(base_page_url, product_url_el["href"]) if product_url_el else ""
            store_el = card.select_one(".store-name")
            store_name = store_el.get_text(strip=True) if store_el else ""

            p = {
                "product_id": prod_id,
                "name": name,
                "brand": "",
                "category": "",
                "subcategory": "",
                "price": price,
                "currency": "KRW",
                "discounted_price": 0.0,
                "availability": "unknown",
                "rating": 0.0,
                "rating_count": 0,
                "image_url": image_url,
                "product_url": product_url,
                "description": "",
                "store_id": "",
                "store_name": store_name,
            }
            products.append(p)
        except Exception:
            continue
    return products

def normalize_and_stamp(products, source):
    now = datetime.utcnow().isoformat() + "Z"
    norm = []
    for p in products:
        out = {"scraped_at": now, "source": source}
        for key in CSV_FIELDS[2:]:
            out[key] = p.get(key, "")
        norm.append(out)
    return norm

# -------- FETCHING PAGES --------
def fetch_listing_page(session, url, params=None):
    headers = HEADERS_COMMON.copy()
    headers["User-Agent"] = random.choice(USER_AGENTS)
    try:
        resp = session.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp
    except requests.RequestException as e:
        print(f"[WARN] Failed request {url}: {e}")
        return None

def fetch_product_listings(session, query, page_limit=3):
    all_products = []
    for page in range(1, page_limit + 1):
        polite_sleep()
        params = {"q": query, "page": page, "per_page": 48}
        url = urljoin(BASE_URL, SEARCH_PATH)
        resp = fetch_listing_page(session, url, params=params)
        if resp is None:
            continue
        source_id = f"{url}?{urlencode(params)}"
        parsed = []
        if "application/json" in resp.headers.get("Content-Type", "") or resp.text.strip().startswith("{"):
            try:
                payload = resp.json()
                parsed = parse_json_listing(payload)
            except Exception:
                parsed = parse_html_listing(resp.text, base_page_url=url)
        else:
            parsed = parse_html_listing(resp.text, base_page_url=url)
        norm = normalize_and_stamp(parsed, source_id)
        all_products.extend(norm)
        if not parsed:
            break
    return all_products

# -------- OUTPUT --------
def write_jsonl(filename, products):
    with open(filename, "w", encoding="utf-8") as f:
        for p in products:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")
    print(f"[INFO] Wrote {len(products)} records to {filename}")

def write_csv(filename, products):
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
        writer.writeheader()
        for p in products:
            row = {k: p.get(k, "") for k in CSV_FIELDS}
            writer.writerow(row)
    print(f"[INFO] Wrote {len(products)} records to {filename}")

# -------- MAIN --------
def main():
    session = build_session()
    query = "milk"
    page_limit = 3

    print("[INFO] Fetching listings...")
    products = fetch_product_listings(session, query, page_limit=page_limit)

    # Optional: deduplicate
    seen = set()
    deduped = []
    for p in products:
        key = (p.get("product_id") or p.get("name", "") + "|" + p.get("store_name", ""))
        if key in seen:
            continue
        seen.add(key)
        deduped.append(p)

    os.makedirs("output", exist_ok=True)
    write_jsonl(os.path.join("output", OUTPUT_JSONL), deduped)
    write_csv(os.path.join("output", OUTPUT_CSV), deduped)

    print(f"[DONE] Scraped {len(deduped)} unique products.")

if __name__ == "__main__":
    main()
Integrations with Homeplus Mall Data Scraper – Homeplus Mall Data Extraction

The Homeplus Mall grocery scraper can be seamlessly integrated into business systems to unlock actionable insights from Homeplus Mall’s online platform. By leveraging Homeplus Mall API scraping, companies can automatically collect real-time product listings, prices, categories, and availability, transforming raw data into a structured Grocery Dataset. These integrations enable automated syncing with analytics dashboards, inventory management systems, and eCommerce platforms, reducing manual effort and ensuring continuous updates. Businesses can monitor competitor pricing, track promotions, optimize product catalogs, and enhance decision-making across operations. Additionally, integrating the scraper with reporting tools and recommendation engines provides deeper visibility into market trends, consumer behavior, and regional demand. With Real Data API’s scalable solutions, the Homeplus Mall grocery scraper offers reliable, structured, and actionable data that empowers retailers, analysts, and researchers to drive smarter strategies in South Korea’s grocery delivery market.
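
As a simple integration example, the Python sketch below loads the JSONL file produced by the sample scraper above into pandas for analysis. The "in_stock" availability label is an assumption about the data, and pandas is only one of many possible downstream tools:

import pandas as pd

# Load the scraper's JSONL output into a DataFrame
df = pd.read_json("output/homeplus_products.jsonl", lines=True)

# Average price per category, e.g., for a pricing dashboard
print(df.groupby("category")["price"].mean().head())

# Count items currently available ("in_stock" label is an assumption)
print((df["availability"] == "in_stock").sum(), "items flagged in stock")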

Executing Homeplus Mall Data Scraping Actor with Real Data API

Running a Homeplus Mall grocery scraper with Real Data API allows businesses to automate and scale the extraction of product listings, prices, availability, and categories from Homeplus Mall in real time. Using the Grocery Data Scraping API, companies can schedule scraping jobs, perform targeted queries, and receive structured datasets that integrate seamlessly with analytics dashboards, inventory systems, and eCommerce platforms. The scraping actor handles pagination, error retries, and data normalization, ensuring accurate and complete results. This setup enables retailers to monitor competitor pricing, track promotions, and maintain up-to-date catalogs efficiently. By combining the power of Real Data API with the Homeplus Mall grocery scraper, organizations gain actionable market insights, optimize operational workflows, and make data-driven decisions to stay competitive in South Korea’s dynamic grocery delivery ecosystem.

You need a Real Data API account to run the program examples below. Replace the empty token placeholder in each program with your own API token, and substitute the sample actor ID and input URLs with those of your Homeplus Mall scraping actor. See the Real Data API docs for more detail on the live APIs.

Node.js

import { RealdataAPIClient } from 'RealDataAPI-client';

// Initialize the RealdataAPIClient with API token
const client = new RealdataAPIClient({
    token: '',
});

// Prepare actor input
const input = {
    "categoryOrProductUrls": [
        {
            "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
        }
    ],
    "maxItems": 100,
    "proxyConfiguration": {
        "useRealDataAPIProxy": true
    }
};

(async () => {
    // Run the actor and wait for it to finish
    const run = await client.actor("junglee/amazon-crawler").call(input);

    // Fetch and print actor results from the run's dataset (if any)
    console.log('Results from dataset');
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    items.forEach((item) => {
        console.dir(item);
    });
})();

Python

from realdataapi_client import RealdataAPIClient

# Initialize the RealdataAPIClient with your API token
client = RealdataAPIClient("")

# Prepare the actor input
run_input = {
    "categoryOrProductUrls": [{ "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5" }],
    "maxItems": 100,
    "proxyConfiguration": { "useRealDataAPIProxy": True },
}

# Run the actor and wait for it to finish
run = client.actor("junglee/amazon-crawler").call(run_input=run_input)

# Fetch and print actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)

cURL

# Set API token
API_TOKEN=<YOUR_API_TOKEN>

# Prepare actor input
cat > input.json <<'EOF'
{
  "categoryOrProductUrls": [
    {
      "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
    }
  ],
  "maxItems": 100,
  "proxyConfiguration": {
    "useRealDataAPIProxy": true
  }
}
EOF

# Run the actor
curl "https://api.realdataapi.com/v2/acts/junglee~amazon-crawler/runs?token=$API_TOKEN" \
  -X POST \
  -d @input.json \
  -H 'Content-Type: application/json'

Amazon product URLs

productUrls Required Array

Enter one or more URLs of the Amazon products you wish to extract.

Max reviews

Max reviews Optional Integer

Enter the maximum number of reviews to scrape. To scrape all reviews, leave this field blank.

Link selector

linkSelector Optional String

A CSS selector specifying which links on the page (<a> elements with an href attribute) should be followed and added to the request queue; for example, a.product-link would follow only product links. To filter the links added to the queue, use the Pseudo-URLs and/or Glob patterns settings. If the Link selector is empty, page links are ignored. For details, see Link selector in the README.

Include personal data

includeGdprSensitive Optional Array

Personal information such as names, IDs, or profile pictures is protected by the EU's GDPR and other privacy regulations worldwide. You must not extract personal data without a legitimate legal reason.

Reviews sort

sort Optional String

Choose the sort order for scraped reviews. The default is Amazon's HELPFUL ordering.

Options:

RECENT, HELPFUL

Proxy configuration

proxyConfiguration Required Object

You can select proxy groups from specific countries. Amazon displays products that can be delivered to your location based on your proxy. If globally shipped products are sufficient for your needs, the proxy country does not matter.

Extended output function

extendedOutputFunction Optional String

Enter a function that receives the jQuery handle ($) as its argument and returns customized scraped data. The returned data is merged with the default result. For example, an illustrative function such as ($) => ({ customTitle: $('h1').text() }) would add a customTitle field to each record.

{
  "categoryOrProductUrls": [
    {
      "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
    }
  ],
  "maxItems": 100,
  "detailedInformation": false,
  "useCaptchaSolver": false,
  "proxyConfiguration": {
    "useRealDataAPIProxy": true
  }
}