
NTUC FairPrice Grocery Scraper - Extract NTUC FairPrice Product Listings

RealdataAPI / ntuc-fairprice-grocery-scraper

The NTUC FairPrice grocery scraper is a powerful automation tool designed to extract real-time product data from NTUC FairPrice’s online grocery platform. Using the Grocery Data Scraping API, users can collect detailed information such as product names, prices, categories, stock availability, and images in structured formats like JSON or CSV. The NTUC FairPrice API scraping process ensures accurate and up-to-date grocery data for retail analysis, price tracking, and competitive intelligence. Ideal for businesses, developers, and analysts, it helps automate market research and inventory monitoring with minimal effort. With secure integration and scalable performance, the NTUC FairPrice grocery scraper connects seamlessly with analytics tools and dashboards, offering continuous data feeds for smarter decision-making. Leverage the Grocery Data Scraping API to turn FairPrice product data into actionable grocery insights.

What is NTUC FairPrice Data Scraper, and How Does It Work?

The NTUC FairPrice delivery data scraper is an automated solution that collects grocery product information directly from NTUC FairPrice’s online store. It efficiently navigates product pages to extract NTUC FairPrice product listings, including names, prices, images, categories, and stock status. Using structured crawling logic, it gathers clean and organized data for analytics or business intelligence. The scraper can be configured to run periodically, ensuring access to the latest product updates and price changes. Data can be exported in multiple formats such as JSON, CSV, or XML, allowing seamless integration into analytics dashboards or retail systems. The NTUC FairPrice delivery data scraper saves countless hours of manual collection and improves market insights by automating the process to extract NTUC FairPrice product listings quickly, accurately, and at scale.
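For reference, a single scraped product record in JSON output might look like the illustrative example below. The field names and values are hypothetical and simply mirror the attributes listed above (name, price, categories, stock status, images); the actual schema depends on how the scraper is configured.

{
  "name": "Fresh Fuji Apples (4 per pack)",
  "price": 3.95,
  "currency": "SGD",
  "categories": ["Fruits", "Apples"],
  "availability": "In Stock",
  "images": ["https://www.fairprice.com.sg/.../example-image.jpg"],
  "source_url": "https://www.fairprice.com.sg/product/..."
}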

Why Extract Data from NTUC FairPrice?

Extracting data from NTUC FairPrice provides deep insights into product pricing, promotions, and consumer preferences in Singapore’s grocery market. With an NTUC FairPrice grocery delivery data extractor, you can monitor real-time changes across thousands of items, improving retail strategy and market intelligence. The NTUC FairPrice catalog scraper Singapore enables businesses to track inventory shifts, identify best-selling products, and compare prices against competitors. Researchers and analysts use this data to study consumption trends and optimize stock management. By automating grocery data collection, companies can enhance price comparison tools, improve customer recommendations, and forecast demand accurately. The NTUC FairPrice grocery delivery data extractor ensures reliable, up-to-date, and structured datasets, while the NTUC FairPrice catalog scraper Singapore helps transform raw information into actionable insights for data-driven retail decision-making.

Is It Legal to Extract NTUC FairPrice Data?

Using an NTUC FairPrice grocery product data extraction tool is generally legal when performed ethically and within public data boundaries. The key is compliance with data protection regulations and website terms of service. Many businesses use the Real-time NTUC FairPrice delivery data API to obtain structured, permission-based grocery data safely. These APIs ensure secure access to publicly available product details such as prices and availability, without violating intellectual property or privacy laws. It’s important to avoid scraping personal or restricted information and to follow responsible crawling practices, including rate limiting and transparency. The NTUC FairPrice grocery product data extraction approach should always emphasize ethical use, while the Real-time NTUC FairPrice delivery data API provides a reliable, compliant method for continuous grocery data access.

How Can I Extract Data from NTUC FairPrice?

To scrape NTUC FairPrice product data, you can use dedicated automation tools or APIs that capture structured grocery information such as product names, prices, and availability. The NTUC FairPrice price scraping process involves configuring a scraper or API endpoint to fetch updated product details directly from FairPrice’s online catalog. Data can then be exported into JSON, CSV, or database formats for analytics or eCommerce dashboards. Many developers integrate NTUC FairPrice price scraping tools with market research or price monitoring platforms to track competitor trends in real time. Using modern scrape NTUC FairPrice product data solutions ensures accuracy, scalability, and continuous updates, helping businesses maintain competitive pricing strategies, monitor promotions, and optimize inventory decisions across Singapore’s fast-moving grocery retail landscape.
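As a rough sketch of this workflow, the snippet below requests product data from a placeholder API endpoint and exports the response to JSON and CSV. The endpoint URL, query parameters, and token shown here are assumptions for illustration only; substitute the actual Grocery Data Scraping API details from your account.

import csv
import json
import requests

# Hypothetical endpoint and parameters -- replace with your real API details.
API_URL = "https://api.example.com/v1/fairprice/products"
params = {"category": "fresh-produce", "limit": 100, "token": "<YOUR_API_TOKEN>"}

resp = requests.get(API_URL, params=params, timeout=30)
resp.raise_for_status()
products = resp.json()  # assumed to be a list of product dicts

# Save as JSON for dashboards or further processing
with open("fairprice_products.json", "w", encoding="utf-8") as f:
    json.dump(products, f, indent=2, ensure_ascii=False)

# Save as CSV for spreadsheets and BI tools
if products:
    with open("fairprice_products.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=products[0].keys())
        writer.writeheader()
        writer.writerows(products)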

Do You Want More NTUC FairPrice Scraping Alternatives?

If you’re exploring beyond standard tools, several advanced solutions complement the NTUC FairPrice catalog scraper Singapore. Platforms like the NTUC FairPrice grocery delivery data extractor offer multi-source grocery scraping, allowing you to combine FairPrice data with information from other major retailers. These alternatives provide broader market intelligence, automated price comparisons, and cross-platform analytics. The NTUC FairPrice catalog scraper Singapore can also be integrated into APIs or business dashboards for real-time updates and visualization. Meanwhile, the NTUC FairPrice grocery delivery data extractor ensures accuracy and scalability, suitable for enterprise-level data collection. By leveraging these flexible scraping alternatives, users gain deeper insights into pricing patterns, promotions, and market trends — enabling better retail forecasting, competitive benchmarking, and smarter decision-making across the grocery eCommerce ecosystem.

Input options

The NTUC FairPrice grocery delivery data extractor offers flexible input options to customize your grocery data scraping process according to your business goals. Users can specify category URLs, product filters, or search keywords to focus on specific grocery segments such as beverages, fresh produce, or household essentials. The tool also supports sitemap or bulk URL uploads for large-scale scraping operations. With advanced configuration settings, you can define pagination depth, update frequency, and output formats like JSON, CSV, or XML. The NTUC FairPrice grocery product data extraction system allows both manual and automated scheduling, ensuring that your datasets stay current and accurate. Whether you’re conducting price monitoring, market research, or inventory tracking, these flexible input options make the scraper adaptable to different data requirements, providing precise and real-time grocery insights for analysis and business intelligence.
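As an illustration, an input configuration for a category-focused crawl might look like the JSON below. The field names are examples of the kinds of settings described above (category URLs, keywords, pagination depth, schedule, and output format) rather than the scraper's exact input schema.

{
  "categoryUrls": ["https://www.fairprice.com.sg/c/Fruits"],
  "searchKeywords": ["fresh milk", "laundry detergent"],
  "paginationDepth": 10,
  "updateFrequency": "daily",
  "outputFormat": "csv"
}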

Sample Result of NTUC FairPrice Data Scraper
#!/usr/bin/env python3
"""
NTUC FairPrice Data Scraper - Sample Result Script (detailed, ready-to-run)

Requirements:
    pip install requests beautifulsoup4 lxml pandas

Notes:
    - This script scrapes NTUC FairPrice product listing pages and exports JSON/CSV.
    - Includes polite rate limiting, retries, and User-Agent headers.
    - For JavaScript-heavy pages, consider using Selenium or Playwright to render.
"""

import time
import json
import os
import logging
from typing import Optional
from urllib.parse import urljoin, urlparse
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import pandas as pd
import urllib.robotparser as robotparser

# ----------------------------
# Configuration
# ----------------------------
BASE_URL = "https://www.fairprice.com.sg"
START_CATEGORY_URL = "https://www.fairprice.com.sg/c/Fruits"  # Example category
OUTPUT_DIR = "output"
CSV_FILENAME = os.path.join(OUTPUT_DIR, "ntuc_fairprice_products.csv")
JSON_FILENAME = os.path.join(OUTPUT_DIR, "ntuc_fairprice_products.json")
IMAGE_DIR = os.path.join(OUTPUT_DIR, "images")
RATE_LIMIT_SECONDS = 1.2
MAX_PAGES = 100
TIMEOUT = 15
USER_AGENT = "Mozilla/5.0 (compatible; NTUCFairPriceScraper/1.0; +https://example.com/bot)"

# ----------------------------
# Logging
# ----------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("ntuc-fairprice-scraper")

# ----------------------------
# Session with retries
# ----------------------------
def create_session() -> requests.Session:
    session = requests.Session()
    session.headers.update({"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"})
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=["GET"])
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# ----------------------------
# robots.txt check
# ----------------------------
def can_fetch(url: str, user_agent: str = USER_AGENT) -> bool:
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = robotparser.RobotFileParser()
    try:
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(user_agent, url)
    except Exception as e:
        logger.warning(f"Failed to read robots.txt ({robots_url}): {e} -- proceeding cautiously")
        return True

# ----------------------------
# Fetch page
# ----------------------------
def fetch_page(session: requests.Session, url: str) -> Optional[str]:
    if not can_fetch(url):
        logger.error(f"robots.txt disallows scraping the URL: {url}")
        return None
    try:
        resp = session.get(url, timeout=TIMEOUT)
        resp.raise_for_status()
        time.sleep(RATE_LIMIT_SECONDS)
        return resp.text
    except requests.RequestException as e:
        logger.error(f"Request failed for {url}: {e}")
        return None

# ----------------------------
# Parse listing page & find product links
# ----------------------------
def parse_listing_for_products(html: str) -> list:
    soup = BeautifulSoup(html, "lxml")
    product_links = []
    for a in soup.select("a[href*='/product/']"):
        href = a.get("href")
        full_url = urljoin(BASE_URL, href)
        if full_url not in product_links:
            product_links.append(full_url)
    logger.info(f"Found {len(product_links)} product links")
    return product_links

# ----------------------------
# Parse product page
# ----------------------------
def parse_product_page(html: str, url: str) -> dict:
    soup = BeautifulSoup(html, "lxml")
    product = {"source_url": url}

    # Name
    name_el = soup.select_one("h1.product-title") or soup.select_one(".product-name")
    product["name"] = name_el.get_text(strip=True) if name_el else None

    # Price
    price_el = soup.select_one(".product-price") or soup.select_one(".price")
    if price_el:
        raw_price = price_el.get_text(strip=True)
        try:
            product["price"] = float(raw_price.replace("$", "").replace(",", ""))
        except ValueError:
            # Keep the raw string if the price text cannot be parsed as a number
            product["price"] = raw_price
    else:
        product["price"] = None

    # Availability
    avail_el = soup.select_one(".stock-status") or soup.select_one(".availability")
    product["availability"] = avail_el.get_text(strip=True) if avail_el else "Unknown"

    # Category/Breadcrumbs
    crumbs = [c.get_text(strip=True) for c in soup.select(".breadcrumb a")] if soup.select(".breadcrumb a") else []
    product["categories"] = crumbs

    # Description
    desc_el = soup.select_one(".product-description") or soup.select_one("#description")
    product["description"] = desc_el.get_text(separator=" ", strip=True) if desc_el else None

    # Images
    images = set()
    for img in soup.select("img"):
        src = img.get("data-src") or img.get("src")
        if src:
            if src.startswith("//"):
                src = f"{urlparse(BASE_URL).scheme}:{src}"
            images.add(urljoin(BASE_URL, src))
    product["images"] = list(images)

    return product

# ----------------------------
# Save results
# ----------------------------
def save_results_json(results: list, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    logger.info(f"Wrote JSON results to {path}")

def save_results_csv(results: list, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    rows = []
    for r in results:
        row = dict(r)
        row["images"] = "|".join(r.get("images", [])) if r.get("images") else ""
        row["categories"] = "|".join(r.get("categories", [])) if r.get("categories") else ""
        rows.append(row)
    df = pd.DataFrame(rows)
    df.to_csv(path, index=False, encoding="utf-8")
    logger.info(f"Wrote CSV results to {path}")

# ----------------------------
# Crawl category
# ----------------------------
def crawl_category(start_url: str, max_pages: int = MAX_PAGES) -> list:
    session = create_session()
    results = []
    seen_products = set()
    page_url = start_url
    pages_crawled = 0

    while page_url and pages_crawled < max_pages:
        logger.info(f"Crawling listing page: {page_url} (page {pages_crawled+1})")
        listing_html = fetch_page(session, page_url)
        if not listing_html:
            break

        product_links = parse_listing_for_products(listing_html)
        for p_link in product_links:
            if p_link in seen_products:
                continue
            product_html = fetch_page(session, p_link)
            if not product_html:
                continue
            product = parse_product_page(product_html, p_link)
            results.append(product)
            seen_products.add(p_link)

        pages_crawled += 1
        # Pagination logic: find next link
        soup = BeautifulSoup(listing_html, "lxml")
        next_link_el = soup.select_one("a[rel='next']") or soup.find("a", string=lambda s: s and "next" in s.lower())
        page_url = urljoin(BASE_URL, next_link_el["href"]) if next_link_el and next_link_el.get("href") else None

    return results
# ----------------------------
# Main
# ----------------------------
def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    logger.info("Starting NTUC FairPrice Data Scraper")
    results = crawl_category(START_CATEGORY_URL, max_pages=20)
    if results:
        save_results_json(results, JSON_FILENAME)
        save_results_csv(results, CSV_FILENAME)
        for i, item in enumerate(results[:5], start=1):
            logger.info(f"Sample {i}: {item.get('name')} - Price: {item.get('price')} - Images: {len(item.get('images', []))}")
    else:
        logger.warning("No products scraped.")

if __name__ == "__main__":
    main()

Integrations with NTUC FairPrice Data Scraper – NTUC FairPrice Data Extraction

The NTUC FairPrice grocery scraper can be seamlessly integrated with a variety of analytics and business intelligence tools using the Grocery Data Scraping API. This allows real-time product data, including prices, availability, categories, and images, to flow directly into dashboards, inventory management systems, and reporting platforms. By connecting the NTUC FairPrice grocery scraper to cloud storage, databases, or visualization tools like Power BI and Tableau, businesses can automate insights from product listings without manual intervention. The Grocery Data Scraping API ensures that extracted data remains structured, accurate, and continuously updated, enabling competitive price monitoring, market research, and stock analysis. With these integrations, retailers, analysts, and developers can leverage NTUC FairPrice product information to drive informed decisions, optimize operations, and enhance eCommerce and grocery business strategies efficiently.
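As a minimal integration sketch, the snippet below loads the CSV produced by the sample script above into a local SQLite database, giving dashboards and BI tools a queryable table to connect to or import from. The file, database, and table names are arbitrary examples.

import sqlite3
import pandas as pd

# Load the CSV produced by the sample scraper (see the script above)
df = pd.read_csv("output/ntuc_fairprice_products.csv")

# Push it into a local SQLite database that BI tools (Power BI, Tableau,
# or any ODBC-capable dashboard) can connect to or import from.
conn = sqlite3.connect("fairprice_analytics.db")
df.to_sql("fairprice_products", conn, if_exists="replace", index=False)

# Quick sanity check: cheapest items currently captured
cheapest = pd.read_sql_query(
    "SELECT name, price FROM fairprice_products "
    "WHERE price IS NOT NULL ORDER BY price ASC LIMIT 5",
    conn,
)
print(cheapest)
conn.close()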

Executing NTUC FairPrice Data Scraping Actor with Real Data API

Executing the NTUC FairPrice API scraping process with a Real Data API enables efficient, automated collection of grocery product information from NTUC FairPrice’s online store. The scraping actor extracts a structured Grocery Dataset including product names, prices, categories, availability, and images in real time. By configuring parameters such as category URLs, pagination, and update frequency, businesses can maintain up-to-date records without manual effort. The NTUC FairPrice API scraping workflow allows seamless integration with analytics dashboards, databases, or cloud storage, providing actionable insights for price monitoring, market research, and inventory optimization. Using this approach ensures reliable, accurate, and scalable access to NTUC FairPrice product information. The extracted Grocery Dataset can be leveraged to analyze trends, track promotions, and make data-driven decisions, enhancing operational efficiency and competitiveness in the Singapore grocery retail sector.

You need a Real Data API account to run the program examples. Replace the API token placeholder in each example with your own actor token. See the Real Data API docs for a fuller explanation of the live APIs.

Node.js example:

import { RealdataAPIClient } from 'RealDataAPI-client';

// Initialize the RealdataAPIClient with API token
const client = new RealdataAPIClient({
    token: '',
});

// Prepare actor input
const input = {
    "categoryOrProductUrls": [
        {
            "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
        }
    ],
    "maxItems": 100,
    "proxyConfiguration": {
        "useRealDataAPIProxy": true
    }
};

(async () => {
    // Run the actor and wait for it to finish
    const run = await client.actor("junglee/amazon-crawler").call(input);

    // Fetch and print actor results from the run's dataset (if any)
    console.log('Results from dataset');
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    items.forEach((item) => {
        console.dir(item);
    });
})();

Python example:

from realdataapi_client import RealdataAPIClient

# Initialize the RealdataAPIClient with your API token
client = RealdataAPIClient("")

# Prepare the actor input
run_input = {
    "categoryOrProductUrls": [{ "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5" }],
    "maxItems": 100,
    "proxyConfiguration": { "useRealDataAPIProxy": True },
}

# Run the actor and wait for it to finish
run = client.actor("junglee/amazon-crawler").call(run_input=run_input)

# Fetch and print actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)

cURL example:

# Set API token
API_TOKEN=<YOUR_API_TOKEN>

# Prepare actor input
cat > input.json <<'EOF'
{
  "categoryOrProductUrls": [
    {
      "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
    }
  ],
  "maxItems": 100,
  "proxyConfiguration": {
    "useRealDataAPIProxy": true
  }
}
EOF

# Run the actor
curl "https://api.realdataapi.com/v2/acts/junglee~amazon-crawler/runs?token=$API_TOKEN" \
  -X POST \
  -d @input.json \
  -H 'Content-Type: application/json'

Place the Amazon product URLs

productUrls Required Array

Add one or more Amazon product URLs you wish to extract.

Max reviews

Max reviews Optional Integer

Set the maximum number of reviews to scrape. To scrape all reviews, leave this field blank.

Link selector

linkSelector Optional String

A CSS selector specifying which links on the page (<a> elements with an href attribute) should be followed and added to the request queue. To filter the links added to the queue, use the Pseudo-URLs and/or Glob patterns settings. If the Link selector is empty, page links are ignored. For details, see Link selector in the README.

Include personal data

includeGdprSensitive Optional Array

Personal information such as names, IDs, or profile pictures is protected by the GDPR in European countries and by other regulations worldwide. You must not extract personal information without a legitimate legal reason.

Reviews sort

sort Optional String

Choose the criterion used to sort the scraped reviews. The default is Amazon's HELPFUL ordering.

Options:

RECENT, HELPFUL

Proxy configuration

proxyConfiguration Required Object

You can select proxy groups from specific countries. Amazon displays products that can be delivered to your location based on your proxy, so a country-specific proxy only matters if regional availability is important; if globally shipped products are sufficient, any proxy will do.

Extended output function

extendedOutputFunction Optional String

Enter a function that receives a jQuery handle as its argument and returns customized scraped data. The returned data is merged with the default result.

{
  "categoryOrProductUrls": [
    {
      "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
    }
  ],
  "maxItems": 100,
  "detailedInformation": false,
  "useCaptchaSolver": false,
  "proxyConfiguration": {
    "useRealDataAPIProxy": true
  }
}