
Arabind Scraper - Extract Arabind Product Listings

RealdataAPI / arabind-scraper

The Arabind Scraper is designed to help businesses efficiently extract product listings, monitor pricing, and analyze availability from the Arabind platform. With automation, retailers and analysts can gain a competitive edge by transforming raw data into actionable insights. The Arabind Scraper delivers structured outputs that are easy to integrate into dashboards, pricing engines, or inventory systems, so companies can track thousands of listings in real time and improve catalog management and promotional planning. Paired with Arabind API scraping techniques, it also supports scalable data pipelines that reduce manual work and ensure high accuracy. The extracted information can be compiled into a Grocery Dataset, empowering businesses to study consumer demand, identify pricing trends, and forecast stock requirements. This makes the Arabind Scraper a powerful solution for driving smarter e-commerce strategies and long-term growth.

What is Arabind Data Scraper, and How Does It Work?

An Arabind Data Scraper is a tool designed to automate the collection of product listings, pricing, and stock availability from the Arabind platform. Like any grocery scraper, it enables retailers and data analysts to extract structured datasets at scale without manual input. The scraper works by scanning product categories, capturing details such as names, SKUs, prices, and promotions, and converting them into usable datasets for business intelligence. An Arabind delivery data scraper also surfaces logistics data, such as delivery timelines and area coverage, helping businesses benchmark Arabind's service efficiency against other grocery delivery players. This process ensures clean, organized data that can be fed directly into dashboards, pricing systems, or forecasting models. By leveraging automation, businesses can stay competitive, optimize supply chains, and maintain agility in the dynamic world of online grocery and retail.
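
For illustration, here is a minimal sketch of the kind of record such a scraper produces; the field names follow the sample Arabind Data Scraper shown later on this page, and the values are purely hypothetical.

# Illustrative shape of one extracted product record (hypothetical values).
# Field names match the sample Arabind Data Scraper further down this page.
sample_record = {
    "name": "Organic Whole Milk 1L",        # product title from the listing
    "price": "AED 6.50",                    # raw price string as scraped
    "price_float": 6.5,                     # normalized numeric price
    "availability": "In Stock",             # best-effort stock status
    "sku": "MILK-ORG-1L",                   # product code, when present in the markup
    "url": "https://www.arabind.com/products/organic-whole-milk-1l",  # placeholder domain
    "category": "grocery",                  # inferred from the category path
}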

Why Extract Data from Arabind?

Extracting data from Arabind provides businesses with valuable insights into product availability, pricing, and consumer demand trends. When companies scrape Arabind product data, they can build structured Arabind datasets for competitive benchmarking and assortment analysis. Tracking such data allows businesses to align their offerings with market trends, anticipate demand spikes, and identify gaps in their catalogs. Another key benefit is Arabind price scraping: systematic price tracking empowers retailers to position themselves competitively. By extracting Arabind data, organizations can perform price benchmarking and ensure customers always get the best value. Moreover, this enables more informed inventory planning, smarter promotions, and better vendor negotiations. Extracting Arabind data is not just about raw numbers; it is about transforming this information into actionable intelligence that drives revenue growth, customer satisfaction, and sustainable competitive advantage in retail.
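
As a rough illustration of such price benchmarking, the sketch below joins scraped Arabind prices with a retailer's own catalog on SKU. It assumes arabind_products.csv is the output of the sample scraper shown later on this page, while own_catalog.csv and its columns (sku, our_price) are hypothetical stand-ins for internal pricing data.

import pandas as pd

# Hypothetical price-benchmarking sketch: compare scraped Arabind prices with your own.
# "arabind_products.csv" comes from the sample scraper below; "own_catalog.csv" is a
# placeholder for internal pricing data with columns: sku, our_price.
arabind = pd.read_csv("arabind_products.csv")
own_catalog = pd.read_csv("own_catalog.csv")

merged = arabind.merge(own_catalog, on="sku", how="inner")
merged["price_gap"] = merged["price_float"] - merged["our_price"]

# Items where Arabind currently undercuts your price
undercut = merged[merged["price_gap"] < 0].sort_values("price_gap")
print(undercut[["sku", "name", "price_float", "our_price", "price_gap"]].head(10))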

Is It Legal to Extract Arabind Data?

The legality of scraping Arabind depends on how it is approached. Using tools like an Arabind grocery delivery data extractor, businesses can ethically collect publicly available product and pricing information, which is typically allowed when it does not involve sensitive or private data. Similarly, Arabind grocery product data extraction should focus only on non-sensitive datasets such as product names, categories, and price points. Organizations should always follow best practices such as respecting robots.txt, applying rate limits, and avoiding disruption of the platform. Many companies adopt scraping strictly for research, price monitoring, or catalog enrichment, all of which are generally considered safe when done responsibly. Businesses worried about compliance often turn to trusted scraping APIs that adhere to guidelines and provide structured datasets. With the right approach, Arabind data scraping can remain both legal and ethical while still delivering the competitive intelligence companies need.

How Can I Extract Data from Arabind?

Extracting data from Arabind can be done through multiple methods, depending on business requirements. One option is leveraging a real-time Arabind delivery data API, which offers instant access to structured datasets and ensures continuous updates on pricing, product availability, and delivery details. Another method is to extract Arabind product listings directly, pulling detailed catalog information including SKUs, brand names, and stock levels. These approaches allow companies to integrate scraped data directly into their existing systems, such as ERP or BI dashboards. With automation, businesses no longer need to rely on manual monitoring, which is slow and error-prone. Instead, they receive accurate, up-to-date insights that can be used for price comparison, stock forecasting, and trend analysis, ensuring smarter, faster decisions in today's competitive grocery and e-commerce ecosystem.
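
As a minimal automation sketch, the loop below re-runs the sample scraper at a fixed interval; it assumes the sample script further down this page is saved as arabind_scraper.py, and the six-hour interval is only an illustration.

import time
import logging

# Hypothetical scheduling loop: re-run the scraper every few hours so pricing and
# availability stay current without manual monitoring.
# Assumes the sample script below is saved as arabind_scraper.py next to this file.
from arabind_scraper import main as run_scrape

INTERVAL_SECONDS = 6 * 60 * 60  # re-scrape every six hours (illustrative)

while True:
    try:
        run_scrape()
    except Exception:
        logging.exception("Scheduled scrape failed; retrying at the next interval")
    time.sleep(INTERVAL_SECONDS)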

Do You Want More Arabind Scraping Alternatives?

For businesses seeking more robust options, solutions like an Arabind catalog scraper for the UAE demonstrate how localized scraping tools can capture region-specific grocery data with precision, gathering local product listings, prices, and promotions for better competitive benchmarking. Another powerful option is Arabind API scraping, where APIs streamline extraction and provide structured, scalable outputs covering pricing, delivery coverage, and product assortment. By exploring diverse scraping alternatives, businesses can identify the best solution for their scale, budget, and compliance requirements. From catalog monitoring to real-time delivery data extraction, having multiple options ensures flexibility and resilience in data strategies. For retailers, aggregators, and researchers, these alternatives offer efficient ways to unlock Arabind's full potential and drive smarter, data-driven growth.

Input options

When using the Arabind Data Scraper, businesses have flexible input options to configure their data extraction requirements. Users can specify product categories, keywords, or URLs to target particular listings, ensuring that only relevant data is collected. The tool allows customized queries that help narrow down results and improve accuracy. For advanced workflows, integration with APIs supports automated scheduling, enabling continuous data feeds without manual intervention. Leveraging a Grocery Data Scraping API, companies can seamlessly connect their scraping jobs with business intelligence dashboards or ERP systems. This ensures extracted data, whether prices, product details, or delivery insights, is structured and instantly usable. With multiple input configurations available, the Arabind Data Scraper provides the flexibility needed to adapt to diverse use cases, from catalog monitoring to real-time pricing analysis, helping businesses stay competitive in e-commerce.
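
For example, a minimal input configuration might look like the sketch below. It mirrors the field names used in the Real Data API execution examples further down this page (categoryOrProductUrls, maxItems, proxyConfiguration); the category URL is a placeholder.

# Illustrative run input using the same field names as the execution examples below.
# The URL is a placeholder; point it at the Arabind category or product pages you target.
run_input = {
    "categoryOrProductUrls": [
        {"url": "https://www.arabind.com/collections/grocery"}  # placeholder category URL
    ],
    "maxItems": 100,                                  # cap listings collected per run
    "proxyConfiguration": {"useRealDataAPIProxy": True},
}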

Sample Result of Arabind Data Scraper

#!/usr/bin/env python3
"""Sample Arabind Data Scraper (detailed code only)
- Scrapes product listings (name, price, availability, sku, url, category)
- Respects robots.txt
- Uses polite rate limiting and retries
- Outputs JSON and CSV
Note: Adjust CSS selectors to match Arabind's actual page structure.
"""

import requests
from bs4 import BeautifulSoup, Tag
import pandas as pd
import json
import time
import random
import logging
from urllib.parse import urljoin, urlparse
import urllib.robotparser
from typing import List, Dict, Optional

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (compatible; ArabindDataScraper/1.0; +https://example.com/bot)"
})

# ---------- Configuration ----------
BASE_URL = "https://www.arabind.com" # replace with real Arabind domain
START_PATHS = [
    "/collections/grocery",
    "/collections/beverages",
] # category listing pages to start from
OUTPUT_JSON = "arabind_products.json"
OUTPUT_CSV = "arabind_products.csv"
MAX_PAGES_PER_CATEGORY = 5
REQUEST_TIMEOUT = 10
MIN_DELAY = 1.0
MAX_DELAY = 3.0
RETRY_COUNT = 3
# -----------------------------------

def can_fetch(url: str, user_agent: str = SESSION.headers["User-Agent"]) -> bool:
    """Check robots.txt for permission to scrape the given URL."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = urllib.robotparser.RobotFileParser()
    try:
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(user_agent, url)
    except Exception:
        # If robots.txt isn't accessible, default to conservative False
        logging.warning("Could not read robots.txt; proceeding cautiously.")
        return False

def polite_get(url: str, session: requests.Session = SESSION, timeout: int = REQUEST_TIMEOUT) -> Optional[requests.Response]:
    """GET with retries and polite sleep."""
    for attempt in range(1, RETRY_COUNT + 1):
        try:
            resp = session.get(url, timeout=timeout)
            if resp.status_code == 200:
                delay = random.uniform(MIN_DELAY, MAX_DELAY)
                time.sleep(delay)
                return resp
            else:
                logging.warning(f"GET {url} returned status {resp.status_code}")
                if 500 <= resp.status_code < 600:
                    time.sleep(2 ** attempt)
                else:
                    return None
        except requests.RequestException as e:
            logging.warning(f"RequestException on {url}: {e} (attempt {attempt})")
            time.sleep(2 ** attempt)
    logging.error(f"Failed to GET {url} after {RETRY_COUNT} attempts")
    return None

def parse_product_card(card: Tag, base_url: str = BASE_URL) -> Dict:
    """Parse a product card element into structured data.
    NOTE: Update selectors according to actual site markup.
    """
    # Name
    name_tag = card.select_one(".product-title, h2.product-title, .title")
    name = name_tag.get_text(strip=True) if name_tag else None

    # URL
    a_tag = card.select_one("a[href]")
    url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None

    # Price
    price_tag = card.select_one(".price, .product-price, .money")
    price = price_tag.get_text(strip=True) if price_tag else None

    # Availability (best-effort)
    availability_tag = card.select_one(".availability, .stock, .sold-out")
    if availability_tag:
        availability = availability_tag.get_text(strip=True)
    else:
        add_to_cart = card.select_one(".add-to-cart, button.add-to-cart")
        availability = "In Stock" if add_to_cart else "Unknown"

    # SKU / Product code (if present)
    sku_tag = card.select_one(".sku, .product-sku")
    sku = sku_tag.get_text(strip=True) if sku_tag else None

    # Category (optional: inferred from ancestor or provided externally)
    category_tag = card.select_one(".product-category")
    category = category_tag.get_text(strip=True) if category_tag else None

    return {
        "name": name,
        "price": price,
        "availability": availability,
        "sku": sku,
        "url": url,
        "category": category
    }

def extract_products_from_listing(listing_html: str, base_url: str = BASE_URL) -> List[Dict]:
    """Extract product entries from a category listing page HTML."""
    soup = BeautifulSoup(listing_html, "html.parser")
    # Find product blocks - update selector to match the site's markup
    product_cards = soup.select(".product-card, .product, .grid-item")
    results = []
    for card in product_cards:
        try:
            prod = parse_product_card(card, base_url)
            results.append(prod)
        except Exception as e:
            logging.warning(f"Error parsing product card: {e}")
    return results

def find_pagination_urls(listing_html: str, base_url: str = BASE_URL) -> List[str]:
    """Extract pagination links from a listing page to follow next pages."""
    soup = BeautifulSoup(listing_html, "html.parser")
    links = []
    for a in soup.select("a[href]"):
        href = a.get("href")
        if href and "page=" in href:
            links.append(urljoin(base_url, href))
    # Deduplicate while preserving order
    seen = set()
    deduped = []
    for l in links:
        if l not in seen:
            deduped.append(l)
            seen.add(l)
    return deduped

def scrape_category(path: str) -> List[Dict]:
    """Scrape up to MAX_PAGES_PER_CATEGORY pages for a given category path."""
    start_url = urljoin(BASE_URL, path)
    if not can_fetch(start_url):
        logging.error(f"robots.txt disallows scraping {start_url}. Aborting category.")
        return []

    products = []
    logging.info(f"Scraping category start: {start_url}")
    resp = polite_get(start_url)
    if not resp:
        return products

    listing_html = resp.text
    products.extend(extract_products_from_listing(listing_html))

    pagination_urls = find_pagination_urls(listing_html)
    # Limit pages and ensure full absolute URLs
    pagination_urls = [url for url in pagination_urls if urlparse(url).netloc == urlparse(BASE_URL).netloc]
    pagination_urls = pagination_urls[:MAX_PAGES_PER_CATEGORY - 1] # already scraped page 1

    for purl in pagination_urls:
        if not can_fetch(purl):
            logging.warning(f"Skipping paginated URL due to robots.txt: {purl}")
            continue
        logging.info(f"Scraping paginated URL: {purl}")
        presp = polite_get(purl)
        if not presp:
            continue
        products.extend(extract_products_from_listing(presp.text))

    return products

def normalize_price(price_str: Optional[str]) -> Optional[float]:
    """Attempt to parse a price string into a float (best-effort)."""
    if not price_str:
        return None
    import re
    # remove currency symbols and commas
    cleaned = re.sub(r"[^\d\.]", "", price_str)
    try:
        return float(cleaned) if cleaned else None
    except ValueError:
        return None

def main():
    all_products = []
    for path in START_PATHS:
        try:
            category_results = scrape_category(path)
            for p in category_results:
                # Post-process fields
                p["price_float"] = normalize_price(p.get("price"))
                if not p.get("category"):
                    # infer category from path if not present
                    p["category"] = path.strip("/").split("/")[-1]
                all_products.append(p)
            logging.info(f"Scraped {len(category_results)} products from {path}")
        except Exception as e:
            logging.exception(f"Unhandled error scraping category {path}: {e}")

    # Deduplicate by URL or SKU
    df = pd.DataFrame(all_products)
    if "url" in df.columns:
        df = df.drop_duplicates(subset=["url"])
    elif "sku" in df.columns:
        df = df.drop_duplicates(subset=["sku"])
    df = df.fillna("")  # replace NaN with empty strings for output

    # Save to JSON
    records = df.to_dict(orient="records")
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

    # Save to CSV
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

    logging.info(f"Saved {len(df)} unique products to {OUTPUT_JSON} and {OUTPUT_CSV}")

if __name__ == "__main__":
    main()

Integrations with Arabind Data Scraper – Arabind Data Extraction

Integrating the Arabind Data Scraper into existing workflows allows businesses to unlock powerful insights and streamline operations. The tool can be configured to deliver structured outputs that plug directly into dashboards, ERP systems, or pricing engines, ensuring accurate data extraction at scale, reducing manual effort, and eliminating errors in catalog management. With seamless integration through a Grocery Data Scraping API, businesses can automate the flow of product listings, price changes, and availability data from Arabind into real-time analytics platforms. This integration enables companies to perform competitor benchmarking, dynamic pricing, and smarter inventory management without disruptions. By connecting the Arabind Data Scraper to core business systems, organizations gain the agility to react instantly to market changes and consumer demands, creating a data-driven approach that supports sustainable e-commerce growth and enhanced customer satisfaction.
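
A minimal integration sketch follows, assuming the CSV produced by the sample scraper above (arabind_products.csv) and a local SQLite database standing in for a BI or ERP data store.

import sqlite3
import pandas as pd

# Hypothetical integration step: load the sample scraper's CSV output and push it into
# a SQLite table that a dashboard or reporting tool can query.
# "analytics.db" and the table name are placeholders for your own data store.
df = pd.read_csv("arabind_products.csv")

conn = sqlite3.connect("analytics.db")
df.to_sql("arabind_products", conn, if_exists="replace", index=False)
conn.close()

print(f"Loaded {len(df)} Arabind product rows into analytics.db")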

Executing Arabind Data Scraping Actor with Real Data API

Executing an Arabind Data Scraping Actor with Real Data API enables businesses to collect accurate, structured data efficiently. By configuring the scraping actor, companies can pull product details, prices, stock availability, and category information directly into a Grocery Dataset for analysis. This approach ensures that retailers and analysts always have up-to-date insights to make data-driven decisions. Similar to Arabind API scraping, the Real Data API supports automated scheduling, allowing scrapers to run at regular intervals without manual intervention. Extracted datasets can be seamlessly integrated into dashboards, ERP systems, or pricing engines, providing real-time visibility into Arabind’s product catalog. Leveraging this method reduces manual errors, accelerates workflows, and improves operational efficiency. With Real Data API, businesses can scale data collection, monitor trends, and optimize pricing and inventory strategies, turning raw Arabind data into actionable intelligence for smarter e-commerce growth.

You need a Real Data API account to execute the program examples below. Replace the empty token placeholder in each example with your own API token. See the Real Data API docs for more details on the live APIs.

import { RealdataAPIClient } from 'RealDataAPI-client';

// Initialize the RealdataAPIClient with API token
const client = new RealdataAPIClient({
    token: '',
});

// Prepare actor input
const input = {
    "categoryOrProductUrls": [
        {
            "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
        }
    ],
    "maxItems": 100,
    "proxyConfiguration": {
        "useRealDataAPIProxy": true
    }
};

(async () => {
    // Run the actor and wait for it to finish
    const run = await client.actor("junglee/amazon-crawler").call(input);

    // Fetch and print actor results from the run's dataset (if any)
    console.log('Results from dataset');
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    items.forEach((item) => {
        console.dir(item);
    });
})();

from realdataapi_client import RealdataAPIClient

# Initialize the RealdataAPIClient with your API token
client = RealdataAPIClient("")

# Prepare the actor input
run_input = {
    "categoryOrProductUrls": [{ "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5" }],
    "maxItems": 100,
    "proxyConfiguration": { "useRealDataAPIProxy": True },
}

# Run the actor and wait for it to finish
run = client.actor("junglee/amazon-crawler").call(run_input=run_input)

# Fetch and print actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)

# Set API token
API_TOKEN=<YOUR_API_TOKEN>

# Prepare actor input
cat > input.json <<'EOF'
{
  "categoryOrProductUrls": [
    {
      "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
    }
  ],
  "maxItems": 100,
  "proxyConfiguration": {
    "useRealDataAPIProxy": true
  }
}
EOF

# Run the actor
curl "https://api.realdataapi.com/v2/acts/junglee~amazon-crawler/runs?token=$API_TOKEN" \
  -X POST \
  -d @input.json \
  -H 'Content-Type: application/json'

Provide the Amazon product URLs

productUrls Required Array

Provide one or more Amazon product URLs you wish to extract.

Max reviews

Max reviews Optional Integer

Set the maximum number of reviews to scrape. To scrape all reviews, leave this blank.

Link selector

linkSelector Optional String

A CSS selector specifying which links on the page (<a> elements with an href attribute) should be followed and added to the request queue. To filter the links added to the queue, use the Pseudo-URLs and/or Glob patterns settings. If the Link selector is empty, page links are ignored. For details, see the Link selector section in the README.

Include personal data

includeGdprSensitive Optional Array

Personal information such as names, IDs, or profile pictures is protected by the GDPR in the European Union and by similar regulations worldwide. You must not extract personal information without a legitimate legal reason.

Reviews sort

sort Optional String

Choose the criterion used to sort the reviews being scraped. The default is Amazon's HELPFUL ordering.

Options:

RECENT, HELPFUL

Proxy configuration

proxyConfiguration Required Object

You can select proxy groups from specific countries. Amazon displays products it can deliver to your location based on your proxy, so there is no need to worry about this setting if globally shipped products are sufficient for your needs.

Extended output function

extendedOutputFunction Optional String

Enter a function that receives a jQuery handle as its argument and returns customized scraped data. The returned data is merged into the default result.

{
  "categoryOrProductUrls": [
    {
      "url": "https://www.amazon.com/s?i=specialty-aps&bbn=16225009011&rh=n%3A%2116225009011%2Cn%3A2811119011&ref=nav_em__nav_desktop_sa_intl_cell_phones_and_accessories_0_2_5_5"
    }
  ],
  "maxItems": 100,
  "detailedInformation": false,
  "useCaptchaSolver": false,
  "proxyConfiguration": {
    "useRealDataAPIProxy": true
  }
}