# scrape

> Build Python web scrapers with BeautifulSoup and requests. Includes rate limiting, user-agent rotation, retry logic, structured CSV/JSON output, and error handling. Use when creating tools to extract data from websites or building data collection scripts.

- Author: BAM
- Repository: bamussel23/claude-skills
- Version: 20260207211010
- Stars: 0
- Forks: 0
- Last Updated: 2026-02-08
- Source: https://github.com/bamussel23/claude-skills
- Web: https://mule.run/skillshub/@@bamussel23/claude-skills~scrape:20260207211010

---

---
name: scrape
description: >
  Build Python web scrapers with BeautifulSoup and requests. Includes rate
  limiting, user-agent rotation, retry logic, structured CSV/JSON output, and
  error handling. Use when creating tools to extract data from websites or
  building data collection scripts.
---

# Web Scraper Builder

You build robust Python web scrapers using libraries available on this system.

## Available libraries

- `requests` — HTTP client (install if needed: `pip3 install requests`)
- `beautifulsoup4` — HTML parsing (`pip3 install beautifulsoup4`)
- `pandas` — Data structuring and CSV/JSON export (already installed)
- `lxml` — Fast HTML/XML parser (`pip3 install lxml`)

## Scraper template

```python
"""Scraper for <site>."""

import csv
import json
import logging
import random
import time
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# ── Configuration ───────────────────────────────────────────────────
BASE_URL = "https://example.com"
OUTPUT_PATH = Path("output/results.csv")
REQUEST_DELAY = (1.0, 3.0)  # Random delay range in seconds
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
]

# ── HTTP Session ────────────────────────────────────────────────────
session = requests.Session()
session.headers.update({
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
})


def fetch_page(url: str, retries: int = 3) -> BeautifulSoup | None:
    """Fetch a page with retry logic and rate limiting.

    Each attempt picks a random user agent and sleeps for a random delay
    drawn from REQUEST_DELAY before issuing the request.

    Args:
        url: Absolute URL to fetch.
        retries: Maximum number of attempts.

    Returns:
        The parsed page, or None if no attempt succeeded.
    """
    for attempt in range(retries):
        try:
            session.headers["User-Agent"] = random.choice(USER_AGENTS)
            time.sleep(random.uniform(*REQUEST_DELAY))
            response = session.get(url, timeout=15)
            response.raise_for_status()
            return BeautifulSoup(response.text, "lxml")
        except requests.RequestException as e:
            logger.warning("Attempt %d failed for %s: %s", attempt + 1, url, e)
            # A 4xx response (other than request-timeout / rate-limit) will
            # not succeed on retry, so stop instead of hammering the server.
            status = getattr(e.response, "status_code", None)
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                break
            if attempt < retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
    return None


def parse_item(element) -> dict[str, Any]:
    """Extract data from a single item element.

    Missing tags or attributes yield an empty string rather than an error.
    """
    heading = element.select_one("h2")
    anchor = element.select_one("a")
    return {
        "title": heading.get_text(strip=True) if heading is not None else "",
        # .get() avoids a KeyError when the <a> tag has no href attribute.
        "link": anchor.get("href", "") if anchor is not None else "",
        # Add more fields as needed
    }


def scrape(max_pages: int | None = None) -> list[dict[str, Any]]:
    """Main scraping loop.

    Walks BASE_URL/page/1, /page/2, ... until a page fails to load, a page
    contains no items, or max_pages is reached.

    Args:
        max_pages: Optional cap on the number of pages; None means no cap.

    Returns:
        All records collected, including partial results if interrupted.
    """
    results: list[dict[str, Any]] = []
    page = 1
    try:
        while max_pages is None or page <= max_pages:
            url = f"{BASE_URL}/page/{page}"
            logger.info("Scraping page %d: %s", page, url)

            soup = fetch_page(url)
            if soup is None:
                logger.error("Failed to fetch page %d, stopping.", page)
                break

            items = soup.select(".item-selector")  # Adjust selector
            if not items:
                logger.info("No more items found on page %d, done.", page)
                break

            results.extend(parse_item(item) for item in items)
            page += 1
    except KeyboardInterrupt:
        # Graceful stop: keep what was collected so the caller can save it.
        logger.warning(
            "Interrupted on page %d, keeping %d records collected so far.",
            page,
            len(results),
        )
    return results


def save_csv(data: list[dict[str, Any]], path: Path) -> None:
    """Save results to CSV.

    The header is the union of keys across all rows (in first-seen order),
    so rows with extra or missing fields do not raise; gaps are left blank.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    if not data:
        logger.warning("No data to save.")
        return

    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(data)
    logger.info("Saved %d records to %s", len(data), path)


def save_json(data: list[dict[str, Any]], path: Path) -> None:
    """Save results to a JSON array (use for nested or non-tabular data)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    if not data:
        logger.warning("No data to save.")
        return

    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    logger.info("Saved %d records to %s", len(data), path)


if __name__ == "__main__":
    results = scrape()
    save_csv(results, OUTPUT_PATH)
    # For JSON output instead:
    # save_json(results, OUTPUT_PATH.with_suffix(".json"))
```

## Best practices

1. **Rate limiting** — Always add random delays between requests (1-3 seconds)
2. **User-agent rotation** — Rotate between realistic browser user-agents
3. **Retry with backoff** — Exponential backoff on failures (2^attempt seconds)
4. **Timeout** — Set request timeout (15s recommended)
5. **Session reuse** — Use `requests.Session()` for connection pooling
6. **Graceful stopping** — Handle KeyboardInterrupt for partial saves (as `scrape()` does in the template)
7. **Logging** — Log every page fetch and error for debugging
8. **Respect robots.txt** — Check before scraping commercial sites
9. **Output** — Default to CSV with `csv.DictWriter` for structured data; use `save_json` for nested data

## Security notes

- Never scrape sites that explicitly prohibit it in their ToS
- Never store scraped personal data without consent
- Never hammer a server — always rate limit
- Use sessions but don't store cookies across runs unless needed