CW

Web Scraping and API Data Collection

Module 5: Data Collection & SQL — Free Lesson

Advertisement

Web Scraping and API Data Collection

Web scraping and APIs are essential for collecting data not available in standard datasets. This lesson covers ethical approaches to data collection.

HTTP Basics

<svg width="600" height="350" viewBox="0 0 600 350" xmlns="http://www.w3.org/2000/svg">
  <rect width="600" height="350" fill="#f8f9fa" rx="10"/>
  <text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">HTTP Request-Response Cycle</text>
  
  <!-- Client -->
  <rect x="50" y="120" width="120" height="80" fill="#3498db" rx="5"/>
  <text x="110" y="155" text-anchor="middle" font-size="14" fill="white">Client</text>
  <text x="110" y="175" text-anchor="middle" font-size="10" fill="white">(Browser/Python)</text>
  
  <!-- Server -->
  <rect x="430" y="120" width="120" height="80" fill="#2ecc71" rx="5"/>
  <text x="490" y="155" text-anchor="middle" font-size="14" fill="white">Server</text>
  <text x="490" y="175" text-anchor="middle" font-size="10" fill="white">(API/Website)</text>
  
  <!-- Request Arrow -->
  <line x1="170" y1="140" x2="430" y2="140" stroke="#e74c3c" stroke-width="3" marker-end="url(#arrow)"/>
  <text x="300" y="135" text-anchor="middle" font-size="12" fill="#e74c3c">HTTP Request</text>
  <text x="300" y="125" text-anchor="middle" font-size="10" fill="#7f8c8d">GET /api/data</text>
  
  <!-- Response Arrow -->
  <line x1="430" y1="180" x2="170" y2="180" stroke="#27ae60" stroke-width="3" marker-end="url(#arrow)"/>
  <text x="300" y="195" text-anchor="middle" font-size="12" fill="#27ae60">HTTP Response</text>
  <text x="300" y="205" text-anchor="middle" font-size="10" fill="#7f8c8d">200 OK + JSON/HTML</text>
  
  <!-- Status Codes -->
  <text x="300" y="260" text-anchor="middle" font-size="12" fill="#2c3e50">Common Status Codes:</text>
  <text x="300" y="280" text-anchor="middle" font-size="10" fill="#27ae60">200 OK | 301 Redirect | 404 Not Found | 429 Rate Limited | 500 Server Error</text>
  
  <defs>
    <marker id="arrow" markerWidth="10" markerHeight="10" refX="9" refY="3" orient="auto">
      <path d="M0,0 L0,6 L9,3 z" fill="#7f8c8d"/>
    </marker>
  </defs>
</svg>

BeautifulSoup for HTML Scraping

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Basic scraping workflow
url = "https://example.com/products"
# Always bound the request: without a timeout, requests may wait on an
# unresponsive server indefinitely.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find elements
titles = soup.find_all('h2', class_='product-title')
prices = soup.find_all('span', class_='price')

# Extract data
# NOTE: zip() pairs titles and prices purely by position and stops at the
# shorter list, so this assumes every product has exactly one of each.
data = [
    {
        'title': title.text.strip(),
        # Remove the currency symbol and thousands separators ("$1,299.00")
        # so float() can parse the number.
        'price': float(price.text.strip().replace('$', '').replace(',', '')),
    }
    for title, price in zip(titles, prices)
]

df = pd.DataFrame(data)

# Handle pagination
def extract_products(soup):
    """Return the products found on one parsed listing page.

    Pairs each ``h2.product-title`` element with the ``span.price`` element
    at the same position.

    Args:
        soup: A ``BeautifulSoup`` document for a single listing page.

    Returns:
        A list of ``{'title': str, 'price': float}`` dicts, one per
        title/price pair (empty when the page has no matching elements).

    Raises:
        ValueError: If a price's text is not a number after removing ``$``
            and ``,``.
    """
    titles = soup.find_all('h2', class_='product-title')
    prices = soup.find_all('span', class_='price')
    return [
        {
            'title': title.text.strip(),
            # Remove the currency symbol and thousands separators before
            # converting to a number.
            'price': float(price.text.strip().replace('$', '').replace(',', '')),
        }
        for title, price in zip(titles, prices)
    ]


all_products = []
for page in range(1, 11):
    url = f"https://example.com/products?page={page}"
    # Bound each request and stop on HTTP errors rather than silently
    # parsing an error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract and append products
    products = extract_products(soup)
    all_products.extend(products)

Scrapy Framework

import scrapy

class ProductSpider(scrapy.Spider):
    """Spider that yields one item per ``div.product`` and follows ``a.next`` links."""

    name = 'products'
    start_urls = ['https://example.com/products']

    def parse(self, response):
        """Yield product dicts for this page, then a request for the next page.

        Each item carries ``title``, ``price`` and ``rating``; a field is
        ``None`` when its selector matches nothing.
        """
        for card in response.css('div.product'):
            item = {}
            item['title'] = card.css('h2::text').get()
            item['price'] = card.css('span.price::text').get()
            item['rating'] = card.css('div.rating::attr(data-rating)').get()
            yield item

        # Follow pagination
        next_href = response.css('a.next::attr(href)').get()
        if not next_href:
            return
        yield response.follow(next_href, self.parse)

REST API Integration

import requests
import pandas as pd
from datetime import datetime

# Basic API request
api_url = "https://api.example.com/v1/data"
headers = {
    "Authorization": "Bearer YOUR_API_KEY",
    "Content-Type": "application/json"
}
params = {
    "start_date": "2024-01-01",
    "end_date": "2024-12-31",
    "limit": 1000
}

# Pass a timeout so a stalled server cannot block the script forever;
# requests applies no timeout unless one is given.
response = requests.get(api_url, headers=headers, params=params, timeout=10)

if response.status_code == 200:
    data = response.json()
    # The payload is expected to hold its rows under the 'results' key.
    df = pd.DataFrame(data['results'])
else:
    print(f"Error: {response.status_code}")

# Pagination handling
def fetch_all_pages(api_url, headers, params):
    """Collect the ``results`` of every page of a paginated endpoint.

    Requests pages 1, 2, 3, ... by adding a ``page`` query parameter, and
    stops at the first non-200 response or when the payload's
    ``has_next_page`` value is missing or falsy.

    Args:
        api_url: Endpoint URL.
        headers: HTTP headers sent with every request.
        params: Base query parameters (may be ``None``). The mapping is
            copied, so the caller's object is never modified.

    Returns:
        A list with the concatenated ``results`` of all fetched pages. A
        non-200 response ends the loop, so the list may be partial.

    Raises:
        KeyError: If a 200 response has no ``results`` key.
    """
    # Work on a copy: writing 'page' into the caller's dict would leak the
    # last page number into any later request that reuses it.
    query = dict(params or {})
    all_data = []
    page = 1

    while True:
        query['page'] = page
        # Bound each request so one stalled page cannot hang the whole loop.
        response = requests.get(api_url, headers=headers, params=query, timeout=10)

        if response.status_code != 200:
            break

        data = response.json()
        all_data.extend(data['results'])

        if not data.get('has_next_page'):
            break
        page += 1

    return all_data

Rate Limiting and Ethics

import time
import random
from urllib.robotparser import RobotFileParser

# Check robots.txt
def can_scrape(url):
    """Return whether the site's robots.txt lets any user agent fetch ``url``.

    robots.txt is always read from the root of the URL's host
    (``scheme://host/robots.txt``), whatever path ``url`` points at.

    Args:
        url: Absolute URL of the page to be fetched.

    Returns:
        ``True`` if the rules for user agent ``"*"`` allow ``url``,
        ``False`` otherwise.

    Raises:
        urllib.error.URLError: If robots.txt cannot be retrieved because of
            a network-level failure.
    """
    from urllib.parse import urlsplit, urlunsplit

    # Appending "/robots.txt" to a page URL would request
    # "https://host/some/page/robots.txt", which is not where the file
    # lives; rebuild the URL from scheme and host only.
    parts = urlsplit(url)
    robots_url = urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))

    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)

# Rate limiting
class RateLimiter:
    """Keep consecutive calls to :meth:`wait` at least ``delay`` seconds apart.

    Call ``wait()`` immediately before each request.

    Attributes:
        delay: Minimum number of seconds between two ``wait()`` returns.
        last_request: ``time.monotonic()`` reading taken when ``wait()`` last
            returned; ``-inf`` until the first call.
    """

    def __init__(self, requests_per_second=1):
        """Create a limiter allowing ``requests_per_second`` calls per second.

        Raises:
            ValueError: If ``requests_per_second`` is not greater than zero.
        """
        # Zero would divide by zero and a negative rate would yield a
        # negative delay that never throttles; reject both up front.
        if requests_per_second <= 0:
            raise ValueError(
                f"requests_per_second must be positive, got {requests_per_second!r}"
            )
        self.delay = 1 / requests_per_second
        # -inf means "no previous request", so the first wait() never sleeps.
        self.last_request = float("-inf")

    def wait(self):
        """Sleep as long as needed to honour ``delay``, then record the time."""
        # Use the monotonic clock: a wall-clock adjustment (e.g. an NTP step
        # backwards) must not produce a negative elapsed time and a very
        # long sleep.
        elapsed = time.monotonic() - self.last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_request = time.monotonic()

# Usage with polite scraping
limiter = RateLimiter(requests_per_second=2)

# `urls` is not defined in this snippet; supply an iterable of page URLs.
for url in urls:
    # Skip disallowed pages before spending a rate-limit slot on them.
    if not can_scrape(url):
        continue
    limiter.wait()
    # Identify the bot with a contact address, and bound the request so an
    # unresponsive server cannot stall the crawl.
    response = requests.get(
        url,
        headers={'User-Agent': 'DataScienceBot/1.0 (contact@example.com)'},
        timeout=10,
    )
    # Process response
        # Process response

Key Takeaways

  1. Always check robots.txt and terms of service
  2. Implement rate limiting to be respectful
  3. Use APIs when available - they're more reliable
  4. Handle errors and retries gracefully
  5. Store data responsibly and check legal compliance

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement