Web Scraping and API Data Collection
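Web scraping and APIs are essential for collecting data that is not available in standard datasets. This lesson covers the HTTP basics behind both, HTML scraping with BeautifulSoup and Scrapy, REST API integration, and how to collect data ethically (robots.txt, rate limiting, and legal compliance).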
HTTP Basics
<svg width="600" height="350" viewBox="0 0 600 350" xmlns="http://www.w3.org/2000/svg">
<rect width="600" height="350" fill="#f8f9fa" rx="10"/>
<text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">HTTP Request-Response Cycle</text>
<!-- Client -->
<rect x="50" y="120" width="120" height="80" fill="#3498db" rx="5"/>
<text x="110" y="155" text-anchor="middle" font-size="14" fill="white">Client</text>
<text x="110" y="175" text-anchor="middle" font-size="10" fill="white">(Browser/Python)</text>
<!-- Server -->
<rect x="430" y="120" width="120" height="80" fill="#2ecc71" rx="5"/>
<text x="490" y="155" text-anchor="middle" font-size="14" fill="white">Server</text>
<text x="490" y="175" text-anchor="middle" font-size="10" fill="white">(API/Website)</text>
<!-- Request Arrow -->
<line x1="170" y1="140" x2="430" y2="140" stroke="#e74c3c" stroke-width="3" marker-end="url(#arrow)"/>
<text x="300" y="135" text-anchor="middle" font-size="12" fill="#e74c3c">HTTP Request</text>
<text x="300" y="125" text-anchor="middle" font-size="10" fill="#7f8c8d">GET /api/data</text>
<!-- Response Arrow -->
<line x1="430" y1="180" x2="170" y2="180" stroke="#27ae60" stroke-width="3" marker-end="url(#arrow)"/>
<text x="300" y="195" text-anchor="middle" font-size="12" fill="#27ae60">HTTP Response</text>
<text x="300" y="205" text-anchor="middle" font-size="10" fill="#7f8c8d">200 OK + JSON/HTML</text>
<!-- Status Codes -->
<text x="300" y="260" text-anchor="middle" font-size="12" fill="#2c3e50">Common Status Codes:</text>
<text x="300" y="280" text-anchor="middle" font-size="10" fill="#27ae60">200 OK | 301 Redirect | 404 Not Found | 429 Rate Limited | 500 Server Error</text>
<defs>
<marker id="arrow" markerWidth="10" markerHeight="10" refX="0" refY="3" orient="auto">
<path d="M0,0 L0,6 L9,3 z" fill="#7f8c8d"/>
</marker>
</defs>
</svg>
BeautifulSoup for HTML Scraping
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Basic scraping workflow
def extract_products(soup):
    """Return one ``{'title', 'price'}`` dict per product found in *soup*.

    Args:
        soup: A parsed page (``BeautifulSoup`` object) containing
            ``<h2 class="product-title">`` and ``<span class="price">`` tags.

    Returns:
        A list of dicts with the stripped title text and the price as a float.

    Raises:
        ValueError: If a price cannot be converted to a number after the
            currency symbol and thousands separators are removed.
    """
    # Find elements
    titles = soup.find_all('h2', class_='product-title')
    prices = soup.find_all('span', class_='price')

    # Extract data
    # NOTE: zip() pairs titles and prices purely by position and stops at the
    # shorter list, so a product with a missing price shifts every later pair.
    return [
        {
            'title': title.text.strip(),
            # Drop '$' and thousands separators so "$1,299.00" parses too.
            'price': float(price.text.strip().replace('$', '').replace(',', '')),
        }
        for title, price in zip(titles, prices)
    ]


url = "https://example.com/products"
# requests has no default timeout; without one a stalled server hangs forever.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

data = extract_products(soup)
df = pd.DataFrame(data)
# Handle pagination
all_products = []
for page in range(1, 11):
    url = f"https://example.com/products?page={page}"
    # requests has no default timeout; without one a stalled server hangs forever.
    response = requests.get(url, timeout=10)
    # Stop on 4xx/5xx rather than parsing an error page as a product listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract and append products
    products = extract_products(soup)
    if not products:
        # An empty page means we ran past the last one; skip the remaining
        # requests instead of hitting the server for nothing.
        break
    all_products.extend(products)
    # In real use, pause between page requests so the site is not hammered.
Scrapy Framework
import scrapy
class ProductSpider(scrapy.Spider):
    """Crawl the example product listing and yield one item per product.

    Starts at ``start_urls`` and keeps following the ``a.next`` link until a
    page has no such link.
    """

    name = 'products'
    start_urls = ['https://example.com/products']

    # Per-spider overrides so the crawl stays polite even when the spider is
    # run outside a project whose settings already enable these.
    custom_settings = {
        'ROBOTSTXT_OBEY': True,  # do not fetch URLs disallowed by robots.txt
        'DOWNLOAD_DELAY': 1,     # base delay (seconds) between requests to a site
    }

    def parse(self, response):
        """Yield a dict per ``div.product`` element, then the next-page request.

        Each field is ``None`` when its selector matches nothing on the page.
        """
        for product in response.css('div.product'):
            yield {
                'title': product.css('h2::text').get(),
                'price': product.css('span.price::text').get(),
                'rating': product.css('div.rating::attr(data-rating)').get(),
            }

        # Follow pagination
        next_page = response.css('a.next::attr(href)').get()
        if next_page:
            # response.follow accepts the relative href as-is.
            yield response.follow(next_page, self.parse)
REST API Integration
import requests
import pandas as pd
from datetime import datetime
# Basic API request
api_url = "https://api.example.com/v1/data"
# YOUR_API_KEY is a placeholder: load the real key from an environment
# variable or secrets store rather than hard-coding it in source.
headers = {
    "Authorization": "Bearer YOUR_API_KEY",
    "Content-Type": "application/json"
}
params = {
    "start_date": "2024-01-01",
    "end_date": "2024-12-31",
    "limit": 1000
}

# requests has no default timeout; without one a stalled server hangs forever.
response = requests.get(api_url, headers=headers, params=params, timeout=30)

if response.status_code == 200:
    data = response.json()
    df = pd.DataFrame(data['results'])
else:
    print(f"Error: {response.status_code}")
    # Keep `df` defined so code further down does not fail with NameError.
    df = pd.DataFrame()
# Pagination handling
def fetch_all_pages(api_url, headers, params, session=None, timeout=30):
    """Collect the ``results`` of every page of a paginated endpoint.

    Pages are requested with ``page=1, 2, ...`` until the response body has a
    falsy or missing ``has_next_page`` field, or a non-200 status is returned.

    Args:
        api_url: Endpoint URL.
        headers: HTTP headers sent with every request.
        params: Base query parameters. The dict is copied, never modified.
        session: Optional object with a ``requests``-compatible ``get`` method
            (e.g. a ``requests.Session`` for connection reuse). Defaults to
            the ``requests`` module itself.
        timeout: Per-request timeout in seconds.

    Returns:
        A list with the items of every successfully fetched page. If a request
        returns a non-200 status, an error line is printed and the items
        gathered so far are returned.

    Raises:
        KeyError: If a 200 response body has no ``results`` key.
    """
    http = requests if session is None else session
    # Work on a copy so the caller's dict does not gain a stray 'page' key.
    query = dict(params or {})
    all_data = []
    page = 1
    while True:
        query['page'] = page
        response = http.get(api_url, headers=headers, params=query, timeout=timeout)
        if response.status_code != 200:
            # Best-effort: keep what we have, but do not fail silently.
            print(f"Error: {response.status_code} on page {page}; returning partial results")
            break
        data = response.json()
        all_data.extend(data['results'])
        if not data.get('has_next_page'):
            break
        page += 1
    return all_data
Rate Limiting and Ethics
import random
import time
from functools import lru_cache
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser
# Check robots.txt
def robots_txt_url(url):
    """Return the robots.txt URL for the site that serves *url*.

    robots.txt always lives at the root of the host, whatever the page path.

    Raises:
        ValueError: If *url* is not absolute (missing scheme or host).
    """
    parts = urlsplit(url)
    if not parts.scheme or not parts.netloc:
        raise ValueError(f"Expected an absolute URL, got {url!r}")
    return f"{parts.scheme}://{parts.netloc}/robots.txt"


@lru_cache(maxsize=128)
def _load_robots(robots_url):
    """Download and parse one robots.txt, cached per URL for this process."""
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def can_scrape(url, user_agent="*"):
    """Return True if the site's robots.txt allows *user_agent* to fetch *url*.

    Args:
        url: Absolute URL of the page you intend to request.
        user_agent: User-agent name to check the rules for. Defaults to
            ``"*"``, the rules that apply to any crawler.

    Raises:
        ValueError: If *url* is not an absolute URL.

    Errors raised by ``RobotFileParser.read()`` while downloading robots.txt
    are not caught here. The parsed file is cached per site, so checking many
    URLs on one host downloads robots.txt once; call
    ``_load_robots.cache_clear()`` to force a re-download.
    """
    return _load_robots(robots_txt_url(url)).can_fetch(user_agent, url)
# Rate limiting
class RateLimiter:
    """Enforce a minimum spacing between consecutive requests.

    Call :meth:`wait` immediately before each request; it sleeps just long
    enough to keep the rate at or below ``requests_per_second``.
    """

    def __init__(self, requests_per_second=1):
        """Create a limiter.

        Args:
            requests_per_second: Maximum request rate; must be positive.

        Raises:
            ValueError: If ``requests_per_second`` is zero or negative.
        """
        if requests_per_second <= 0:
            raise ValueError("requests_per_second must be positive")
        # Minimum number of seconds between two requests.
        self.delay = 1 / requests_per_second
        # -inf makes the elapsed time infinite on the first call, so the very
        # first wait() never sleeps, whatever the monotonic clock's origin is.
        self.last_request = float('-inf')

    def wait(self):
        """Block until at least ``delay`` seconds have passed since the last call."""
        # The monotonic clock is immune to wall-clock adjustments (NTP, DST),
        # which could otherwise cause a long stall or no throttling at all.
        elapsed = time.monotonic() - self.last_request
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed)
        self.last_request = time.monotonic()
# Usage with polite scraping
limiter = RateLimiter(requests_per_second=2)
# Identify the bot and give site owners a way to contact you. Built once,
# outside the loop, because it is the same for every request.
request_headers = {
    'User-Agent': 'DataScienceBot/1.0 (contact@example.com)'
}

# `urls` is the collection of page URLs to fetch; define it before this loop.
for url in urls:
    if not can_scrape(url):
        continue  # disallowed by robots.txt
    limiter.wait()
    try:
        # requests has no default timeout; without one a stalled server
        # would block the whole crawl.
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One bad URL should not abort the rest of the crawl.
        print(f"Skipping {url}: {exc}")
        continue
    # Process response
Key Takeaways
- Always check robots.txt and terms of service
- Implement rate limiting to be respectful
- Use APIs when available - they're more reliable
- Handle errors and retries gracefully
- Store data responsibly and check legal compliance