Python Web Scraping — BeautifulSoup & Beyond
Web scraping extracts data from websites. It's used for data collection, price monitoring, research, and building datasets. This guide covers ethical scraping practices and technical implementation.
Learning Objectives
- Parse HTML with BeautifulSoup using find, find_all, and CSS selectors
- Handle pagination, dynamic content, and different page structures
- Respect robots.txt, rate limits, and website terms of service
- Store scraped data efficiently as JSON or CSV
- Handle common scraping challenges (JavaScript-rendered content, CAPTCHAs)
BeautifulSoup Basics
BeautifulSoup parses HTML and provides Pythonic ways to navigate the DOM:
from bs4 import BeautifulSoup
import requests
# Fetch and parse a webpage
response = requests.get('https://example.com', timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# Find the page title. find() returns None when the tag is absent, so guard
# before touching .text instead of assuming every page has a <title>.
title_tag = soup.find('title')
title = title_tag.text if title_tag else ''
print(f"Page title: {title}")

# Find all links
links = soup.find_all('a')
for link in links:
    href = link.get('href', '')  # .get() tolerates <a> tags without an href
    text = link.text.strip()
    print(f"Link: {text} -> {href}")

# Find first heading
h1 = soup.find('h1')
if h1:
    print(f"Main heading: {h1.text}")
Parsing Different Sources
from bs4 import BeautifulSoup
# Parse from string
html_string = "<html><body><h1>Hello</h1></body></html>"
soup = BeautifulSoup(html_string, 'html.parser')

# Parse from file (an open file handle can be passed directly)
with open('page.html', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Different parsers: each line below is an alternative, pick one
soup = BeautifulSoup(html_string, 'html.parser')  # Built-in (no extra deps)
soup = BeautifulSoup(html_string, 'lxml')  # Faster, needs lxml
soup = BeautifulSoup(html_string, 'html5lib')  # Most lenient, handles broken HTML
Finding Elements
BeautifulSoup offers multiple ways to find elements:
find() and find_all()
from bs4 import BeautifulSoup
# NOTE: `html` is a placeholder for the markup to parse; it is not defined in this snippet.
soup = BeautifulSoup(html, 'html.parser')

# Find first matching element (find() gives None when nothing matches)
first_div = soup.find('div')
first_paragraph = soup.find('p', class_='intro')  # class_ because `class` is a Python keyword
first_link = soup.find('a', href='/about')

# Find all matching elements (find_all() gives a list, empty when nothing matches)
all_paragraphs = soup.find_all('p')
all_links = soup.find_all('a')
all_images = soup.find_all('img')

# Filter by attributes: keyword arguments are matched against tag attributes
links_with_class = soup.find_all('a', class_='nav-link')
divs_with_id = soup.find_all('div', id='content')
inputs_with_type = soup.find_all('input', type='text')

# Filter by text content (exact match on the tag's string)
paragraphs_hello = soup.find_all('p', string='Hello')

# Or use a function; the `text and ...` guard skips tags whose string is None
paragraphs_with_hello = soup.find_all('p', string=lambda text: text and 'hello' in text.lower())
CSS Selectors
from bs4 import BeautifulSoup
# NOTE: `html` is a placeholder for the markup to parse; it is not defined in this snippet.
soup = BeautifulSoup(html, 'html.parser')

# select() returns a list of every match; select_one() returns only the first match.

# Select by tag
paragraphs = soup.select('p')

# Select by class
intro = soup.select('.intro')

# Select by ID
content = soup.select('#content')

# Combinators
div_paragraphs = soup.select('div > p') # Direct children
all_paragraphs = soup.select('div p') # All descendants
first_child = soup.select_one('ul > li:first-child') # First list item

# Attribute selectors (^= prefix match, $= suffix match, = exact match)
links = soup.select('a[href^="https://"]') # Links starting with https
images = soup.select('img[src$=".jpg"]') # Images ending with .jpg
inputs = soup.select('input[type="text"]') # Text inputs

# Complex selectors: tag.class pieces combined with child/descendant combinators
product_cards = soup.select('div.product-card > h2.title')
nav_links = soup.select('nav ul li a.active')
Extracting Data
from bs4 import BeautifulSoup
# NOTE: `html` is a placeholder for the markup to parse; it is not defined in this snippet.
soup = BeautifulSoup(html, 'html.parser')

# Extract text
h1_text = soup.find('h1').text.strip()
link_text = soup.find('a').get_text(strip=True)

# Extract attributes. Indexing with [...] raises KeyError when the attribute
# is missing (and TypeError when find() returned None); use it only when the
# element and attribute are known to exist.
link_href = soup.find('a')['href']
img_src = soup.find('img')['src']
data_value = soup.find('div')['data-value']

# Extract multiple attributes; .get() falls back to a default instead of raising
img = soup.find('img')
src = img.get('src', '')
alt = img.get('alt', '')
width = img.get('width', '')

# Get all text from page
all_text = soup.get_text(separator='\n', strip=True)

# Get text with specific formatting (a list of names matches any of those tags)
for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
    print(f"{element.name}: {element.text.strip()}")
Structured Scraping Patterns
Scraping Product Listings
import requests
from bs4 import BeautifulSoup
import json
def scrape_products(url):
    """Scrape product information from an e-commerce page.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts with keys 'name', 'price' (float), 'rating' (float or
        None), 'url', 'image' (str or None) and 'in_stock' (bool). Cards that
        cannot be parsed are reported on stdout and skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    products = []
    for card in soup.select('.product-card'):
        try:
            # Look optional elements up once instead of once to test and once to read.
            rating_el = card.select_one('.rating')
            image_el = card.select_one('img')
            # Trim whitespace first: strip('$') alone leaves "\n $19.99 " unparseable.
            price_text = card.select_one('.price').text.strip().strip('$')
            product = {
                'name': card.select_one('.product-title').text.strip(),
                'price': float(price_text.replace(',', '')),
                'rating': float(rating_el.text) if rating_el else None,
                'url': card.select_one('a.product-link')['href'],
                'image': image_el['src'] if image_el else None,
                'in_stock': 'out-of-stock' not in card.get('class', []),
            }
        except (AttributeError, KeyError, ValueError, TypeError) as e:
            # KeyError covers a tag that exists but lacks href/src; without it
            # a single odd card would abort the whole scrape.
            print(f"Error parsing product: {e}")
            continue
        products.append(product)
    return products
Scraping News Articles
import requests
from bs4 import BeautifulSoup
from datetime import datetime
def scrape_article(url):
    """Scrape a news article page.

    Args:
        url: Address of the article page.

    Returns:
        A dict with keys 'title', 'author', 'date', 'content' (list of
        non-empty paragraph strings), 'url' and 'full_text' (the paragraphs
        joined by blank lines). Metadata that is not present is ''.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each element is looked up once; find() returns None when it is absent.
    heading = soup.find('h1')
    author_meta = soup.find('meta', attrs={'name': 'author'})
    time_tag = soup.find('time')

    article = {
        'title': heading.text.strip() if heading else '',
        # .get() rather than [...] so a tag lacking the attribute yields ''
        # instead of raising KeyError.
        'author': author_meta.get('content', '') if author_meta else '',
        'date': time_tag.get('datetime', '') if time_tag else '',
        'content': [],
        'url': url,
    }

    # Extract article body paragraphs, preferring <article> over div.content
    article_body = soup.find('article') or soup.find('div', class_='content')
    if article_body:
        stripped = (p.text.strip() for p in article_body.find_all('p'))
        article['content'] = [text for text in stripped if text]

    article['full_text'] = '\n\n'.join(article['content'])
    return article
def scrape_news_list(base_url):
    """Scrape a list of article links from a news page.

    Args:
        base_url: Address of the listing page; relative links are resolved
            against it.

    Returns:
        A list of absolute URLs without duplicates, in first-seen order.
    """
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(base_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # urljoin applies standard URL resolution: root-relative paths replace the
    # base path, '//host/...' inherits the scheme, absolute URLs pass through.
    # Plain string concatenation gets all of these wrong.
    resolved = (urljoin(base_url, link['href']) for link in soup.select('article a[href]'))

    # dict.fromkeys removes duplicates while keeping document order.
    return list(dict.fromkeys(resolved))
Handling Pagination
import requests
from bs4 import BeautifulSoup
import time
def scrape_all_pages(base_url, max_pages=10):
    """Scrape multiple pages with pagination.

    Pages are requested as ``{base_url}?page=N`` for N = 1..max_pages. The
    loop stops early on the first non-200 response or the first page without
    any '.item-card' element.

    Args:
        base_url: Listing URL without the page query parameter.
        max_pages: Upper bound on pages to request; values below 1 request
            nothing.

    Returns:
        A list of dicts with keys 'title', 'description' and 'url'.
    """
    all_items = []
    pages_scraped = 0  # counts only pages that actually yielded items
    headers = {'User-Agent': 'Mozilla/5.0'}

    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        print(f"Scraping page {page}...")

        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.select('.item-card')
        if not items:
            print(f"No items found on page {page}, stopping.")
            break

        for item in items:
            try:
                data = {
                    'title': item.select_one('.title').text.strip(),
                    'description': item.select_one('.description').text.strip(),
                    'url': item.select_one('a')['href'],
                }
            except (AttributeError, TypeError, KeyError) as e:
                # One card missing a sub-element must not discard everything
                # collected so far.
                print(f"Skipping malformed item on page {page}: {e}")
                continue
            all_items.append(data)

        pages_scraped += 1
        # Be polite — wait between requests
        time.sleep(2)

    # Report the separate counter: the loop variable is unbound when
    # max_pages < 1 and over-counts when the loop stopped on a bad page.
    print(f"Scraped {len(all_items)} items from {pages_scraped} pages")
    return all_items
Infinite Scroll Simulation
import requests
from bs4 import BeautifulSoup
import time
import json
def scrape_infinite_scroll(base_url, api_endpoint=None, max_pages=50):
    """Collect items from an infinite-scroll listing, page by page.

    When ``api_endpoint`` is given, each page is read from it as JSON (the
    'items' key); otherwise ``{base_url}?page=N`` is fetched and every '.item'
    element becomes ``{'title': <text>}``. Paging stops at the first empty
    page, at the first error (which is printed), or after ``max_pages`` pages.

    Returns:
        The list of all items collected before paging stopped.
    """
    collected = []
    request_headers = {'User-Agent': 'Mozilla/5.0', 'X-Requested-With': 'XMLHttpRequest'}

    for page in range(1, max_pages + 1):
        try:
            if not api_endpoint:
                # Fall back to HTML scraping
                resp = requests.get(f"{base_url}?page={page}", headers=request_headers, timeout=10)
                parsed = BeautifulSoup(resp.text, 'html.parser')
                batch = [{'title': node.text} for node in parsed.select('.item')]
            else:
                # Many infinite scroll sites use an API endpoint
                query = {'page': page, 'per_page': 20}
                resp = requests.get(api_endpoint, params=query, headers=request_headers, timeout=10)
                payload = resp.json()
                batch = payload.get('items', [])

            if not batch:
                break

            collected.extend(batch)
            time.sleep(1)
        except Exception as e:
            print(f"Error on page {page}: {e}")
            break

    return collected
Respecting robots.txt and Rate Limiting
from urllib.robotparser import RobotFileParser
import time
def can_scrape(url, user_agent='*'):
    """Report whether the site's robots.txt permits fetching ``url``.

    The robots.txt location is derived from the scheme and host of ``url``
    and downloaded on every call.

    Args:
        url: The page that is about to be scraped.
        user_agent: Agent name to evaluate the rules for; '*' means any agent.

    Returns:
        The result of ``RobotFileParser.can_fetch`` for that agent and URL.
    """
    from urllib.parse import urlparse

    parts = urlparse(url)
    # Passing the URL to the constructor is equivalent to calling set_url().
    parser = RobotFileParser(f"{parts.scheme}://{parts.netloc}/robots.txt")
    parser.read()
    return parser.can_fetch(user_agent, url)
# Usage: consult robots.txt before fetching; scrape_page stands for whatever
# fetch function the scraper uses.
if can_scrape('https://example.com/products'):
    data = scrape_page('https://example.com/products')
else:
    print("Scraping not allowed by robots.txt")
Rate Limiting
import time
from functools import wraps
class RateLimiter:
    """Simple rate limiter for web scraping.

    Enforces a minimum interval between calls. Use it directly via ``wait()``
    or as a decorator: ``@RateLimiter(requests_per_second=2)``.
    """

    def __init__(self, requests_per_second=1):
        """Create a limiter allowing at most ``requests_per_second`` calls.

        Raises:
            ValueError: If ``requests_per_second`` is not positive (zero
                would divide by zero, a negative rate would silently disable
                limiting).
        """
        if requests_per_second <= 0:
            raise ValueError("requests_per_second must be positive")
        self.min_interval = 1.0 / requests_per_second
        # -inf means "no request yet", so the first call never sleeps
        # regardless of the monotonic clock's starting value.
        self.last_request_time = float('-inf')

    def wait(self):
        """Sleep just long enough to keep ``min_interval`` between calls."""
        # The monotonic clock is immune to wall-clock adjustments (NTP, DST),
        # which could otherwise cause very long or skipped waits.
        remaining = self.min_interval - (time.monotonic() - self.last_request_time)
        if remaining > 0:
            time.sleep(remaining)
        self.last_request_time = time.monotonic()

    def __call__(self, func):
        """Decorate ``func`` so every call first waits for the rate limit."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            self.wait()
            return func(*args, **kwargs)
        return wrapper
# Usage: one shared limiter instance throttles every function it decorates
limiter = RateLimiter(requests_per_second=2)


@limiter
def scrape_page(url):
    """Fetch ``url`` after waiting for the shared rate limiter; returns the response."""
    return requests.get(url, timeout=10)
Handling JavaScript-Rendered Content
For pages that load content dynamically with JavaScript:
# Option 1: Use Selenium for full browser rendering
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def scrape_js_page(url):
    """Render ``url`` in headless Chrome and return the parsed page.

    Waits up to 10 seconds for an element matching '.dynamic-content' to be
    present, then parses the rendered HTML. The browser is always shut down,
    including when loading or waiting fails.

    Returns:
        A BeautifulSoup object built from the rendered page source.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # Run without visible browser
    browser = webdriver.Chrome(options=chrome_options)

    try:
        browser.get(url)

        # Wait for dynamic content to load
        content_ready = EC.presence_of_element_located((By.CSS_SELECTOR, '.dynamic-content'))
        WebDriverWait(browser, 10).until(content_ready)

        # Parse the HTML as rendered by the browser
        return BeautifulSoup(browser.page_source, 'html.parser')
    finally:
        browser.quit()
# Option 2: Find the underlying API (preferred, faster)
import requests
def find_api_endpoint(url):
    """Fetch JSON from the API endpoint assumed to back a paginated page.

    Use the Network tab in the browser's dev tools to find the API calls a
    page really makes; the URL rewrite below is only an example mapping.

    Args:
        url: Page URL containing a '/page/' segment.

    Returns:
        The decoded JSON body of the API response.
    """
    # Example: The page might load data from:
    api_url = url.replace('/page/', '/api/v1/items?page=')
    # Without a timeout requests can block indefinitely on a stalled server;
    # 10 seconds matches the other requests in this guide.
    response = requests.get(api_url, headers={'Accept': 'application/json'}, timeout=10)
    return response.json()
Real-World Examples
Example 1: Price Monitoring
import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime
class PriceMonitor:
    """Monitor product prices across multiple retailers.

    Every successful check is appended to ``self.prices`` as a dict with keys
    'name', 'price', 'url' and 'timestamp' (ISO-8601 string).
    """

    def __init__(self):
        self.prices = []
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def check_price(self, url, name):
        """Check current price of a product.

        Args:
            url: Product page to fetch.
            name: Label stored with the recorded price.

        Returns:
            The price as a float, or None when the page has no '.price-value'
            element or its text is not a number.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        price_element = soup.select_one('.price-value')
        if not price_element:
            return None

        # Trim whitespace before the currency sign: strip('$') alone leaves
        # text such as "\n $1,299.00 " unparseable.
        raw = price_element.text.strip().strip('$').replace(',', '')
        try:
            price = float(raw)
        except ValueError:
            # e.g. "Call for price": treat as "no price available"
            return None

        self.prices.append({
            'name': name,
            'price': price,
            'url': url,
            'timestamp': datetime.now().isoformat(),
        })
        return price

    def get_price_history(self, name):
        """Get price history for a product (all recorded entries with that name)."""
        return [p for p in self.prices if p['name'] == name]

    def find_best_deal(self, urls):
        """Find lowest price across multiple URLs.

        Args:
            urls: Mapping of product name to URL.

        Returns:
            ``{'price': <lowest price>, 'url': <its URL>}``. When no price
            could be read, 'price' is ``float('inf')`` and 'url' is None.
        """
        best_price = float('inf')
        best_url = None

        for name, url in urls.items():
            price = self.check_price(url, name)
            # Compare against None explicitly: a price of 0.0 is falsy but is
            # still a valid (and unbeatable) deal.
            if price is not None and price < best_price:
                best_price = price
                best_url = url

        return {'price': best_price, 'url': best_url}

    def save_history(self, filename):
        """Save price history to JSON file."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.prices, f, indent=2)
Example 2: Job Listing Scraper
import csv
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
def scrape_job_listings(search_query, location, max_pages=5):
    """Scrape job listings from a job board.

    Args:
        search_query: Free-text search terms.
        location: Location filter.
        max_pages: Upper bound on result pages to request.

    Returns:
        A list of dicts with keys 'title', 'company', 'location', 'salary'
        ('Not specified' when absent), 'posted', 'url' and 'scraped_at'.
        Listings missing a required element are skipped.
    """
    jobs = []
    headers = {'User-Agent': 'Mozilla/5.0'}
    search_url = "https://example-jobs.com/search"

    for page in range(1, max_pages + 1):
        # Let requests build the query string so spaces, '&' and non-ASCII
        # characters in the search terms are percent-encoded correctly.
        params = {'q': search_query, 'l': location, 'page': page}
        response = requests.get(search_url, params=params, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        listings = soup.select('.job-listing')
        if not listings:
            break

        for listing in listings:
            try:
                salary_el = listing.select_one('.salary')
                job = {
                    'title': listing.select_one('.job-title').text.strip(),
                    'company': listing.select_one('.company-name').text.strip(),
                    'location': listing.select_one('.job-location').text.strip(),
                    'salary': salary_el.text.strip() if salary_el else 'Not specified',
                    'posted': listing.select_one('.post-date').text.strip(),
                    'url': listing.select_one('a.job-link')['href'],
                    'scraped_at': datetime.now().isoformat(),
                }
            except (AttributeError, TypeError, KeyError):
                # TypeError: the link element is missing (None['href']);
                # KeyError: the link exists but has no href.
                continue
            jobs.append(job)

        time.sleep(2)

    return jobs
def save_jobs_to_csv(jobs, filename):
    """Write job dicts to ``filename`` as CSV with a header row.

    Column names come from the keys of the first job. Nothing is written
    (and no file is created) when ``jobs`` is empty.
    """
    if jobs:
        columns = jobs[0].keys()
        with open(filename, 'w', newline='', encoding='utf-8') as out:
            sheet = csv.DictWriter(out, fieldnames=columns)
            sheet.writeheader()
            sheet.writerows(jobs)
Example 3: News Aggregator
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
class NewsAggregator:
    """Aggregate news from multiple sources.

    ``self.sources`` maps a source name to the URL of its listing page.
    """

    def __init__(self):
        self.sources = {
            'source1': 'https://news1.com',
            'source2': 'https://news2.com',
        }
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def scrape_source(self, source_name, url, article_selector, title_selector):
        """Scrape articles from a single source.

        Args:
            source_name: Label stored in each article's 'source' field.
            url: Listing page to fetch.
            article_selector: CSS selector matching one element per article.
            title_selector: CSS selector for the title inside an article.

        Returns:
            A list of dicts with keys 'source', 'title', 'link' and
            'scraped_at'. Articles without a title or link are skipped.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        articles = []
        for item in soup.select(article_selector):
            try:
                article = {
                    'source': source_name,
                    'title': item.select_one(title_selector).text.strip(),
                    'link': item.select_one('a')['href'],
                    'scraped_at': datetime.now().isoformat(),
                }
            except (AttributeError, TypeError, KeyError):
                # KeyError: an <a> without href. Skipping it here keeps one
                # odd card from discarding the whole source.
                continue
            articles.append(article)

        return articles

    def aggregate_all(self):
        """Scrape all configured sources.

        A failure in one source is printed and does not stop the others.

        Returns:
            The combined list of articles from every source that succeeded.
        """
        all_articles = []

        for source_name, url in self.sources.items():
            try:
                articles = self.scrape_source(
                    source_name, url,
                    article_selector='.article-card',
                    title_selector='h2',
                )
                all_articles.extend(articles)
                print(f"Scraped {len(articles)} articles from {source_name}")
            except Exception as e:
                # Deliberate best-effort boundary: report and move on.
                print(f"Error scraping {source_name}: {e}")

        return all_articles

    def save_to_json(self, articles, filename):
        """Save aggregated articles to JSON (UTF-8, non-ASCII text kept readable)."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)
Common Mistakes
| Mistake | Problem | Solution |
|---|---|---|
| Ignoring robots.txt | Legal/ethical violations | Always check and respect robots.txt |
| No rate limiting | Overwhelms servers, gets IP banned | Add delays between requests |
| Hardcoded selectors | Breaks when site changes | Make selectors configurable |
| Not handling errors | Crashes on missing elements | Use try/except and check for None |
| Scraping behind login | May violate ToS | Use official APIs when available |
| Not saving progress | Loses work on crash | Save incrementally |
Best Practices
# 1. Always identify yourself: a descriptive User-Agent with a contact
#    address tells the site operator who is sending the requests.
headers = {
    'User-Agent': 'MyScraper/1.0 (contact@example.com)',
    'Accept': 'text/html,application/xhtml+xml'
}

# 2. Add delays between requests so the target server is not flooded
import time
time.sleep(2) # At minimum 1 second
# 3. Cache responses to avoid repeated requests
import hashlib
import os
def cached_request(url, cache_dir='.cache'):
    """Return the body of ``url``, reading it from an on-disk cache when possible.

    Cache files live in ``cache_dir`` and are named after the MD5 hex digest
    of the URL (used only as a filename, not for security).

    Args:
        url: Address to fetch.
        cache_dir: Directory holding cached bodies; created if missing.

    Returns:
        The response text, from the cache if present, otherwise freshly
        downloaded. Only successful responses are written to the cache.
    """
    os.makedirs(cache_dir, exist_ok=True)
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = os.path.join(cache_dir, cache_key)

    # Explicit UTF-8 and newline='' make the round trip lossless on every
    # platform; the default encoding cannot represent all page text on some
    # systems and newline translation would alter the cached body.
    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8', newline='') as f:
            return f.read()

    response = requests.get(url, headers=headers, timeout=10)
    # Do not pin an error page in the cache: a transient 5xx would otherwise
    # be served forever.
    if response.ok:
        with open(cache_file, 'w', encoding='utf-8', newline='') as f:
            f.write(response.text)
    return response.text
# 4. Use CSS selectors for resilience
# NOTE: `soup` stands for an already-parsed BeautifulSoup document.
soup.select('.product > .title') # More specific = more stable
# 5. Handle missing data gracefully
def safe_extract(soup, selector, default=''):
    """Return the stripped text of the first match for ``selector``.

    Falls back to ``default`` when ``select_one`` finds nothing.
    """
    match = soup.select_one(selector)
    if not match:
        return default
    return match.text.strip()
Key Takeaways
- Always use BeautifulSoup with requests — `html.parser` is built-in, `lxml` is faster
- CSS selectors (`soup.select()`) are more readable and powerful than `find_all()`
- Always add delays between requests and respect robots.txt
- Use `headers={'User-Agent': ...}` to identify your scraper
- Handle missing elements with try/except or the `default` pattern
- For JavaScript-rendered content, use Selenium or find the underlying API
- Store scraped data as JSON for easy processing or CSV for spreadsheet analysis