Python Web Scraping — BeautifulSoup & Beyond

Web scraping extracts data from websites. It's used for data collection, price monitoring, research, and building datasets. This guide covers ethical scraping practices and technical implementation.

Learning Objectives

Parse HTML with BeautifulSoup using find, find_all, and CSS selectors
Handle pagination, dynamic content, and different page structures
Respect robots.txt, rate limits, and website terms of service
Store scraped data efficiently as JSON or CSV
Handle common scraping challenges (JavaScript-rendered content, CAPTCHAs)

BeautifulSoup Basics

BeautifulSoup parses HTML and provides Pythonic ways to navigate the DOM:

from bs4 import BeautifulSoup
import requests

# Fetch and parse a webpage
response = requests.get('https://example.com', timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# Find the page title (guarded, so a page without a <title> tag does not crash)
title_tag = soup.find('title')
title = title_tag.text if title_tag else ''
print(f"Page title: {title}")

# Find all links
links = soup.find_all('a')
for link in links:
    href = link.get('href', '')  # .get() supplies '' when the attribute is absent
    text = link.text.strip()
    print(f"Link: {text} -> {href}")

# Find first heading
h1 = soup.find('h1')
if h1:
    print(f"Main heading: {h1.text}")

Parsing Different Sources

from bs4 import BeautifulSoup

# Parse from string
html_string = "<html><body><h1>Hello</h1></body></html>"
soup = BeautifulSoup(html_string, 'html.parser')

# Parse from file
with open('page.html', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Different parsers
# NOTE: `html` is a placeholder for markup you have already loaded; this
# snippet does not define it. Each line rebinds `soup`, so in real code you
# would pick exactly one of these parsers.
soup = BeautifulSoup(html, 'html.parser')      # Built-in (no extra deps)
soup = BeautifulSoup(html, 'lxml')             # Faster, needs lxml
soup = BeautifulSoup(html, 'html5lib')         # Most lenient, handles broken HTML

Finding Elements

BeautifulSoup offers multiple ways to find elements:

find() and find_all()

from bs4 import BeautifulSoup

# NOTE: `html` is a placeholder for markup loaded elsewhere; it is not defined here.
soup = BeautifulSoup(html, 'html.parser')

# Find first matching element
first_div = soup.find('div')
first_paragraph = soup.find('p', class_='intro')  # `class_` because `class` is a Python keyword
first_link = soup.find('a', href='/about')

# Find all matching elements
all_paragraphs = soup.find_all('p')
all_links = soup.find_all('a')
all_images = soup.find_all('img')

# Filter by attributes
links_with_class = soup.find_all('a', class_='nav-link')
divs_with_id = soup.find_all('div', id='content')
inputs_with_type = soup.find_all('input', type='text')

# Filter by text content
paragraphs_hello = soup.find_all('p', string='Hello')
# Or use a function
# The `text and ...` guard skips a falsy `text` before .lower() is called on it.
paragraphs_with_hello = soup.find_all('p', string=lambda text: text and 'hello' in text.lower())

CSS Selectors

from bs4 import BeautifulSoup

# NOTE: `html` is a placeholder for markup loaded elsewhere; it is not defined here.
soup = BeautifulSoup(html, 'html.parser')

# Select by tag
paragraphs = soup.select('p')

# Select by class
intro = soup.select('.intro')

# Select by ID
content = soup.select('#content')

# Combinators
# select() is used for "all matches"; select_one() for a single match.
div_paragraphs = soup.select('div > p')           # Direct children
all_paragraphs = soup.select('div p')             # All descendants
first_child = soup.select_one('ul > li:first-child')  # First list item

# Attribute selectors
links = soup.select('a[href^="https://"]')        # Links starting with https
images = soup.select('img[src$=".jpg"]')          # Images ending with .jpg
inputs = soup.select('input[type="text"]')        # Text inputs

# Complex selectors
product_cards = soup.select('div.product-card > h2.title')
nav_links = soup.select('nav ul li a.active')

Extracting Data

from bs4 import BeautifulSoup

# NOTE: `html` is a placeholder for markup loaded elsewhere; it is not defined here.
soup = BeautifulSoup(html, 'html.parser')

# Extract text
# These lines assume the tags exist; nothing here guards against a missing tag.
h1_text = soup.find('h1').text.strip()
link_text = soup.find('a').get_text(strip=True)

# Extract attributes
# Subscript access has no fallback; the .get(...) form further down supplies one.
link_href = soup.find('a')['href']
img_src = soup.find('img')['src']
data_value = soup.find('div')['data-value']

# Extract multiple attributes
img = soup.find('img')
src = img.get('src', '')
alt = img.get('alt', '')
width = img.get('width', '')

# Get all text from page
all_text = soup.get_text(separator='\n', strip=True)

# Get text with specific formatting
# Passing a list to find_all() matches any of the listed tag names.
for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
    print(f"{element.name}: {element.text.strip()}")

Structured Scraping Patterns

Scraping Product Listings

import requests
from bs4 import BeautifulSoup
import json

def scrape_products(url):
    """Scrape product information from an e-commerce listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the keys ``name``, ``price``, ``rating``,
        ``url``, ``image`` and ``in_stock``. A card that cannot be parsed
        is reported with ``print`` and skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    products = []
    for card in soup.select('.product-card'):
        # Look up the optional elements once instead of once per use.
        rating_el = card.select_one('.rating')
        image_el = card.select_one('img')
        try:
            # Strip whitespace first: '\n$19.99\n'.strip('$') would leave the
            # '$' in place and make float() fail.
            price_text = card.select_one('.price').text.strip().strip('$')
            product = {
                'name': card.select_one('.product-title').text.strip(),
                'price': float(price_text.replace(',', '')),
                'rating': float(rating_el.text) if rating_el else None,
                'url': card.select_one('a.product-link')['href'],
                # .get() so an <img> without a src attribute yields None.
                'image': image_el.get('src') if image_el else None,
                'in_stock': 'out-of-stock' not in card.get('class', [])
            }
        except (AttributeError, KeyError, ValueError, TypeError) as e:
            # AttributeError/TypeError: a required element is missing.
            # KeyError: the product link has no href. ValueError: bad number.
            print(f"Error parsing product: {e}")
            continue
        products.append(product)

    return products

Scraping News Articles

import requests
from bs4 import BeautifulSoup
from datetime import datetime

def scrape_article(url):
    """Scrape a news article page.

    Args:
        url: Address of the article to download.

    Returns:
        A dict with the keys ``title``, ``author``, ``date``, ``content``
        (list of non-empty paragraph strings), ``url`` and ``full_text``
        (the paragraphs joined by blank lines). Fields that are missing
        from the page are returned as empty strings.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Look each element up once, then read it defensively.
    heading = soup.find('h1')
    author_meta = soup.find('meta', attrs={'name': 'author'})
    time_tag = soup.find('time')

    article = {
        'title': heading.text.strip() if heading else '',
        # .get() instead of [...] so a tag without the attribute gives ''
        # rather than raising KeyError.
        'author': author_meta.get('content', '') if author_meta else '',
        'date': time_tag.get('datetime', '') if time_tag else '',
        'content': [],
        'url': url
    }

    # Extract article body paragraphs
    article_body = soup.find('article') or soup.find('div', class_='content')
    if article_body:
        stripped = (paragraph.text.strip() for paragraph in article_body.find_all('p'))
        article['content'] = [text for text in stripped if text]

    article['full_text'] = '\n\n'.join(article['content'])
    return article

def scrape_news_list(base_url):
    """Scrape a list of article links from a news page.

    Args:
        base_url: Address of the listing page; relative links are resolved
            against it.

    Returns:
        A list of unique absolute http(s) URLs, in the order they first
        appear on the page.
    """
    from urllib.parse import urljoin

    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(base_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []
    for link in soup.select('article a[href]'):
        # urljoin applies the standard resolution rules ('/x', '../x',
        # '//host/x'); plain string concatenation gets these wrong whenever
        # base_url contains a path.
        absolute = urljoin(base_url, link['href'])
        # Drop mailto:, javascript:, tel: and similar non-page links.
        if absolute.startswith(('http://', 'https://')):
            articles.append(absolute)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(articles))

Handling Pagination

import requests
from bs4 import BeautifulSoup
import time

def scrape_all_pages(base_url, max_pages=10):
    """Scrape multiple pages with pagination.

    Requests ``base_url`` with ``page=1, 2, ...`` and stops at the first
    non-200 response, the first page without ``.item-card`` elements, or
    after ``max_pages`` pages.

    Args:
        base_url: Listing URL; the page number is added as a query parameter.
        max_pages: Upper bound on the number of pages to request.

    Returns:
        A list of dicts with the keys ``title``, ``description`` and ``url``.
    """
    all_items = []
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Counted explicitly: the loop variable overstates the total after a
    # break and is undefined when max_pages is 0.
    pages_scraped = 0

    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")

        # Passing the page number through `params` keeps any query string
        # already present in base_url intact.
        response = requests.get(base_url, params={'page': page}, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.select('.item-card')

        if not items:
            print(f"No items found on page {page}, stopping.")
            break

        for item in items:
            try:
                data = {
                    'title': item.select_one('.title').text.strip(),
                    'description': item.select_one('.description').text.strip(),
                    'url': item.select_one('a')['href']
                }
            except (AttributeError, TypeError, KeyError) as e:
                # One malformed card should not discard everything scraped so far.
                print(f"Error parsing item on page {page}: {e}")
                continue
            all_items.append(data)

        pages_scraped += 1

        # Be polite — wait between requests
        time.sleep(2)

    print(f"Scraped {len(all_items)} items from {pages_scraped} pages")
    return all_items

Infinite Scroll Simulation

import requests
from bs4 import BeautifulSoup
import time
import json

def scrape_infinite_scroll(base_url, api_endpoint=None, max_pages=50):
    """Simulate infinite scroll by paging through the underlying data source.

    Args:
        base_url: Page URL used for the HTML fallback (``?page=N`` is appended).
        api_endpoint: Optional JSON endpoint. When given it is queried with
            ``page`` and ``per_page`` parameters and the ``items`` list of
            each response is collected.
        max_pages: Upper bound on the number of pages to request.

    Returns:
        The collected items. Collection stops at the first empty page or the
        first request/decoding error, which is reported with ``print``.
    """
    all_items = []
    # Loop-invariant, so built once.
    headers = {'User-Agent': 'Mozilla/5.0', 'X-Requested-With': 'XMLHttpRequest'}

    for page in range(1, max_pages + 1):
        try:
            if api_endpoint:
                # Many infinite scroll sites use an API endpoint
                params = {'page': page, 'per_page': 20}
                response = requests.get(api_endpoint, params=params, headers=headers, timeout=10)
                response.raise_for_status()
                data = response.json()
                # Tolerate payloads that are not a JSON object, or whose
                # 'items' entry is not a list.
                items = data.get('items', []) if isinstance(data, dict) else []
                if not isinstance(items, list):
                    items = []
            else:
                # Fall back to HTML scraping
                response = requests.get(f"{base_url}?page={page}", headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                items = [{'title': el.text} for el in soup.select('.item')]
        except (requests.RequestException, ValueError) as e:
            # RequestException: network or HTTP failure. ValueError: body is not JSON.
            print(f"Error on page {page}: {e}")
            break

        if not items:
            break

        all_items.extend(items)
        time.sleep(1)

    return all_items

Respecting robots.txt and Rate Limiting

from urllib.robotparser import RobotFileParser
import time

def can_scrape(url, user_agent='*'):
    """Report whether the site's robots.txt permits fetching ``url``.

    The robots.txt file is downloaded from the root of the URL's host and
    consulted for ``user_agent``.
    """
    from urllib.parse import urlparse

    target = urlparse(url)
    # Passing the URL to the constructor is equivalent to calling set_url().
    robots = RobotFileParser(f"{target.scheme}://{target.netloc}/robots.txt")
    robots.read()
    return robots.can_fetch(user_agent, url)

# Usage
# NOTE: scrape_page() is defined in the "Rate Limiting" example that follows.
if can_scrape('https://example.com/products'):
    data = scrape_page('https://example.com/products')
else:
    print("Scraping not allowed by robots.txt")

Rate Limiting

import time
from functools import wraps

class RateLimiter:
    """Simple rate limiter for web scraping.

    Enforces a minimum interval between calls. Use it by calling ``wait()``
    before each request, or as a decorator on the function that performs
    the request.
    """

    def __init__(self, requests_per_second=1):
        """Create a limiter allowing at most ``requests_per_second`` calls.

        Raises:
            ValueError: If ``requests_per_second`` is not positive.
        """
        if requests_per_second <= 0:
            raise ValueError("requests_per_second must be positive")
        self.min_interval = 1.0 / requests_per_second
        # -inf guarantees the first call never waits, whatever the clock's origin.
        self.last_request_time = float('-inf')

    def wait(self):
        """Sleep just long enough to honour the minimum interval."""
        # time.monotonic() cannot jump backwards when the system clock is
        # adjusted, unlike time.time().
        elapsed = time.monotonic() - self.last_request_time
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self.last_request_time = time.monotonic()

    def __call__(self, func):
        """Decorate ``func`` so every call first goes through ``wait()``."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            self.wait()
            return func(*args, **kwargs)
        return wrapper

# Usage
limiter = RateLimiter(requests_per_second=2)

# NOTE: this snippet's own import lines cover only `time` and `wraps`;
# `requests` must also be imported for scrape_page() to run.
@limiter
def scrape_page(url):
    """Fetch ``url`` through the shared rate limiter and return the response."""
    return requests.get(url, timeout=10)

Handling JavaScript-Rendered Content

For pages that load content dynamically with JavaScript:

# Option 1: Use Selenium for full browser rendering
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_js_page(url):
    """Render ``url`` in headless Chrome and return the parsed page.

    Waits up to 10 seconds for an element matching ``.dynamic-content``
    before reading the rendered HTML. The browser is always shut down,
    even when loading or waiting fails.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # No visible browser window
    browser = webdriver.Chrome(options=chrome_options)

    try:
        browser.get(url)

        # Block until the dynamically loaded content is in the DOM.
        content_present = EC.presence_of_element_located((By.CSS_SELECTOR, '.dynamic-content'))
        WebDriverWait(browser, 10).until(content_present)

        # Parse the HTML as rendered by the browser.
        return BeautifulSoup(browser.page_source, 'html.parser')
    finally:
        browser.quit()

# Option 2: Find the underlying API (preferred, faster)
import requests

def find_api_endpoint(url):
    """Fetch JSON from the API endpoint that backs a paginated page.

    Use the Network tab of the browser dev tools to discover which API calls
    the page makes. This example assumes ``/page/<n>`` is served by
    ``/api/v1/items?page=<n>``.

    Args:
        url: Page URL containing a ``/page/`` segment.

    Returns:
        The decoded JSON body of the API response.
    """
    # Example: The page might load data from:
    api_url = url.replace('/page/', '/api/v1/items?page=')
    # A timeout keeps an unresponsive server from hanging the scraper forever.
    response = requests.get(api_url, headers={'Accept': 'application/json'}, timeout=10)
    return response.json()

Real-World Examples

Example 1: Price Monitoring

import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime

class PriceMonitor:
    """Monitor product prices across multiple retailers.

    Every successful ``check_price`` call appends a record (name, price,
    url, ISO timestamp) to ``self.prices``.
    """

    def __init__(self):
        self.prices = []
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    @staticmethod
    def _parse_price(text):
        """Convert price text such as ``' $1,299.00 '`` to a float, or None."""
        # Whitespace must go first; otherwise strip('$') never reaches the sign.
        cleaned = text.strip().strip('$').replace(',', '')
        try:
            return float(cleaned)
        except ValueError:
            return None

    def check_price(self, url, name):
        """Check current price of a product.

        Returns:
            The price as a float, or None when the page has no
            ``.price-value`` element or its text is not a number.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        price_element = soup.select_one('.price-value')
        if price_element is None:
            return None

        price = self._parse_price(price_element.text)
        if price is None:
            return None

        self.prices.append({
            'name': name,
            'price': price,
            'url': url,
            'timestamp': datetime.now().isoformat()
        })
        return price

    def get_price_history(self, name):
        """Get price history for a product."""
        return [p for p in self.prices if p['name'] == name]

    def find_best_deal(self, urls):
        """Find lowest price across multiple URLs.

        Args:
            urls: Mapping of product/retailer name to URL.

        Returns:
            ``{'price': lowest_price, 'url': its_url}``. When no price could
            be read, ``price`` is ``float('inf')`` and ``url`` is None.
        """
        best_price = float('inf')
        best_url = None

        for name, url in urls.items():
            price = self.check_price(url, name)
            # Compare against None explicitly: a price of 0 is a valid
            # (and unbeatable) deal, but it is falsy.
            if price is not None and price < best_price:
                best_price = price
                best_url = url

        return {'price': best_price, 'url': best_url}

    def save_history(self, filename):
        """Save price history to JSON file."""
        # Explicit encoding so the file does not depend on the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.prices, f, indent=2)

Example 2: Job Listing Scraper

import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

def scrape_job_listings(search_query, location, max_pages=5):
    """Scrape job listings from a job board.

    Args:
        search_query: Free-text search terms.
        location: Location filter.
        max_pages: Upper bound on the number of result pages to request.

    Returns:
        A list of dicts with the keys ``title``, ``company``, ``location``,
        ``salary``, ``posted``, ``url`` and ``scraped_at``. Listings missing
        a required element are skipped.
    """
    # Imported locally because this example's import lines do not include it.
    import time

    jobs = []
    headers = {'User-Agent': 'Mozilla/5.0'}

    for page in range(1, max_pages + 1):
        # `params` URL-encodes the values, so spaces, '&' and non-ASCII
        # characters in the query or location cannot corrupt the URL.
        response = requests.get(
            'https://example-jobs.com/search',
            params={'q': search_query, 'l': location, 'page': page},
            headers=headers,
            timeout=10,
        )
        soup = BeautifulSoup(response.text, 'html.parser')

        listings = soup.select('.job-listing')
        if not listings:
            break

        for listing in listings:
            salary_el = listing.select_one('.salary')
            try:
                job = {
                    'title': listing.select_one('.job-title').text.strip(),
                    'company': listing.select_one('.company-name').text.strip(),
                    'location': listing.select_one('.job-location').text.strip(),
                    'salary': salary_el.text.strip() if salary_el else 'Not specified',
                    'posted': listing.select_one('.post-date').text.strip(),
                    'url': listing.select_one('a.job-link')['href'],
                    'scraped_at': datetime.now().isoformat()
                }
            except (AttributeError, TypeError, KeyError):
                # AttributeError: a text element is missing. TypeError: the
                # job link is missing. KeyError: the link has no href.
                continue
            jobs.append(job)

        time.sleep(2)

    return jobs

def save_jobs_to_csv(jobs, filename):
    """Write the scraped jobs to ``filename`` as CSV.

    The column order comes from the first job's keys. Nothing is written
    (and no file is created) when ``jobs`` is empty.
    """
    if len(jobs) == 0:
        return

    columns = list(jobs[0])
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.DictWriter(out_file, fieldnames=columns)
        csv_out.writeheader()
        for job in jobs:
            csv_out.writerow(job)

Example 3: News Aggregator

import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime

class NewsAggregator:
    """Aggregate news from multiple sources.

    ``self.sources`` maps a source name to the URL of its listing page.
    """

    def __init__(self):
        self.sources = {
            'source1': 'https://news1.com',
            'source2': 'https://news2.com',
        }
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def scrape_source(self, source_name, url, article_selector, title_selector):
        """Scrape articles from a single source.

        Args:
            source_name: Label stored in each article's ``source`` field.
            url: Listing page to download.
            article_selector: CSS selector matching one element per article.
            title_selector: CSS selector, relative to the article element,
                matching its title.

        Returns:
            A list of dicts with the keys ``source``, ``title``, ``link`` and
            ``scraped_at``. Items missing a title or link are skipped.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        articles = []
        for item in soup.select(article_selector):
            try:
                article = {
                    'source': source_name,
                    'title': item.select_one(title_selector).text.strip(),
                    'link': item.select_one('a')['href'],
                    'scraped_at': datetime.now().isoformat()
                }
            except (AttributeError, TypeError, KeyError):
                # KeyError covers an <a> without href; without it one bad
                # card would abort the whole source.
                continue
            articles.append(article)

        return articles

    def aggregate_all(self):
        """Scrape all configured sources.

        A failure in one source is reported with ``print`` and does not stop
        the remaining sources.
        """
        all_articles = []

        for source_name, url in self.sources.items():
            try:
                articles = self.scrape_source(
                    source_name, url,
                    article_selector='.article-card',
                    title_selector='h2'
                )
            except Exception as e:
                # Deliberately broad: this is the per-source isolation boundary.
                print(f"Error scraping {source_name}: {e}")
            else:
                all_articles.extend(articles)
                print(f"Scraped {len(articles)} articles from {source_name}")

        return all_articles

    def save_to_json(self, articles, filename):
        """Save aggregated articles to JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)

Common Mistakes

Mistake	Problem	Solution
Ignoring robots.txt	Legal/ethical violations	Always check and respect robots.txt
No rate limiting	Overwhelms servers, gets IP banned	Add delays between requests
Hardcoded selectors	Breaks when site changes	Make selectors configurable
Not handling errors	Crashes on missing elements	Use try/except and check for None
Scraping behind login	May violate ToS	Use official APIs when available
Not saving progress	Loses work on crash	Save incrementally

Best Practices

# 1. Always identify yourself
# This module-level `headers` dict is what cached_request() below sends with
# its request.
headers = {
    'User-Agent': 'MyScraper/1.0 (contact@example.com)',
    'Accept': 'text/html,application/xhtml+xml'
}

# 2. Add delays between requests
import time
time.sleep(2)  # At minimum 1 second

# 3. Cache responses to avoid repeated requests
import hashlib
import os

def cached_request(url, cache_dir='.cache'):
    """Return the body of ``url``, reusing an on-disk copy when one exists.

    Args:
        url: Address to fetch.
        cache_dir: Directory holding one file per cached URL; created on
            demand.

    Returns:
        The response text. Only successful responses are written to the
        cache, so a temporary server error is not replayed forever.
    """
    os.makedirs(cache_dir, exist_ok=True)
    # MD5 is used only to derive a filesystem-safe name, not for security.
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = os.path.join(cache_dir, cache_key)

    # Explicit UTF-8 on both read and write so cached pages round-trip on
    # every platform, whatever its default encoding.
    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as f:
            return f.read()

    # `headers` is the module-level dict defined earlier in this example.
    response = requests.get(url, headers=headers, timeout=10)
    if response.ok:
        with open(cache_file, 'w', encoding='utf-8') as f:
            f.write(response.text)
    return response.text

# 4. Use CSS selectors for resilience
soup.select('.product > .title')  # More specific = more stable

# 5. Handle missing data gracefully
def safe_extract(soup, selector, default=''):
    """Return the stripped text of the first match for ``selector``.

    Falls back to ``default`` when nothing matches.
    """
    match = soup.select_one(selector)
    if not match:
        return default
    return match.text.strip()

Key Takeaways

Use BeautifulSoup together with requests — html.parser is built in; lxml is faster but must be installed separately
CSS selectors (soup.select()) are more readable and powerful than find_all()
Always add delays between requests and respect robots.txt
Use headers={'User-Agent': ...} to identify your scraper
Handle missing elements with try/except or the default pattern
For JavaScript-rendered content, use Selenium or find the underlying API
Store scraped data as JSON for easy processing or CSV for spreadsheet analysis

Python Web Scraping — BeautifulSoup & Beyond

Python Web Scraping — BeautifulSoup & Beyond

Learning Objectives

BeautifulSoup Basics

Parsing Different Sources

Finding Elements

find() and find_all()

CSS Selectors

Extracting Data

Structured Scraping Patterns

Scraping Product Listings

Scraping News Articles

Handling Pagination

Infinite Scroll Simulation

Respecting robots.txt and Rate Limiting

Rate Limiting

Handling JavaScript-Rendered Content

Real-World Examples

Example 1: Price Monitoring

Example 2: Job Listing Scraper

Example 3: News Aggregator

Common Mistakes

Best Practices

Key Takeaways

Premium Content

Need Expert Python Help?