CW

String and Text Processing

Module 1: Introduction & Python Basics · Free Lesson

Advertisement

String and Text Processing

Text preprocessing is a critical step in NLP and data science pipelines. Master these operations for clean, analyzable text data.

String Fundamentals

# Indexing, slicing and length of a str
s = "Hello, Data Science!"
s[0]            # 'H'
s[-1]           # '!'
s[0:5]          # 'Hello'
len(s)          # 20

# Strings are immutable
s[0] = 'h'      # TypeError: 'str' object does not support item assignment
s = 'h' + s[1:]  # 'hello, Data Science!' (build a new string instead)

Essential String Methods

# Note the two leading and two trailing spaces: they affect several results below.
text = "  Hello, World!  123  "

# Case operations
text.upper()           # "  HELLO, WORLD!  123  "
text.lower()           # "  hello, world!  123  "
text.title()           # "  Hello, World!  123  "
text.capitalize()      # "  hello, world!  123  " (first char is a space, so no letter is uppercased)
text.swapcase()        # "  hELLO, wORLD!  123  "

# Whitespace
text.strip()           # "Hello, World!  123"
text.lstrip()          # "Hello, World!  123  "
text.rstrip()          # "  Hello, World!  123"
text.replace("  ", " ")  # " Hello, World! 123 "

# Search and check
text.find("World")     # 9 (index, -1 if not found)
text.count("l")        # 3
text.startswith("  H") # True
text.endswith("123")   # False (text ends with two spaces; text.rstrip().endswith("123") is True)
text.isalpha()         # False
text.isdigit()         # False
text.isalnum()         # False

F-Strings and Formatting

name = "Alice"
score = 95.6789
items = ["ML", "Stats", "DL"]

# f-strings (Python 3.6+)
f"Hello, {name}!"                    # "Hello, Alice!"
f"Score: {score:.2f}"                # "Score: 95.68"
f"Result: {100 * 3.14:.1f}"         # "Result: 314.0"
# Alignment: each result below is padded with spaces to a total width of 20
f"{'centered':^20}"                  # "      centered      "
f"{'left-aligned':<20}"              # "left-aligned        "
f"{'right-aligned':>20}"             # "       right-aligned"
f"Binary: {42:b}, Hex: {42:x}"       # "Binary: 101010, Hex: 2a"

# Debugging (Python 3.8+)
x = 42
f"{x = }"                           # "x = 42"
f"{x + 1 = }"                       # "x + 1 = 43"

String Splitting and Joining

# Splitting
csv_line = "Alice,30,NYC,Data Scientist"
parts = csv_line.split(",")  # ["Alice", "30", "NYC", "Data Scientist"]

text = "Hello   World   Python"
# No argument: a run of whitespace counts as a single separator.
words = text.split()        # ["Hello", "World", "Python"] (splits on any whitespace)
# Explicit " ": every single space is a separator, so runs of spaces yield empty strings.
words = text.split(" ")     # ["Hello", "", "", "World", "", "", "Python"]

# Joining
", ".join(["Alice", "Bob", "Charlie"])  # "Alice, Bob, Charlie"
"".join(["H", "e", "l", "l", "o"])      # "Hello"

Regular Expressions (Regex)

Character Classes
  .       Any character except \n
  \d      Digit [0-9]
  \w      Word char [a-zA-Z0-9_]
  \s      Whitespace [ \t\n\r]
  [abc]   a, b, or c
  [^abc]  Not a, b, or c

Quantifiers
  *       Zero or more
  +       One or more
  ?       Zero or one
  {n}     Exactly n times
  {n,m}   Between n and m times
  *?, +?  Non-greedy

Anchors and Groups
  ^       Start of string
  $       End of string
  \b      Word boundary
  ( )     Capture group
  (?: )   Non-capturing group
  |       Alternation (OR)

Common Patterns for Data Science
  Email: r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
  Phone: r"\+?1?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}"
  URL:   r"https?://(?:www\.)?[\w-]+(?:\.[\w-]+)+[/\w.-]*"
  Date:  r"\d{4}[-/]\d{2}[-/]\d{2}" or r"\d{1,2}[-/]\d{1,2}[-/]\d{4}"
  IP:    r"\b(?:\d{1,3}\.){3}\d{1,3}\b"

Regex in Python

import re

text = "Contact us at support@example.com or sales@company.org"

# re.search - find first match
match = re.search(r"[\w.+-]+@[\w-]+\.[\w.]+", text)
match.group()  # "support@example.com"

# re.findall - find all matches
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.]+", text)
# ["support@example.com", "sales@company.org"]

# re.sub - substitute
cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # removes special chars
# "Contact us at supportexamplecom or salescompanyorg"

# re.split
parts = re.split(r"[,.\s]+", "Hello, world. How are you?")
# ["Hello", "world", "How", "are", "you?"]
# ("?" is not in the delimiter class, so it stays attached to the last word;
#  with r"[,.?\s]+" the result would end with "you", "" instead)

# Named groups
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
match = re.search(pattern, "2024-03-15")
match.group("year")   # "2024"
match.group("month")  # "03"
match.group("day")    # "15"

Text Preprocessing Pipeline

Raw Text → Lowercase → Remove Punct → Tokenize → Remove Stop → Stem/Lemma → Clean
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    """Normalize raw text and return its lemmatized, stopword-free tokens.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip ASCII
    punctuation, strip digits, collapse whitespace, tokenize, drop English
    stopwords, lemmatize.

    Args:
        text: Raw input string; may contain HTML markup and URLs.

    Returns:
        List of lemmatized tokens that survived stopword removal.

    NOTE(review): presumably the NLTK tokenizer, stopword and WordNet
    resources must be downloaded before the first call -- confirm in setup.
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Remove HTML tags.
    #    This runs BEFORE URL removal: the URL pattern ends in \S+, so on
    #    markup such as <a href="https://x.com">link</a> it would otherwise
    #    consume the rest of the tag together with the visible link text.
    #    Tags are replaced by a space (collapsed in step 6) so that words in
    #    adjacent elements, e.g. "<p>foo</p><p>bar</p>", do not fuse.
    text = re.sub(r"<.*?>", " ", text)

    # 3. Remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", "", text)

    # 4. Remove punctuation (string.punctuation covers ASCII punctuation only)
    text = text.translate(str.maketrans("", "", string.punctuation))

    # 5. Remove digits
    text = re.sub(r"\d+", "", text)

    # 6. Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # 7. Tokenize
    tokens = word_tokenize(text)

    # 8. Remove stopwords (a set gives O(1) membership tests)
    stop_words = set(stopwords.words("english"))
    tokens = [t for t in tokens if t not in stop_words]

    # 9. Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return tokens

# Example
raw = "<p>Visit https://example.com! We have 3 amazing deals!!!</p>"
print(preprocess_text(raw))
# ['visit', 'amazing', 'deal']

Common Data Science Text Patterns

# Extract features from text
def extract_features(text):
    """Compute simple surface statistics for a piece of text.

    Args:
        text: Input string (may be empty).

    Returns:
        dict with the keys:
            word_count: number of whitespace-separated words.
            char_count: total number of characters.
            avg_word_length: mean word length (0 when there are no words).
            uppercase_ratio: uppercase characters / all characters
                (0 for empty text).
            digit_count: number of digit characters.
            special_char_count: characters that are neither alphanumeric
                nor whitespace.
            sentence_count: number of non-blank segments between runs of
                ".", "!" or "?".
    """
    words = text.split()  # split once and reuse for both word statistics
    char_count = len(text)

    # Count the non-blank segments instead of the terminator runs, so a final
    # sentence without closing punctuation is still counted and
    # punctuation-only input counts as zero sentences.
    sentences = [seg for seg in re.split(r"[.!?]+", text) if seg.strip()]

    return {
        "word_count": len(words),
        "char_count": char_count,
        # max(..., 1) guards against division by zero on empty input
        "avg_word_length": sum(len(w) for w in words) / max(len(words), 1),
        "uppercase_ratio": sum(1 for c in text if c.isupper()) / max(char_count, 1),
        "digit_count": sum(c.isdigit() for c in text),
        "special_char_count": sum(not c.isalnum() and not c.isspace() for c in text),
        "sentence_count": len(sentences),
    }

Unicode and Encoding

# Unicode normalization
import unicodedata
text = "café"  # may contain combining characters
normalized = unicodedata.normalize("NFKD", text)  # decomposes accented chars

# Encoding for file I/O
# The encoding is passed explicitly instead of relying on the platform default.
with open("data.txt", "r", encoding="utf-8") as f:
    content = f.read()

# Detect encoding
# pip install chardet
import chardet
# Opened in binary mode: chardet.detect is given the undecoded bytes.
with open("mystery.txt", "rb") as f:
    raw = f.read()
    # NOTE(review): presumably "encoding" can be None when detection fails --
    # confirm before passing it on to open().
    encoding = chardet.detect(raw)["encoding"]

Summary

  • Use f-strings for readable string formatting
  • Master regex for pattern matching and text extraction
  • Apply the preprocessing pipeline systematically for NLP tasks
  • Choose lemmatization over stemming for better interpretability
  • Always handle encoding properly for international text

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement