String and Text Processing
Text preprocessing is a critical step in NLP and data science pipelines. Master these operations for clean, analyzable text data.
String Fundamentals
# Indexing, slicing, and length of a str
s = "Hello, Data Science!"
s[0] # 'H'
s[-1] # '!'
s[0:5] # 'Hello'
len(s) # 20
# Strings are immutable: item assignment raises, so build a new string instead
s[0] = 'h' # TypeError
s = 'h' + s[1:] # 'hello, Data Science!'
Essential String Methods
# Sample with one leading and one trailing space; every method below returns a new str
text = " Hello, World! 123 "
# Case operations
text.upper() # " HELLO, WORLD! 123 "
text.lower() # " hello, world! 123 "
text.title() # " Hello, World! 123 "
text.capitalize() # " hello, world! 123 " (first character is the space, the rest is lowercased)
text.swapcase() # " hELLO, wORLD! 123 "
# Whitespace
text.strip() # "Hello, World! 123"
text.lstrip() # "Hello, World! 123 "
text.rstrip() # " Hello, World! 123"
# NOTE(review): as written this replaces a space with a space (no-op);
# presumably meant to collapse double spaces - confirm against the original text
text.replace(" ", " ") # " Hello, World! 123 "
# Search and check
text.find("World") # 8 (index, -1 if not found)
text.count("l") # 3
text.startswith(" H") # True
text.endswith("123") # False (trailing space; text.rstrip().endswith("123") is True)
text.isalpha() # False
text.isdigit() # False
text.isalnum() # False
F-Strings and Formatting
name = "Alice"
score = 95.6789
items = ["ML", "Stats", "DL"]
# f-strings (Python 3.6+)
f"Hello, {name}!" # "Hello, Alice!"
f"Score: {score:.2f}" # "Score: 95.68"
f"Result: {100 * 3.14:.1f}" # "Result: 314.0"
# Alignment: the result is padded with spaces to a total width of 20
f"{'centered':^20}" # "      centered      "
f"{'left-aligned':<20}" # "left-aligned        "
f"{'right-aligned':>20}" # "       right-aligned"
f"Binary: {42:b}, Hex: {42:x}" # "Binary: 101010, Hex: 2a"
# Debugging (Python 3.8+): "=" echoes the expression text followed by its value
x = 42
f"{x = }" # "x = 42"
f"{x + 1 = }" # "x + 1 = 43"
String Splitting and Joining
# Splitting
csv_line = "Alice,30,NYC,Data Scientist"
parts = csv_line.split(",")  # ["Alice", "30", "NYC", "Data Scientist"]
# Three spaces between words, to show how the two split() forms differ
text = "Hello   World   Python"
# No argument: splits on runs of any whitespace and drops empty strings
words = text.split()  # ["Hello", "World", "Python"]
# Explicit " ": every single space is a separator, so runs yield empty strings
words = text.split(" ")  # ["Hello", "", "", "World", "", "", "Python"]
# Joining
", ".join(["Alice", "Bob", "Charlie"])  # "Alice, Bob, Charlie"
"".join(["H", "e", "l", "l", "o"])  # "Hello"
Regular Expressions (Regex)
Regex in Python
import re
text = "Contact us at support@example.com or sales@company.org"
# re.search - find first match (returns a match object; None when nothing matches)
match = re.search(r"[\w.+-]+@[\w-]+\.[\w.]+", text)
match.group() # "support@example.com"
# re.findall - find all matches
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.]+", text)
# ["support@example.com", "sales@company.org"]
# re.sub - substitute
cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", text) # removes special chars
# re.split - split on runs of commas, periods, or whitespace
parts = re.split(r"[,.\s]+", "Hello, world. How are you?")
# ["Hello", "world", "How", "are", "you?"] ("?" is not in the separator class, so it stays attached)
# Named groups
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
match = re.search(pattern, "2024-03-15")
match.group("year") # "2024"
match.group("month") # "03"
match.group("day") # "15"
Text Preprocessing Pipeline
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
def preprocess_text(text):
    """Clean raw text and return a list of lemmatized, stopword-free tokens.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip
    punctuation, strip digits, collapse whitespace, tokenize, drop English
    stopwords, lemmatize.

    Args:
        text: Raw input string, possibly containing markup and URLs.

    Returns:
        A list of lowercase token strings.

    NOTE(review): presumably the NLTK tokenizer, stopword, and WordNet
    resources must already be downloaded - confirm in the target environment.
    """
    # 1. Lowercase
    text = text.lower()
    # 2. Remove HTML tags. This runs BEFORE URL removal: the URL pattern's
    #    greedy \S+ would otherwise swallow markup glued to a URL, e.g. in
    #    '<a href="https://x.com">click</a>' it would eat '">click</a>' and
    #    the visible link text would be lost.
    text = re.sub(r"<.*?>", "", text)
    # 3. Remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # 4. Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # 5. Remove digits
    text = re.sub(r"\d+", "", text)
    # 6. Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    # 7. Tokenize
    tokens = word_tokenize(text)
    # 8. Remove stopwords (a set gives O(1) membership tests)
    stop_words = set(stopwords.words("english"))
    tokens = [t for t in tokens if t not in stop_words]
    # 9. Lemmatize (a stemmer could be substituted here)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens
# Example: markup, the URL, punctuation, the digit, and the stopwords "we"/"have" are removed
raw = "<p>Visit https://example.com! We have 3 amazing deals!!!</p>"
print(preprocess_text(raw))
# ['visit', 'amazing', 'deal']
Common Data Science Text Patterns
# Extract features from text
def extract_features(text):
    """Compute simple surface-level statistics for a piece of text.

    Args:
        text: The input string (may be empty).

    Returns:
        A dict with the keys ``word_count``, ``char_count``,
        ``avg_word_length``, ``uppercase_ratio``, ``digit_count``,
        ``special_char_count`` and ``sentence_count``.
    """
    # Split once and reuse; whitespace-delimited chunks count as words.
    words = text.split()
    n_words = len(words)
    n_chars = len(text)
    # Count non-empty segments between runs of . ! ? so that a final sentence
    # without a terminator is still counted and punctuation-only input gives 0.
    # Any "." splits, so decimals and abbreviations still inflate the count.
    sentences = [seg for seg in re.split(r"[.!?]+", text) if seg.strip()]
    return {
        "word_count": n_words,
        "char_count": n_chars,
        # max(..., 1) guards against division by zero on empty input
        "avg_word_length": sum(len(w) for w in words) / max(n_words, 1),
        "uppercase_ratio": sum(1 for c in text if c.isupper()) / max(n_chars, 1),
        "digit_count": sum(c.isdigit() for c in text),
        "special_char_count": sum(not c.isalnum() and not c.isspace() for c in text),
        "sentence_count": len(sentences),
    }
Unicode and Encoding
# Unicode normalization
import unicodedata
text = "café" # may contain combining characters
normalized = unicodedata.normalize("NFKD", text) # decomposes accented chars
# Encoding for file I/O: pass the encoding explicitly instead of relying on the platform default
with open("data.txt", "r", encoding="utf-8") as f:
    content = f.read()
# Detect encoding
# pip install chardet
import chardet
# Read raw bytes ("rb") so that no decoding happens before detection
with open("mystery.txt", "rb") as f:
    raw = f.read()
# NOTE(review): detection is a guess - confirm `encoding` is usable (e.g. not None) before decoding with it
encoding = chardet.detect(raw)["encoding"]
Summary
- Use f-strings for readable string formatting
- Master regex for pattern matching and text extraction
- Apply the preprocessing pipeline systematically for NLP tasks
- Choose lemmatization over stemming for better interpretability: lemmas are real dictionary words, whereas stems can be truncated fragments
- Always handle encoding properly for international text