Introduction to Python Data Types
Python provides several built-in data types that are essential for data science work. Understanding these types is crucial for effective data manipulation and analysis.
Core Data Types in Python
| Type | Description | Example |
|---|---|---|
| int | Integer numbers | 42, -17, 1000 |
| float | Floating-point (decimal) numbers | 3.14, -0.5, 2.718 |
| str | Text/strings | "hello", 'data science' |
| bool | Boolean values | True, False |
| list | Ordered, mutable | [1, 2, 3] |
| tuple | Ordered, immutable | (1, 2, 3) |
| dict | Key-value pairs | {'name': 'Alice'} |
| set | Unordered, unique | {1, 2, 3} |
Numeric Types for Data Science
# Integer values (Python ints have arbitrary precision)
age = 25
year = 2024
count = 1000
# Float values (binary floating point, so most decimal fractions are approximations)
price = 99.99
temperature = 36.6
pi = 3.141592653589793
# Type conversion for data science
int_to_float = float(42) # 42.0
# int() drops the fractional part (truncates toward zero); it does not round
float_to_int = int(3.99) # 3 (truncates)
str_to_int = int("100") # 100
str_to_float = float("3.14") # 3.14
# Precision handling (important for financial data)
from decimal import Decimal
# Rebinds `price` (a float above) to a Decimal built from a string, so the
# value is exactly 19.99 rather than the nearest binary float
price = Decimal('19.99')
# Decimal * int stays a Decimal and the arithmetic is exact
total = price * 3 # Decimal('59.97')
String Operations for Data Science
# String creation and basic case conversion (str methods return new strings)
text = "Data Science"
text_upper = text.upper()  # "DATA SCIENCE"
text_lower = text.lower()  # "data science"

# String slicing (essential for text processing)
sample = "Machine Learning"
print(sample[0:7])  # "Machine"
print(sample[-8:])  # "Learning"
print(sample[::2])  # "McieLann" (every second character, starting at index 0)

# String methods for data cleaning
dirty_text = " Hello, World! "
clean = dirty_text.strip()  # "Hello, World!"
lower = clean.lower()  # "hello, world!"
split = clean.split(',')  # ["Hello", " World!"] (the space after the comma is kept)

# String formatting for data science
name = "Alice"
score = 95.5
# Bind the result to a name: a bare f-string expression is evaluated and
# then discarded, so it would have no visible effect in a script.
formatted = f"Student: {name}, Score: {score:.1f}"  # "Student: Alice, Score: 95.5"
List Operations for Data Analysis
# Creating lists for data storage
data = [23, 45, 67, 89, 12, 34, 56, 78, 90, 11]
# List indexing and slicing (each slice is a new list, taken here before
# `data` is mutated below)
first_three = data[0:3] # [23, 45, 67]
last_three = data[-3:] # [78, 90, 11]
every_other = data[::2] # [23, 67, 12, 56, 90]
# List methods for data manipulation (every call below mutates `data` in place)
data.append(100) # Add element at the end
data.extend([101, 102]) # Add multiple elements at the end
data.insert(0, 0) # Insert the value 0 at index 0
data.remove(23) # Remove by value (first occurrence only)
popped = data.pop() # Remove and return last -> 102
# data is now [0, 45, 67, 89, 12, 34, 56, 78, 90, 11, 100, 101]
# List comprehension for data transformation
squares = [x**2 for x in range(10)] # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
evens = [x for x in data if x % 2 == 0] # Filter even numbers -> [0, 12, 34, 56, 78, 90, 100]
Dictionary Operations for Feature Storage
# Dictionary for structured data storage (values may be scalars, lists or
# nested dictionaries)
student = {
    'name': 'Alice',
    'age': 25,
    'gpa': 3.8,
    'courses': ['Math', 'Physics', 'CS'],
    'grades': {'Math': 'A', 'Physics': 'B', 'CS': 'A'},
}

# Accessing and modifying
print(student['name'])  # "Alice"
# .get() returns the supplied default instead of raising KeyError when the
# key is missing ('major' has not been added yet at this point).
print(student.get('major', 'N/A'))  # "N/A" (default)
student['age'] = 26  # Update value
student['major'] = 'Data Science'  # Add new key

# Dictionary iteration (.items() yields (key, value) pairs)
for key, value in student.items():
    print(f"{key}: {value}")

# Dictionary comprehension for feature engineering
features = ['age', 'salary', 'experience']
scaled = {f: 0 for f in features}  # {'age': 0, 'salary': 0, 'experience': 0}
Tuple for Fixed Data Structures
# Tuples for fixed data (coordinates, RGB values, etc.)
point = (10, 20)
rgb_color = (255, 128, 0)
date = (2024, 1, 15)
# Tuple unpacking (the number of names on the left must equal the tuple length)
x, y = point
r, g, b = rgb_color
year, month, day = date
# Named tuples for clarity (useful for data science)
from collections import namedtuple
# namedtuple() creates a tuple subclass whose fields can be read by name
DataPoint = namedtuple('DataPoint', ['x', 'y', 'label'])
point1 = DataPoint(x=10, y=20, label='A')
# print() separates its arguments with a space and shows the string without quotes
print(point1.x, point1.label) # prints: 10 A
# Multiple return values from functions
def calculate_stats(numbers):
    """Return the minimum, maximum and arithmetic mean of *numbers*.

    Args:
        numbers: A non-empty iterable of numbers. One-shot iterators and
            generators are accepted because the values are materialized once.

    Returns:
        A ``(minimum, maximum, mean)`` tuple.

    Raises:
        ValueError: If *numbers* is empty.
    """
    # Materialize once: three separate passes (min, max, sum) would exhaust
    # an iterator on the first pass, and len() needs a sized container.
    values = list(numbers)
    if not values:
        raise ValueError("calculate_stats() requires at least one number")
    return min(values), max(values), sum(values) / len(values)
Set Operations for Unique Values
# Sets for unique value handling
data_with_duplicates = [1, 2, 2, 3, 3, 3, 4, 4, 5]
# set() keeps one copy of each value; sets are unordered, so the element
# order shown in these comments is illustrative only
unique_values = set(data_with_duplicates) # {1, 2, 3, 4, 5}
# Set operations (each operator returns a new set; set1 and set2 are unchanged)
set1 = {1, 2, 3, 4}
set2 = {3, 4, 5, 6}
union = set1 | set2 # {1, 2, 3, 4, 5, 6} - elements in either set
intersection = set1 & set2 # {3, 4} - elements in both sets
difference = set1 - set2 # {1, 2} - elements in set1 but not in set2
sym_diff = set1 ^ set2 # {1, 2, 5, 6} - elements in exactly one of the sets
# Set comprehension
squares_set = {x**2 for x in range(10)} # {0, 1, 4, 9, 16, 25, 36, 49, 64, 81}
Type Checking and Conversion
# Type checking (important for data validation)
value = 42
print(type(value)) # <class 'int'>
print(isinstance(value, int)) # True
# A tuple of types matches when the value is an instance of any one of them.
# isinstance() also accepts subclasses; bool is a subclass of int, so
# isinstance(True, int) is True as well.
print(isinstance(value, (int, float))) # True
# Check for numeric types
def is_numeric(x):
    """Return True if *x* is an ``int`` or ``float``, excluding ``bool``.

    ``bool`` is a subclass of ``int``, so a plain
    ``isinstance(x, (int, float))`` check would also report ``True`` and
    ``False`` as numeric. Flags are rejected explicitly so they are not
    mistaken for measurements during data validation.

    Args:
        x: Any object.

    Returns:
        ``True`` for ``int`` and ``float`` instances that are not ``bool``;
        ``False`` otherwise.
    """
    return isinstance(x, (int, float)) and not isinstance(x, bool)
# Type conversion in data pipelines
data = ['1', '2', '3', '4', '5']
int_data = [int(x) for x in data]  # [1, 2, 3, 4, 5]

# Handle mixed types in data: keep only real numbers.
# bool is a subclass of int, so isinstance(True, (int, float)) is True;
# exclude bool explicitly, otherwise the flag would survive the filter.
mixed = [1, 'two', 3.0, True]
clean = [
    x for x in mixed
    if isinstance(x, (int, float)) and not isinstance(x, bool)
]  # [1, 3.0]
Data Types in NumPy and Pandas
import numpy as np
import pandas as pd
# NumPy data types (dtype= sets the element type explicitly instead of
# letting NumPy infer it from the values)
arr_int = np.array([1, 2, 3], dtype=np.int32)
arr_float = np.array([1.0, 2.0, 3.0], dtype=np.float64)
arr_bool = np.array([True, False, True], dtype=np.bool_)
print(arr_int.dtype) # int32
print(arr_float.dtype) # float64
# Pandas data types (no dtype is given, so each column's dtype is inferred
# from its values)
df = pd.DataFrame({
'int_col': [1, 2, 3],
'float_col': [1.1, 2.2, 3.3],
'str_col': ['a', 'b', 'c'],
'bool_col': [True, False, True]
})
# df.dtypes lists one dtype per column; expected output:
print(df.dtypes)
# int_col int64
# float_col float64
# str_col object
# bool_col bool
# NOTE(review): the dtype reported for str_col depends on the pandas version
# and string-inference settings (newer releases may show a dedicated string
# dtype instead of object) - confirm against the installed version.
Key Takeaways
- Choose appropriate types - Use int/float for numeric data, str for text
- Handle type conversion - Convert strings to numbers for calculations
- Use collections wisely - Lists for ordered data, dicts for key-value pairs
- Consider memory - Use appropriate dtypes in NumPy/Pandas for large datasets
Common Data Science Type Operations
| Operation | Code |
|---|---|
| Check type | type(x) |
| Convert to int | int(x) |
| Convert to float | float(x) |
| Convert to string | str(x) |
| Check numeric | isinstance(x, (int, float)) |
| NumPy dtype | arr.dtype |
| Pandas dtype | df.dtypes |