NumPy — Complete Guide to Array Computing

NumPy: The Engine of Scientific Python

NumPy (Numerical Python) provides the ndarray — a fast, memory-efficient multi-dimensional array. It is the foundation of every major data science library (Pandas, Scikit-learn, TensorFlow all use NumPy arrays internally).

Why NumPy is fast:

Data stored in contiguous memory blocks (like C arrays)
Operations implemented in C — no Python overhead per element
Vectorized operations replace slow Python loops
BLAS/LAPACK libraries for linear algebra

Array Creation

import numpy as np

# From Python lists
a1 = np.array([1, 2, 3, 4, 5])           # 1D array
a2 = np.array([[1, 2, 3],                  # 2D array (matrix)
               [4, 5, 6]])
a3 = np.array([[[1,2],[3,4]],              # 3D array
               [[5,6],[7,8]]])

# Shape, dtype, size
print(a1.shape)    # (5,)
print(a2.shape)    # (2, 3)
print(a3.shape)    # (2, 2, 2)
print(a2.ndim)     # 2
print(a2.dtype)    # int64
print(a2.size)     # 6  (total elements)
print(a2.nbytes)   # 48 (bytes in memory)

# Built-in creators
zeros  = np.zeros((3, 4))                   # all zeros, float64
ones   = np.ones((2, 3), dtype=np.float32)  # all ones, 32-bit
eye    = np.eye(4)                           # identity matrix
full   = np.full((3, 3), 7.0)               # all 7s
empty  = np.empty((2, 2))                    # uninitialized (fast, but values garbage)

arange  = np.arange(0, 20, 2)              # [0, 2, 4, ..., 18]
linspace = np.linspace(0, 1, 11)           # [0.0, 0.1, 0.2, ..., 1.0]
logspace = np.logspace(0, 3, 7)            # log-spaced from 10^0 to 10^3

# Random arrays
rng = np.random.default_rng(seed=42)       # reproducible generator (modern API)
uniform    = rng.uniform(0, 1, (3, 4))     # uniform [0, 1)
normal     = rng.normal(loc=0, scale=1, size=(100,))  # standard normal
integers   = rng.integers(0, 100, size=10)  # random ints

# Reshape and resize
flat  = np.arange(12)                       # [0..11]
mat   = flat.reshape(3, 4)                  # 3 rows, 4 columns
cube  = flat.reshape(2, 2, 3)               # 3D
back  = mat.ravel()                          # back to 1D
col_v = flat.reshape(-1, 1)                  # column vector (12, 1)  — -1 means infer
row_v = flat.reshape(1, -1)                  # row vector (1, 12)

Indexing and Slicing

arr = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90])

# Basic indexing
print(arr[0])       # 10  (first element)
print(arr[-1])      # 90  (last element)
print(arr[2:7])     # [30 40 50 60 70]   (exclusive stop)
print(arr[::2])     # [10 30 50 70 90]   (every 2nd)
print(arr[::-1])    # [90 80 70 60 50 40 30 20 10]  (reversed)

# 2D indexing
M = np.arange(1, 17).reshape(4, 4)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]
#  [13 14 15 16]]

print(M[1, 2])         # 7   (row 1, col 2)
print(M[0, :])         # [1 2 3 4]   (first row)
print(M[:, 1])         # [2 6 10 14] (second column)
print(M[1:3, 1:3])     # [[6 7], [10 11]] (submatrix)
print(M[-1, :])        # last row
print(M[:, -1])        # last column

# Boolean (fancy) indexing — CRITICAL for data science
scores = np.array([72, 85, 91, 68, 77, 88, 95, 73, 82])
print(scores[scores >= 80])           # [85 91 88 95 82]
print(scores[(scores > 70) & (scores < 90)])  # [85 77 88 73 82]

# Modify values using boolean index
scores[scores < 70] = 70              # floor all values at 70
print(scores)

# np.where — conditional element-wise selection
grades = np.where(scores >= 80, 'Pass', 'Fail')
print(grades)

# Fancy indexing (index with array)
indices = np.array([0, 3, 6])
print(scores[indices])   # [70 70 95]  (elements at positions 0, 3, 6)

# np.take — same as fancy indexing
print(np.take(scores, [0, 3, 6]))

# IMPORTANT: views vs copies
arr = np.arange(10)
view = arr[2:5]         # SLICE creates a VIEW (shares memory)
view[0] = 99            # modifies arr too!
print(arr)              # [0 1 99 3 4 5 6 7 8 9]

copy = arr[2:5].copy()  # explicit copy — independent
copy[0] = 0             # does NOT affect arr

Vectorized Operations — Replace All Python Loops

import numpy as np
import time

n = 10_000_000
arr = np.arange(n, dtype=np.float64)

# Python loop — SLOW
start = time.time()
result_py = [x**2 for x in range(n)]
t_py = time.time() - start

# NumPy vectorized — FAST
start = time.time()
result_np = arr ** 2
t_np = time.time() - start

print(f"Python loop: {t_py:.3f}s")
print(f"NumPy array: {t_np:.3f}s")
print(f"Speedup: {t_py/t_np:.0f}x")   # typically 100-500x faster

# Universal Functions (ufuncs) — element-wise operations
a = np.array([1.0, 4.0, 9.0, 16.0, 25.0])

np.sqrt(a)           # [1. 2. 3. 4. 5.]
np.exp(a)            # [e^1, e^4, e^9, ...]
np.log(a)            # natural log
np.log2(a)           # base-2 log
np.log10(a)          # base-10 log
np.abs(np.array([-1, 2, -3]))   # [1 2 3]
np.sign(np.array([-3, 0, 5]))   # [-1  0  1]
np.floor(np.array([1.2, 2.7]))  # [1. 2.]
np.ceil(np.array([1.2, 2.7]))   # [2. 3.]
np.round(np.array([1.25, 2.75]), 1)  # [1.2, 2.8]

# Trigonometric
x = np.linspace(0, 2*np.pi, 100)
y = np.sin(x) + 0.5 * np.cos(2*x)   # complex expression, element-wise

# Statistical aggregations
data = np.random.normal(50, 10, (100, 5))  # 100 samples, 5 features

print(data.mean())                     # grand mean
print(data.mean(axis=0))               # column means (shape: 5,)
print(data.mean(axis=1))               # row means (shape: 100,)
print(data.std(ddof=0))                # population std
print(data.std(ddof=1, axis=0))        # sample std per column
print(data.var(axis=0))                # variance per column
print(np.median(data, axis=0))         # median per column
print(np.percentile(data, [25,50,75], axis=0))  # quartiles per column
print(data.min(axis=0), data.max(axis=0))       # min/max per column
print(data.sum(axis=0))                # column sums
print(data.cumsum(axis=0))             # cumulative sum
print(data.argmin(axis=0))             # index of min per column
print(data.argmax(axis=0))             # index of max per column

# Sorting
arr = np.array([3, 1, 4, 1, 5, 9, 2, 6])
print(np.sort(arr))           # sorted copy
arr.sort()                    # in-place sort
print(np.argsort(arr))        # indices that would sort arr (useful for ranking)
print(np.argsort(arr)[::-1])  # descending order indices

Broadcasting — The Key to Elegant NumPy Code

Broadcasting allows operations between arrays of different shapes. NumPy stretches the smaller array without copying data.

Broadcasting Rules (applied right to left on dimensions):

If arrays have different ndim, pad smaller shape with 1s on the LEFT
Dimensions of size 1 are "stretched" to match the other
If sizes don't match and neither is 1, error

# Scalar broadcast
a = np.array([1, 2, 3, 4, 5])
print(a + 10)      # [11 12 13 14 15]  (10 broadcast to all elements)
print(a * 2)       # [ 2  4  6  8 10]

# 1D + 1D broadcast
a = np.array([1, 2, 3])         # shape (3,)
b = np.array([10, 20, 30])      # shape (3,)
print(a + b)       # [11 22 33]  (same shape, no broadcasting)

# 2D + 1D broadcast  ← VERY common in data science
M = np.array([[1, 2, 3],        # shape (3, 3)
              [4, 5, 6],
              [7, 8, 9]])
v = np.array([10, 20, 30])      # shape (3,) → broadcast to (3,3)

print(M + v)
# [[11 22 33]
#  [14 25 36]
#  [17 28 39]]

# Column vector broadcast
col = np.array([[100],          # shape (3, 1)
               [200],
               [300]])
print(M + col)
# [[101 102 103]
#  [204 205 206]
#  [307 308 309]]

# Real-world example: normalize each feature (column) to zero mean, unit variance
data = np.random.randn(1000, 10)          # 1000 samples, 10 features
means  = data.mean(axis=0, keepdims=True) # shape (1, 10) — keepdims is key
stds   = data.std(axis=0,  keepdims=True) # shape (1, 10)
normalized = (data - means) / stds        # broadcast: (1000,10) - (1,10) = (1000,10)

print(normalized.mean(axis=0).round(10))  # all ~0
print(normalized.std(axis=0).round(10))   # all ~1

# Outer product via broadcasting
x = np.array([1, 2, 3])[:, np.newaxis]   # shape (3, 1) — add new axis
y = np.array([10, 20, 30])[np.newaxis, :] # shape (1, 3) — add new axis
outer = x * y
# [[10 20 30]
#  [20 40 60]
#  [30 60 90]]

Linear Algebra

# Matrix operations
A = np.array([[2, 1, 0],
              [1, 3, 1],
              [0, 1, 2]], dtype=float)
b = np.array([1, 0, -1], dtype=float)

# Solve linear system Ax = b
x = np.linalg.solve(A, b)
print(f"Solution: {x}")
print(f"Verify Ax: {A @ x}")   # should equal b

# Matrix multiplication
A = np.random.randn(4, 3)
B = np.random.randn(3, 5)
C = A @ B                   # (4,3) @ (3,5) = (4,5)   ← @ is matmul
C_old = np.matmul(A, B)    # same thing, older syntax

# Transpose
print(A.T)                  # transpose
print(A.T.shape)            # (3, 4)

# Determinant, inverse, rank
sq = np.array([[3, 1], [1, 2]], dtype=float)
print(np.linalg.det(sq))    # determinant
print(np.linalg.inv(sq))    # inverse (use solve() instead when possible)
print(np.linalg.matrix_rank(sq))  # rank

# Eigendecomposition
eigenvalues, eigenvectors = np.linalg.eig(sq)
print(f"Eigenvalues:  {eigenvalues}")
print(f"Eigenvectors:\n{eigenvectors}")

# SVD — Singular Value Decomposition (fundamental for PCA, recommender systems)
data = np.random.randn(100, 5)
U, S, Vt = np.linalg.svd(data, full_matrices=False)
print(f"U: {U.shape}")   # (100, 5) — left singular vectors
print(f"S: {S.shape}")   # (5,)     — singular values
print(f"Vt:{Vt.shape}")  # (5, 5)   — right singular vectors (transposed)

# Reconstruct from SVD
reconstructed = U @ np.diag(S) @ Vt
print(f"Max reconstruction error: {np.abs(data - reconstructed).max():.2e}")

# Norm
v = np.array([3.0, 4.0])
print(np.linalg.norm(v))        # 5.0  (Euclidean / L2 norm)
print(np.linalg.norm(v, ord=1)) # 7.0  (L1 norm)
print(np.linalg.norm(v, ord=np.inf))  # 4.0  (L∞ norm)

Performance Tips

# 1. Avoid Python loops — use vectorized operations
# BAD
result = []
for i in range(len(data)):
    result.append(data[i] ** 2 + 1)

# GOOD
result = data**2 + 1

# 2. Specify dtype to save memory
float32_arr = np.zeros((1000, 1000), dtype=np.float32)  # 4MB instead of 8MB
int8_arr = np.zeros(1000, dtype=np.int8)                 # if values fit in [-128, 127]

# 3. Use in-place operations to avoid copies
a = np.ones(1_000_000)
a *= 2          # in-place, no copy
a += 1          # in-place, no copy
np.add(a, 1, out=a)   # explicit in-place

# 4. np.einsum for complex tensor operations
A = np.random.randn(10, 5)
B = np.random.randn(5, 8)
C = np.einsum('ij,jk->ik', A, B)  # equivalent to A @ B but more general

# 5. Memory layout matters — row-major (C) vs column-major (Fortran)
arr_c = np.ascontiguousarray(data)   # ensure row-major
arr_f = np.asfortranarray(data)      # column-major

# Summing along axis 0 is faster on C-order (row-major) arrays

Key Takeaways

NumPy arrays are homogeneous and contiguous — this is what makes them fast
Vectorized operations replace Python loops — learn to think in arrays, not loops
Broadcasting allows elegant operations between different-shaped arrays — understand the rules
Boolean indexing enables powerful data filtering without explicit loops
Views vs copies: slices create views (memory-efficient but mutates original); .copy() for independence
Specify dtype when memory matters — float32 vs float64 is a 2× memory difference
@ operator is matrix multiplication; * is element-wise

→ Next: Pandas — Data Manipulation