Quantization
What is Quantization?
Quantization reduces model size and computational requirements by representing weights with lower-precision numbers (e.g., 8-bit integers instead of 32-bit floats).
Quantization Methods
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
# Post-Training Quantization (PTQ)
def quantize_ptq(model_name, bits=8):
    """Load a causal language model with a bitsandbytes quantization config.

    Args:
        model_name: Model identifier or path forwarded to
            ``AutoModelForCausalLM.from_pretrained``.
        bits: Target precision. ``8`` builds an 8-bit config; ``4`` builds a
            4-bit config using the "nf4" quant type with float16 compute.

    Returns:
        The model returned by ``from_pretrained`` with the quantization
        config applied.

    Raises:
        ValueError: If ``bits`` is neither 8 nor 4.
    """
    # Validate before building anything: an unsupported value used to leave
    # ``config`` unbound and surface later as a confusing UnboundLocalError.
    if bits not in (8, 4):
        raise ValueError(f"Unsupported bits={bits!r}; expected 8 or 4")

    if bits == 8:
        config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,
        )
    else:
        config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
        )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=config,
    )
# Quantization-Aware Training (QAT)
class QuantAwareTraining:
    """Thin wrapper around PyTorch quantization-aware training (QAT).

    Intended flow: ``prepare_qat()``, then run the normal training loop on
    ``self.model``, then ``convert()``. Both steps modify the model in place.
    """

    def __init__(self, model):
        """Store the model that will be prepared and converted in place."""
        self.model = model

    def prepare_qat(self):
        """Attach the default 'fbgemm' QAT qconfig and prepare the model for QAT."""
        # QAT needs a qconfig built from fake-quantize modules; the original
        # get_default_qconfig returns the post-training (observer) qconfig.
        self.model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
        # torch's prepare_qat asserts the model is in training mode, so a
        # model that was loaded in eval mode would otherwise fail here.
        self.model.train()
        torch.quantization.prepare_qat(self.model, inplace=True)

    def convert(self):
        """Convert the QAT-prepared model to its quantized form in place."""
        # Switch to eval mode before conversion, as the PyTorch QAT workflow
        # does, so the model is not left in training mode after converting.
        self.model.eval()
        torch.quantization.convert(self.model, inplace=True)
Quantization Libraries
| Library | Features |
|---|---|
| bitsandbytes | Easy 8-bit/4-bit loading |
| GPTQ | Post-training quantization |
| AWQ | Activation-aware quantization |
| GGML/GGUF | CPU-optimized quantization |
Using GGUF Format
# For llama.cpp compatible models
from llama_cpp import Llama
def load_quantized_gguf(model_path, n_ctx=2048, n_gpu_layers=35):
    """Load a GGUF model file through llama-cpp-python.

    Args:
        model_path: Path to the GGUF model file.
        n_ctx: Context size forwarded to ``Llama``.
        n_gpu_layers: Number of layers to offload to the GPU. Defaults to 35,
            the value that was previously hard-coded. Per the llama-cpp-python
            documentation, 0 keeps every layer on the CPU and -1 offloads all
            layers.

    Returns:
        The constructed ``Llama`` instance.
    """
    return Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_gpu_layers=n_gpu_layers,
    )
# Example: load a GGUF file, then request a completion capped at 100 tokens
llm = load_quantized_gguf(model_path="model.Q4_K_M.gguf")
output = llm(
    "Hello, world!",
    max_tokens=100,
)
Quality vs Size Tradeoffs
def compare_quantization_quality(original_model, quantized_model, test_data, num_runs=100):
    """Benchmark an original model against its quantized counterpart.

    Despite the name, this measures only generation time and memory usage;
    it does not score the quality of the generated output.

    Args:
        original_model: Unquantized model; must expose ``generate(test_data)``.
        quantized_model: Quantized model; must expose ``generate(test_data)``.
        test_data: Input passed unchanged to every ``generate`` call.
        num_runs: Number of ``generate`` calls timed per model. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        A dict with keys ``'original_time'`` and ``'quantized_time'`` (total
        elapsed seconds for ``num_runs`` calls) and ``'original_memory'`` and
        ``'quantized_memory'`` (the values returned by ``get_model_memory``).
    """
    import time

    def _time_generate(model):
        """Return the total seconds taken by ``num_runs`` generate calls."""
        # perf_counter is monotonic and high-resolution, so the measurement
        # is not skewed by system clock adjustments the way time.time() is.
        start = time.perf_counter()
        for _ in range(num_runs):
            model.generate(test_data)
        return time.perf_counter() - start

    results = {
        'original_time': _time_generate(original_model),
        'quantized_time': _time_generate(quantized_model),
    }

    # NOTE: get_model_memory is not defined in this section; it must be
    # provided elsewhere in the module for this function to run.
    results['original_memory'] = get_model_memory(original_model)
    results['quantized_memory'] = get_model_memory(quantized_model)
    return results
Summary
Quantization enables deploying large models on resource-constrained devices while maintaining acceptable quality. Choose the quantization level based on your hardware and quality requirements.
Next: We'll explore model evaluation metrics.