LLM Cost Optimization
LLM cost optimization involves strategically routing requests to appropriate models, leveraging caching, batching similar requests, and monitoring spending to minimize costs while maintaining quality.
Intelligent Model Router
from dataclasses import dataclass
from typing import List
import re
@dataclass
class ModelConfig:
    """Pricing and capability profile for one LLM the router may choose."""

    # Identifier; the router keys its model table on this value.
    name: str
    # Price charged per 1,000 input tokens.
    cost_per_1k_input: float
    # Price charged per 1,000 output tokens.
    cost_per_1k_output: float
    max_tokens: int
    # Relative quality rating on a 0-1 scale.
    quality_score: float  # 0-1
    # Relative speed rating on a 0-1 scale.
    speed_score: float  # 0-1
class CostAwareRouter:
    """Route prompts to the cheapest configured model that meets a quality floor.

    The router tracks a running total of estimated spend and falls back to the
    cheapest model overall once the next request would exceed the budget.
    """

    # Markers that suggest the prompt contains source code.
    _CODE_PATTERN = re.compile(r'```|def |class |import ')
    # Rough words-to-tokens multiplier used when estimating input size.
    _TOKENS_PER_WORD = 1.3
    # Output length assumed for every cost estimate.
    _ASSUMED_OUTPUT_TOKENS = 200

    def __init__(self, models: List["ModelConfig"], budget_limit: float = 100.0):
        """Index the models by name and reset the spend counter.

        Args:
            models: Model configurations available for routing.
            budget_limit: Total estimated spend allowed before the router
                falls back to the cheapest model.
        """
        self.models = {m.name: m for m in models}
        self.budget_limit = budget_limit
        self.current_spend = 0.0

    def analyze_complexity(self, prompt: str) -> dict:
        """Classify a prompt as low/medium/high complexity with simple heuristics.

        Returns:
            A dict with keys "complexity", "tokens" (whitespace-separated word
            count) and "has_code".
        """
        token_count = len(prompt.split())
        has_code = bool(self._CODE_PATTERN.search(prompt))
        is_question = '?' in prompt

        if token_count > 200 or has_code:
            complexity = "high"
        elif token_count > 50 or is_question:
            complexity = "medium"
        else:
            complexity = "low"

        return {
            "complexity": complexity,
            "tokens": token_count,
            "has_code": has_code
        }

    def select_model(self, prompt: str, quality_required: float = 0.7) -> str:
        """Return the name of the cheapest model meeting ``quality_required``.

        Selection depends only on quality score and input price; ``prompt`` is
        kept in the signature for callers but does not influence the choice.
        If no model reaches the quality floor, every model is considered.

        Raises:
            ValueError: If no models are configured.
        """
        if not self.models:
            raise ValueError("no models configured")

        suitable = [
            model for model in self.models.values()
            if model.quality_score >= quality_required
        ]
        candidates = suitable or list(self.models.values())
        return min(candidates, key=lambda m: m.cost_per_1k_input).name

    def estimate_cost(self, model_name: str, input_tokens: float, output_tokens: float) -> float:
        """Return the estimated price of a call with the given token counts.

        Raises:
            KeyError: If ``model_name`` is not a configured model.
        """
        model = self.models[model_name]
        input_cost = (input_tokens / 1000) * model.cost_per_1k_input
        output_cost = (output_tokens / 1000) * model.cost_per_1k_output
        return input_cost + output_cost

    def _cheapest_model_name(self) -> str:
        """Return the name of the model with the lowest input price."""
        return min(self.models.values(), key=lambda m: m.cost_per_1k_input).name

    def route(self, prompt: str, quality_required: float = 0.7) -> dict:
        """Choose a model for ``prompt`` and add its estimated cost to the spend.

        Returns:
            A dict with keys "model" and "estimated_cost". The cost always
            corresponds to the model that is returned.
        """
        model_name = self.select_model(prompt, quality_required)
        estimated_tokens = len(prompt.split()) * self._TOKENS_PER_WORD
        cost = self.estimate_cost(model_name, estimated_tokens, self._ASSUMED_OUTPUT_TOKENS)

        if self.current_spend + cost > self.budget_limit:
            # Over budget: downgrade to the cheapest model and re-price the
            # request so the recorded spend matches the model actually used.
            model_name = self._cheapest_model_name()
            cost = self.estimate_cost(model_name, estimated_tokens, self._ASSUMED_OUTPUT_TOKENS)

        self.current_spend += cost
        return {"model": model_name, "estimated_cost": cost}
# Usage
# Register three models with their per-1k-token prices and scores, then route
# a short question using the default quality requirement.
router = CostAwareRouter(
    models=[
        ModelConfig(
            name="gpt-3.5-turbo",
            cost_per_1k_input=0.0015,
            cost_per_1k_output=0.002,
            max_tokens=4096,
            quality_score=0.7,
            speed_score=0.9,
        ),
        ModelConfig(
            name="gpt-4",
            cost_per_1k_input=0.03,
            cost_per_1k_output=0.06,
            max_tokens=8192,
            quality_score=0.95,
            speed_score=0.5,
        ),
        ModelConfig(
            name="claude-haiku",
            cost_per_1k_input=0.0025,
            cost_per_1k_output=0.0125,
            max_tokens=200000,
            quality_score=0.75,
            speed_score=0.85,
        ),
    ]
)
result = router.route(prompt="What is 2+2?")
Request Batching
import asyncio
from collections import defaultdict
from typing import List, Callable
import time
class RequestBatcher:
    """Queue individual requests and send them to the LLM as one joined prompt.

    Prompts in a batch are joined with ``"\\n---\\n"`` and passed to
    ``llm_call``, which must return one response per prompt, in order.
    """

    def __init__(self, llm_call: Callable, max_batch_size: int = 10,
                 max_wait_ms: int = 100):
        """Store the batching parameters and start with an empty queue.

        Args:
            llm_call: Awaitable callable taking the joined prompt string and
                returning an iterable of responses.
            max_batch_size: Number of queued requests that triggers an
                immediate flush, and the maximum sent per call.
            max_wait_ms: How long a request waits for the batch to fill before
                flushing whatever is queued.
        """
        self.llm_call = llm_call
        self.max_batch_size = max_batch_size
        self.max_wait_ms = max_wait_ms
        self.pending = []

    async def add_request(self, request: dict) -> dict:
        """Queue ``request`` and return its response once its batch completes.

        Args:
            request: Mapping that contains a "prompt" entry.

        Raises:
            Exception: Whatever ``llm_call`` raised for the batch, or
                ``ValueError`` if no response was returned for this request.
        """
        future = asyncio.get_running_loop().create_future()
        self.pending.append({"request": request, "future": future})

        if len(self.pending) >= self.max_batch_size:
            await self._process_batch()
        else:
            await asyncio.sleep(self.max_wait_ms / 1000)
            # Another caller may already have flushed the queue while we slept.
            if self.pending:
                await self._process_batch()

        return await future

    async def _process_batch(self):
        """Send up to ``max_batch_size`` queued requests and resolve their futures."""
        if not self.pending:
            return

        batch = self.pending[:self.max_batch_size]
        self.pending = self.pending[self.max_batch_size:]

        try:
            # Building the prompt is inside the try so a malformed request
            # fails the batch instead of stranding the other waiters.
            batch_prompt = "\n---\n".join(item["request"]["prompt"] for item in batch)
            responses = list(await self.llm_call(batch_prompt))
        except Exception as e:
            self._fail(batch, e)
            return

        for item, response in zip(batch, responses):
            # A caller may have cancelled its future while the call was in flight.
            if not item["future"].done():
                item["future"].set_result(response)

        # Requests without a matching response would otherwise wait forever.
        unanswered = batch[len(responses):]
        if unanswered:
            self._fail(unanswered, ValueError(
                f"llm_call returned {len(responses)} responses for {len(batch)} requests"
            ))

    @staticmethod
    def _fail(items: list, error: Exception) -> None:
        """Set ``error`` on every still-pending future in ``items``."""
        for item in items:
            if not item["future"].done():
                item["future"].set_exception(error)
# Usage
async def batch_llm_call(prompts: str) -> List[str]:
    """Send the joined prompt to the LLM and split the reply into segments.

    Args:
        prompts: Several prompts already joined into one string.

    Returns:
        The reply text split on "---", with surrounding whitespace removed
        from each segment. Empty segments are kept so positions still line up
        with the prompts.
    """
    # NOTE(review): `llm` is not defined in this snippet; it is assumed to be a
    # client whose `ainvoke` result exposes `.content` as text — confirm.
    response = await llm.ainvoke(prompts)
    # Prompts are joined with "\n---\n", so a raw split on "---" leaves stray
    # newlines on every segment; strip them.
    return [segment.strip() for segment in response.content.split("---")]
# Module-level batcher wired to batch_llm_call, using the default batch size
# and wait time.
batcher = RequestBatcher(llm_call=batch_llm_call)
Cost Monitoring Dashboard
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List
import json
@dataclass
class UsageRecord:
    """One logged LLM call: when it happened, what it used, and what it cost."""

    # Timestamp string; the monitor matches it by date/month prefix.
    timestamp: str
    # Name of the model that served the call.
    model: str
    input_tokens: int
    output_tokens: int
    # Cost attributed to this call.
    cost: float
    # Identifier of the user the call is attributed to.
    user_id: str
class CostMonitor:
    """Record per-call LLM usage in memory and report spend against budgets."""

    def __init__(self, daily_budget: float = 50.0, monthly_budget: float = 1000.0,
                 alert_threshold: float = 0.9):
        """Start with no records and the given budgets.

        Args:
            daily_budget: Spend allowed per calendar day.
            monthly_budget: Spend allowed per calendar month.
            alert_threshold: Fraction of either budget above which
                ``check_budget`` raises its alert flag.
        """
        self.records: List[UsageRecord] = []
        self.daily_budget = daily_budget
        self.monthly_budget = monthly_budget
        self.alert_threshold = alert_threshold

    def record_usage(self, model: str, input_tokens: int, output_tokens: int,
                     cost: float, user_id: str = "default"):
        """Append a usage record stamped with the current local time and return it."""
        record = UsageRecord(
            timestamp=datetime.now().isoformat(),
            model=model,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cost=cost,
            user_id=user_id
        )
        self.records.append(record)
        return record

    def _cost_with_prefix(self, prefix: str) -> float:
        """Sum the cost of records whose timestamp starts with ``prefix``."""
        return sum(r.cost for r in self.records if r.timestamp.startswith(prefix))

    def get_daily_cost(self, date: str = None) -> float:
        """Return total cost for ``date`` ("YYYY-MM-DD"); defaults to today."""
        date = date or datetime.now().strftime("%Y-%m-%d")
        return self._cost_with_prefix(date)

    def get_monthly_cost(self) -> float:
        """Return total cost recorded in the current calendar month."""
        return self._cost_with_prefix(datetime.now().strftime("%Y-%m"))

    def check_budget(self) -> dict:
        """Compare today's and this month's spend with the budgets.

        Returns:
            A dict with remaining amounts (never negative), percentage used
            (0 when the budget is 0), and an "alert" flag set when either
            spend exceeds ``alert_threshold`` of its budget.
        """
        daily = self.get_daily_cost()
        monthly = self.get_monthly_cost()
        return {
            "daily_remaining": max(0, self.daily_budget - daily),
            "monthly_remaining": max(0, self.monthly_budget - monthly),
            "daily_pct": (daily / self.daily_budget * 100) if self.daily_budget else 0,
            "monthly_pct": (monthly / self.monthly_budget * 100) if self.monthly_budget else 0,
            "alert": (daily > self.daily_budget * self.alert_threshold
                      or monthly > self.monthly_budget * self.alert_threshold)
        }

    def get_breakdown(self) -> Dict:
        """Return call count, total cost and total tokens keyed by model name."""
        # Plain dict + setdefault keeps this method free of any import that
        # the surrounding snippet does not itself provide.
        breakdown: Dict[str, dict] = {}
        for r in self.records:
            entry = breakdown.setdefault(r.model, {"count": 0, "cost": 0, "tokens": 0})
            entry["count"] += 1
            entry["cost"] += r.cost
            entry["tokens"] += r.input_tokens + r.output_tokens
        return breakdown

    def export_report(self, filename: str = "cost_report.json"):
        """Write a JSON report with a summary, the per-model breakdown and the
        100 most recent records to ``filename``."""
        report = {
            "summary": {
                "total_records": len(self.records),
                "total_cost": sum(r.cost for r in self.records),
                "budget_status": self.check_budget()
            },
            "breakdown": self.get_breakdown(),
            "recent_records": [vars(r) for r in self.records[-100:]]
        }
        # Explicit encoding so the output does not depend on the platform default.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
Prompt Optimization for Cost
class PromptOptimizer:
    """Shrink prompts by removing filler phrases and redundant whitespace."""

    def __init__(self):
        """Set up the ordered list of regex rewrite rules."""
        # Filler phrases are matched case-insensitively and only at a word
        # boundary, so "Please" is removed but "displease" is left intact.
        # The final rule collapses runs of whitespace into one space.
        self.optimization_rules = [
            {"pattern": r"(?i)\bplease\s+", "replacement": ""},
            {"pattern": r"(?i)\bcould you\s+", "replacement": ""},
            {"pattern": r"(?i)\bi would like you to\s+", "replacement": ""},
            {"pattern": r"\s+", "replacement": " "}
        ]

    def compress_prompt(self, prompt: str) -> str:
        """Apply every rule in order and return the trimmed result."""
        import re  # local import keeps this snippet self-contained

        optimized = prompt
        for rule in self.optimization_rules:
            optimized = re.sub(rule["pattern"], rule["replacement"], optimized)
        return optimized.strip()

    def extract_system_prompt(self, messages: list) -> tuple:
        """Split chat messages into the system prompt and everything else.

        Args:
            messages: Dicts with "role" and "content" entries.

        Returns:
            ``(system, others)`` where ``system`` is the content of the last
            system message ("" if none) and ``others`` is the list of
            non-system messages in their original order.
        """
        system = ""
        others = []
        for msg in messages:
            if msg["role"] == "system":
                system = msg["content"]
            else:
                others.append(msg)
        return system, others

    def estimate_savings(self, original: str, optimized: str,
                         cost_per_token: float = 0.00003) -> dict:
        """Estimate the tokens and cost saved by using ``optimized``.

        Token counts are approximated as 1.3 tokens per whitespace-separated
        word.

        Args:
            original: Prompt before optimization.
            optimized: Prompt after optimization.
            cost_per_token: Price applied to each token saved.

        Returns:
            A dict with truncated token counts, the percentage saved (0 when
            ``original`` has no words) and the estimated cost saved.
        """
        orig_tokens = len(original.split()) * 1.3
        opt_tokens = len(optimized.split()) * 1.3
        saved_tokens = orig_tokens - opt_tokens
        return {
            "original_tokens": int(orig_tokens),
            "optimized_tokens": int(opt_tokens),
            "savings_pct": (saved_tokens / orig_tokens * 100) if orig_tokens else 0,
            "estimated_cost_savings": saved_tokens * cost_per_token
        }
Key Takeaways
- Model routing selects the cheapest model that meets quality requirements
- Semantic caching eliminates redundant API calls
- Request batching reduces per-request overhead
- Cost monitoring prevents budget overruns
- Prompt optimization reduces token usage without losing effectiveness