LLM Cost Optimization

LLM cost optimization involves strategically routing requests to appropriate models, leveraging caching, batching similar requests, and monitoring spending to minimize costs while maintaining quality.

Intelligent Model Router

from dataclasses import dataclass
from typing import List
import re

@dataclass
class ModelConfig:
    """Static description of one LLM that CostAwareRouter can choose from."""

    # Identifier of the model; CostAwareRouter keys its model table by this.
    name: str
    # Price charged per 1,000 input tokens (see CostAwareRouter.estimate_cost).
    cost_per_1k_input: float
    # Price charged per 1,000 output tokens (see CostAwareRouter.estimate_cost).
    cost_per_1k_output: float
    # NOTE(review): presumably the model's context/token limit; nothing in
    # this file reads it yet - confirm intended use.
    max_tokens: int
    # Quality rating in the range 0-1; compared against the caller's
    # `quality_required` threshold when selecting a model.
    quality_score: float  # 0-1
    # Speed rating in the range 0-1; not consulted by the router in this file.
    speed_score: float    # 0-1

class CostAwareRouter:
    """Route prompts to the cheapest configured model that meets a quality floor.

    The router keeps a running total of estimated spend. Once a request would
    push that total past ``budget_limit`` it falls back to the cheapest model
    regardless of quality.

    Args:
        models: Model descriptions; each needs ``name``, ``quality_score``,
            ``cost_per_1k_input`` and ``cost_per_1k_output`` attributes.
        budget_limit: Spend ceiling that triggers the cheapest-model fallback.
    """

    def __init__(self, models: List["ModelConfig"], budget_limit: float = 100.0):
        self.models = {m.name: m for m in models}
        self.budget_limit = budget_limit
        self.current_spend = 0.0

    def analyze_complexity(self, prompt: str) -> dict:
        """Classify a prompt as low/medium/high complexity with cheap heuristics.

        Returns:
            A dict with keys ``"complexity"`` (``"low"``, ``"medium"`` or
            ``"high"``), ``"tokens"`` (whitespace-separated word count) and
            ``"has_code"`` (whether the prompt looks like it contains code).
        """
        # Word count is only a rough stand-in for a real tokenizer.
        token_count = len(prompt.split())
        has_code = bool(re.search(r'```|def |class |import ', prompt))
        is_question = '?' in prompt

        if token_count > 200 or has_code:
            complexity = "high"
        elif token_count > 50 or is_question:
            complexity = "medium"
        else:
            complexity = "low"

        return {
            "complexity": complexity,
            "tokens": token_count,
            "has_code": has_code
        }

    def _cheapest(self, candidates):
        """Return the candidate with the lowest input-token price."""
        return min(candidates, key=lambda m: m.cost_per_1k_input)

    def select_model(self, prompt: str, quality_required: float = 0.7) -> str:
        """Return the name of the cheapest model meeting ``quality_required``.

        If no model reaches the threshold, every configured model is
        considered instead, so the cheapest one overall is returned.
        ``prompt`` is accepted for interface compatibility; selection is
        currently driven only by quality and input price.

        Raises:
            ValueError: If the router was built with no models.
        """
        suitable = [
            model for model in self.models.values()
            if model.quality_score >= quality_required
        ]
        return self._cheapest(suitable or self.models.values()).name

    def estimate_cost(self, model_name: str, input_tokens: float, output_tokens: float) -> float:
        """Return the estimated cost of one call to ``model_name``.

        Raises:
            KeyError: If ``model_name`` is not a configured model.
        """
        model = self.models[model_name]
        input_cost = (input_tokens / 1000) * model.cost_per_1k_input
        output_cost = (output_tokens / 1000) * model.cost_per_1k_output
        return input_cost + output_cost

    def route(self, prompt: str, quality_required: float = 0.7,
              expected_output_tokens: int = 200) -> dict:
        """Choose a model for ``prompt`` and add its estimated cost to the spend.

        Args:
            prompt: The request text.
            quality_required: Minimum acceptable ``quality_score``.
            expected_output_tokens: Assumed completion length used for the
                cost estimate.

        Returns:
            ``{"model": <name>, "estimated_cost": <float>}`` for the model
            that was actually chosen.
        """
        model_name = self.select_model(prompt, quality_required)
        # ~1.3 tokens per whitespace-separated word is a rough heuristic.
        estimated_tokens = len(prompt.split()) * 1.3
        cost = self.estimate_cost(model_name, estimated_tokens, expected_output_tokens)

        if self.current_spend + cost > self.budget_limit:
            # Over budget: downgrade to the cheapest model and re-estimate so
            # the tracked spend and reported cost match the model returned.
            model_name = self._cheapest(self.models.values()).name
            cost = self.estimate_cost(model_name, estimated_tokens, expected_output_tokens)

        self.current_spend += cost
        return {"model": model_name, "estimated_cost": cost}

# Usage: build a router over three example models and route one short prompt.
router = CostAwareRouter([
    ModelConfig(
        name="gpt-3.5-turbo",
        cost_per_1k_input=0.0015,
        cost_per_1k_output=0.002,
        max_tokens=4096,
        quality_score=0.7,
        speed_score=0.9,
    ),
    ModelConfig(
        name="gpt-4",
        cost_per_1k_input=0.03,
        cost_per_1k_output=0.06,
        max_tokens=8192,
        quality_score=0.95,
        speed_score=0.5,
    ),
    ModelConfig(
        name="claude-haiku",
        cost_per_1k_input=0.0025,
        cost_per_1k_output=0.0125,
        max_tokens=200000,
        quality_score=0.75,
        speed_score=0.85,
    ),
])
# `result` is a dict with the chosen model name and its estimated cost.
result = router.route("What is 2+2?")

Request Batching

import asyncio
from collections import defaultdict
from typing import List, Callable
import time

class RequestBatcher:
    """Collect individual requests and send them to the LLM as one combined prompt.

    A batch is flushed as soon as ``max_batch_size`` requests are pending, or
    after a request has waited ``max_wait_ms`` milliseconds, whichever comes
    first.

    Args:
        llm_call: Async callable taking the combined prompt string and
            returning an iterable of responses, one per prompt, in order.
        max_batch_size: Maximum number of requests per LLM call (>= 1).
        max_wait_ms: How long a request waits for companions before flushing.

    Raises:
        ValueError: If ``max_batch_size`` is less than 1.
    """

    def __init__(self, llm_call: Callable, max_batch_size: int = 10,
                 max_wait_ms: int = 100):
        if max_batch_size < 1:
            # A non-positive size would slice out empty batches and leave
            # every pending future unresolved forever.
            raise ValueError("max_batch_size must be at least 1")
        self.llm_call = llm_call
        self.max_batch_size = max_batch_size
        self.max_wait_ms = max_wait_ms
        self.pending = []

    async def add_request(self, request: dict) -> dict:
        """Queue ``request`` (a dict with a ``"prompt"`` key) and await its response.

        Returns:
            The response that ``llm_call`` produced for this request.

        Raises:
            Exception: Whatever the batch failed with, including a
                ``RuntimeError`` when the LLM returned too few responses.
        """
        # create_future() ties the future to the loop that is actually running.
        future = asyncio.get_running_loop().create_future()
        self.pending.append({"request": request, "future": future})

        if len(self.pending) >= self.max_batch_size:
            await self._process_batch()
        else:
            await asyncio.sleep(self.max_wait_ms / 1000)
            # Another request may already have flushed the queue while we slept.
            if self.pending:
                await self._process_batch()

        return await future

    async def _process_batch(self):
        """Send up to ``max_batch_size`` pending requests and resolve their futures."""
        if not self.pending:
            return

        # Detach the batch before awaiting so concurrent callers start a new one.
        batch = self.pending[:self.max_batch_size]
        self.pending = self.pending[self.max_batch_size:]

        try:
            # Built inside the try so a malformed request fails the batch's
            # futures instead of escaping and stranding the other waiters.
            prompts = [item["request"]["prompt"] for item in batch]
            batch_prompt = "\n---\n".join(prompts)
            responses = list(await self.llm_call(batch_prompt))
        except Exception as e:
            # Boundary handler: the error is forwarded to every waiter.
            for item in batch:
                if not item["future"].done():
                    item["future"].set_exception(e)
            return

        for item, response in zip(batch, responses):
            # A waiter may have been cancelled while the call was in flight.
            if not item["future"].done():
                item["future"].set_result(response)

        if len(responses) < len(batch):
            # Fail the unmatched requests rather than leaving them hanging.
            shortfall = RuntimeError(
                f"LLM returned {len(responses)} responses for a batch of {len(batch)}"
            )
            for item in batch[len(responses):]:
                if not item["future"].done():
                    item["future"].set_exception(shortfall)

# Usage: adapt an LLM client to the batcher's one-string-in, list-out contract.
async def batch_llm_call(prompts: str) -> List[str]:
    """Send the combined prompt to ``llm`` and split its reply on ``---``.

    NOTE(review): ``llm`` is not defined in this file; it is assumed to be a
    client object exposing an async ``ainvoke`` method - confirm.
    """
    reply = await llm.ainvoke(prompts)
    text = reply.content
    # The pieces are returned exactly as split; surrounding whitespace is kept.
    return text.split("---")

batcher = RequestBatcher(batch_llm_call)

Cost Monitoring Dashboard

import json
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional

@dataclass
class UsageRecord:
    """One logged LLM call, as stored by CostMonitor."""

    # ISO-8601 string; CostMonitor.record_usage fills it from
    # datetime.now().isoformat() and filters on its date/month prefix.
    timestamp: str
    # Name of the model that served the call; used as the breakdown key.
    model: str
    # Tokens sent to the model.
    input_tokens: int
    # Tokens produced by the model.
    output_tokens: int
    # Cost of the call as supplied by the caller (not recomputed here).
    cost: float
    # Caller-supplied user identifier; record_usage defaults it to "default".
    user_id: str

class CostMonitor:
    """Accumulate per-call usage records and report spend against budgets.

    Args:
        daily_budget: Spend ceiling for a single calendar day.
        monthly_budget: Spend ceiling for a single calendar month.
    """

    def __init__(self, daily_budget: float = 50.0, monthly_budget: float = 1000.0):
        self.records: List[UsageRecord] = []
        self.daily_budget = daily_budget
        self.monthly_budget = monthly_budget

    def record_usage(self, model: str, input_tokens: int, output_tokens: int,
                     cost: float, user_id: str = "default"):
        """Append a usage record stamped with the current local time and return it."""
        record = UsageRecord(
            timestamp=datetime.now().isoformat(),
            model=model,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cost=cost,
            user_id=user_id
        )
        self.records.append(record)
        return record

    def get_daily_cost(self, date: Optional[str] = None) -> float:
        """Return total cost for ``date`` (``YYYY-MM-DD``); defaults to today."""
        date = date or datetime.now().strftime("%Y-%m-%d")
        # Timestamps are ISO strings, so a date is a plain string prefix.
        return sum(r.cost for r in self.records
                   if r.timestamp.startswith(date))

    def get_monthly_cost(self) -> float:
        """Return total cost of records stamped in the current calendar month."""
        month = datetime.now().strftime("%Y-%m")
        return sum(r.cost for r in self.records
                   if r.timestamp.startswith(month))

    def check_budget(self) -> dict:
        """Summarise today's and this month's spend against the budgets.

        Returns:
            A dict with ``daily_remaining``, ``monthly_remaining`` (never
            negative), ``daily_pct``, ``monthly_pct`` (0 when the budget is
            0) and ``alert`` (True once either spend exceeds 90% of budget).
        """
        daily = self.get_daily_cost()
        monthly = self.get_monthly_cost()
        return {
            "daily_remaining": max(0, self.daily_budget - daily),
            "monthly_remaining": max(0, self.monthly_budget - monthly),
            "daily_pct": (daily / self.daily_budget * 100) if self.daily_budget else 0,
            "monthly_pct": (monthly / self.monthly_budget * 100) if self.monthly_budget else 0,
            "alert": daily > self.daily_budget * 0.9 or monthly > self.monthly_budget * 0.9
        }

    def get_breakdown(self) -> Dict:
        """Return per-model totals: ``{model: {"count", "cost", "tokens"}}``."""
        breakdown: Dict[str, dict] = {}
        for r in self.records:
            # setdefault keeps this method free of any extra import.
            entry = breakdown.setdefault(r.model, {"count": 0, "cost": 0, "tokens": 0})
            entry["count"] += 1
            entry["cost"] += r.cost
            entry["tokens"] += r.input_tokens + r.output_tokens
        return breakdown

    def export_report(self, filename: str = "cost_report.json"):
        """Write a JSON report (summary, breakdown, last 100 records) to ``filename``."""
        report = {
            "summary": {
                "total_records": len(self.records),
                "total_cost": sum(r.cost for r in self.records),
                "budget_status": self.check_budget()
            },
            "breakdown": self.get_breakdown(),
            "recent_records": [vars(r) for r in self.records[-100:]]
        }
        # Explicit encoding so the report does not depend on the platform default.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)

Prompt Optimization for Cost

class PromptOptimizer:
    """Shrink prompts by removing filler phrases and redundant whitespace."""

    def __init__(self):
        # Each rule is a regex plus its replacement, applied in order and
        # case-insensitively. \b keeps a filler word from matching inside a
        # longer word (e.g. "please" inside "displease").
        self.optimization_rules = [
            {"pattern": r"\bplease\s+", "replacement": ""},
            {"pattern": r"\bcould\s+you\s+", "replacement": ""},
            {"pattern": r"\bi\s+would\s+like\s+you\s+to\s+", "replacement": ""},
            {"pattern": r"\s+", "replacement": " "}
        ]

    def compress_prompt(self, prompt: str) -> str:
        """Return ``prompt`` with every optimization rule applied, then stripped."""
        # Function-scope import keeps this class usable when copied on its own.
        import re

        optimized = prompt
        for rule in self.optimization_rules:
            # IGNORECASE so sentence-initial "Please" / "I would ..." also match.
            optimized = re.sub(rule["pattern"], rule["replacement"], optimized,
                               flags=re.IGNORECASE)
        return optimized.strip()

    def extract_system_prompt(self, messages: list) -> tuple:
        """Split chat messages into ``(system_text, other_messages)``.

        If several messages have role ``"system"``, the content of the last
        one is returned. All other messages are kept in their original order.
        """
        system = ""
        user = []
        for msg in messages:
            if msg["role"] == "system":
                system = msg["content"]
            else:
                user.append(msg)
        return system, user

    def estimate_savings(self, original: str, optimized: str,
                         cost_per_token: float = 0.00003) -> dict:
        """Estimate token and cost savings between two versions of a prompt.

        Token counts use a rough heuristic of 1.3 tokens per
        whitespace-separated word.

        Args:
            original: The prompt before optimization.
            optimized: The prompt after optimization.
            cost_per_token: Price of a single token used for the cost figure.

        Returns:
            A dict with ``original_tokens``, ``optimized_tokens``,
            ``savings_pct`` (0 for an empty original) and
            ``estimated_cost_savings``.
        """
        orig_tokens = len(original.split()) * 1.3
        opt_tokens = len(optimized.split()) * 1.3
        saved = orig_tokens - opt_tokens
        return {
            "original_tokens": int(orig_tokens),
            "optimized_tokens": int(opt_tokens),
            "savings_pct": (saved / orig_tokens * 100) if orig_tokens else 0,
            "estimated_cost_savings": saved * cost_per_token
        }

Key Takeaways

- Model routing selects the cheapest model that meets quality requirements
- Semantic caching eliminates redundant API calls
- Request batching reduces per-request overhead
- Cost monitoring prevents budget overruns
- Prompt optimization reduces token usage without losing effectiveness

LLM Cost Optimization

LLM Cost Optimization

Intelligent Model Router

Request Batching

Cost Monitoring Dashboard

Prompt Optimization for Cost

Key Takeaways

Premium Content

Need Expert Generative AI Help?