AI-Powered Search
Traditional vs AI-Powered Search
Traditional keyword search relies on exact matching and statistical relevance (TF-IDF, BM25). AI-powered search understands semantic meaning, context, and user intent through dense vector representations.
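Semantic Search Pipeline
A semantic search pipeline has two stages. At index time, each document is converted into an embedding vector and stored in a vector index. At query time, the query is embedded with the same model and the nearest document vectors are returned as ranked results.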
Building Semantic Search with Python
Document Indexing
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np
class SemanticSearchEngine:
    """Semantic document search built on sentence embeddings and Chroma.

    Documents are embedded with a ``SentenceTransformer`` model and stored in
    a Chroma collection named ``"documents"`` that is configured for cosine
    distance (``hnsw:space`` = ``cosine``).
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the embedding model and create the backing collection.

        Args:
            model_name: Model name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        self.client = chromadb.Client()
        self.collection = self.client.create_collection(
            name="documents",
            metadata={"hnsw:space": "cosine"},
        )

    def index_documents(self, documents, metadatas=None, ids=None):
        """Embed ``documents`` and add them to the collection.

        Args:
            documents: Iterable of strings to index. An empty iterable is a
                no-op.
            metadatas: Optional list of metadata dicts, one per document.
                Defaults to ``{"source": "web"}`` for every document.
            ids: Optional list of ids, one per document. When omitted, ids of
                the form ``doc_<n>`` are generated, numbered from the current
                size of the collection so that repeated calls do not reuse
                ``doc_0``, ``doc_1``, ... from an earlier batch.
        """
        documents = list(documents)
        if not documents:
            # Nothing to embed or store.
            return

        embeddings = self.model.encode(documents).tolist()
        if ids is None:
            # Offset by the number of stored items; the first batch still
            # receives doc_0, doc_1, ... exactly as before.
            # NOTE(review): a generated id can still clash with an id the
            # caller supplied explicitly in an earlier call.
            first = self.collection.count()
            ids = [f"doc_{first + i}" for i in range(len(documents))]
        if metadatas is None:
            metadatas = [{"source": "web"} for _ in documents]
        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )

    def search(self, query, k=5):
        """Return up to ``k`` stored documents closest to ``query``.

        Args:
            query: Query text.
            k: Maximum number of results. It is capped at the number of
                stored documents.

        Returns:
            A list of dicts in the order returned by the collection, each
            with keys:

            * ``"document"``: the stored text.
            * ``"score"``: the distance reported by the collection. The
              collection is created with cosine space, so lower means more
              similar.
            * ``"similarity"``: ``1 - score``, a higher-is-better
              counterpart of ``"score"``.
            * ``"metadata"``: the stored metadata dict.

            An empty list is returned when nothing has been indexed.
        """
        available = self.collection.count()
        if available == 0:
            return []

        query_embedding = self.model.encode([query]).tolist()
        results = self.collection.query(
            query_embeddings=query_embedding,
            # Never ask for more neighbours than the collection holds.
            n_results=min(k, available),
        )
        # Each result field holds one list per query; only one query is sent.
        return [
            {
                "document": doc,
                "score": distance,
                "similarity": 1 - distance,
                "metadata": meta,
            }
            for doc, distance, meta in zip(
                results["documents"][0],
                results["distances"][0],
                results["metadatas"][0],
            )
        ]
# Build an engine and index a small demo corpus with default ids and metadata.
engine = SemanticSearchEngine()
engine.index_documents([
    "Machine learning is a subset of artificial intelligence",
    "Deep learning uses neural networks with many layers",
    "Natural language processing enables computers to understand text",
    "Computer vision allows machines to interpret images",
    "Reinforcement learning trains agents through rewards",
])

# The "score" field is the cosine distance reported by the collection, so
# smaller numbers mean closer matches; label the output accordingly.
results = engine.search("How do neural networks work?", k=3)
for r in results:
    print(f"Distance: {r['score']:.4f} | {r['document'][:60]}...")
Advanced Query Processing
import openai
from typing import List, Dict
class QueryProcessor:
    """LLM-backed query helpers: expansion, intent detection, entity extraction."""

    def __init__(self, api_key: str, model: str = "gpt-3.5-turbo"):
        """Create the OpenAI client.

        Args:
            api_key: Key passed to ``openai.OpenAI``.
            model: Chat model used for every request.
        """
        self.client = openai.OpenAI(api_key=api_key)
        self.model = model

    def _complete(self, system: str, user: str, **options) -> str:
        """Send one system+user exchange and return the reply text.

        Returns an empty string when the reply carries no text content, so
        callers never have to handle ``None``.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            **options,
        )
        return response.choices[0].message.content or ""

    def expand_query(self, query: str) -> List[str]:
        """Ask the model for variations of ``query``.

        Returns:
            One variation per non-empty reply line, with surrounding
            whitespace and leading list markers ("1.", "2)", "-", "*", "•")
            removed. The list is empty when the reply has no text.
        """
        import re

        reply = self._complete(
            "Generate 5 search query variations.",
            f"Original query: {query}",
            temperature=0.7,
        )
        # A marker must be followed by whitespace, so "3D printing" and
        # "1.5 inch" are left untouched.
        list_marker = re.compile(r"^\s*(?:[-*\u2022]|\d+[.)])\s+")
        variations = []
        for line in reply.splitlines():
            cleaned = list_marker.sub("", line).strip()
            if cleaned:
                variations.append(cleaned)
        return variations

    def detect_intent(self, query: str) -> str:
        """Classify ``query`` as informational, navigational or transactional.

        Returns:
            The matching lower-case label when the reply is one of the labels
            (ignoring case and surrounding punctuation) or mentions exactly
            one of them; otherwise the stripped reply text unchanged.
        """
        intents = ["informational", "navigational", "transactional"]
        reply = self._complete(
            f"Classify query intent: {', '.join(intents)}",
            query,
            temperature=0,
        ).strip()

        lowered = reply.lower()
        normalized = lowered.strip(" \t\r\n.!:;\"'`")
        if normalized in intents:
            return normalized
        # Accept a sentence-style answer only when it is unambiguous.
        mentioned = [intent for intent in intents if intent in lowered]
        if len(mentioned) == 1:
            return mentioned[0]
        return reply

    def extract_entities(self, query: str) -> Dict:
        """Extract named entities from ``query`` as a dict.

        Returns:
            The parsed JSON object from the reply, or ``{}`` when the reply
            has no text.

        Raises:
            json.JSONDecodeError: If the reply text is not valid JSON.
        """
        import json

        reply = self._complete(
            "Extract named entities as JSON.",
            query,
            temperature=0,
            response_format={"type": "json_object"},
        )
        if not reply.strip():
            return {}
        return json.loads(reply)
import os

# Read the key from the environment instead of committing it to source; the
# original placeholder is kept as the fallback value.
processor = QueryProcessor(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key"))

# Each call below sends one chat-completion request.
expanded = processor.expand_query("machine learning algorithms")
intent = processor.detect_intent("best Python courses")
entities = processor.extract_entities("TensorFlow by Google")
Hybrid Search Implementation
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
class HybridSearch:
    """Blend BM25 keyword scores with dense embedding scores.

    The final score of a document is
    ``alpha * sparse + (1 - alpha) * dense``, where both components are
    min-max scaled to ``[0, 1]`` over the indexed corpus.
    """

    def __init__(self, alpha=0.5):
        """Create an empty index.

        Args:
            alpha: Weight of the sparse (BM25) component; the dense component
                gets ``1 - alpha``.
        """
        self.alpha = alpha
        self.dense_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.sparse_model = None
        self.documents = []
        self.dense_embeddings = None

    def index(self, documents):
        """Replace the index with ``documents`` (an iterable of strings).

        An empty iterable clears the index instead of being handed to the
        BM25 and embedding models.
        """
        self.documents = list(documents)
        if not self.documents:
            self.sparse_model = None
            self.dense_embeddings = None
            return

        # Lower-cased whitespace tokens; search() tokenizes the same way.
        tokenized_docs = [doc.lower().split() for doc in self.documents]
        self.sparse_model = BM25Okapi(tokenized_docs)
        self.dense_embeddings = self.dense_model.encode(self.documents)

    @staticmethod
    def _min_max(scores):
        """Scale ``scores`` linearly to [0, 1]; all zeros when they are constant."""
        scores = np.asarray(scores, dtype=float)
        lowest = scores.min()
        span = scores.max() - lowest
        if span == 0:
            # Every document ties, so this component carries no ranking signal.
            return np.zeros_like(scores)
        return (scores - lowest) / span

    def search(self, query, k=5):
        """Return the ``k`` best documents for ``query``, best first.

        Args:
            query: Query text.
            k: Maximum number of results. Values below 1 yield an empty list.

        Returns:
            A list of ``{"document": str, "score": float}`` dicts sorted by
            descending hybrid score. Empty when nothing has been indexed.
        """
        if k <= 0 or self.sparse_model is None or not self.documents:
            return []

        sparse_scores = self.sparse_model.get_scores(query.lower().split())
        query_embedding = self.dense_model.encode([query])[0]
        # Dot product of every document vector with the query vector.
        dense_scores = np.dot(self.dense_embeddings, query_embedding)

        # Min-max scaling keeps the ordering intact even when every raw score
        # is negative, where dividing by the maximum would invert it.
        hybrid_scores = (
            self.alpha * self._min_max(sparse_scores)
            + (1 - self.alpha) * self._min_max(dense_scores)
        )

        # Sort descending and slice from the front: "[-k:]" would return the
        # whole array for k == 0.
        top_k_idx = np.argsort(-hybrid_scores, kind="stable")[:k]
        return [
            {"document": self.documents[i], "score": float(hybrid_scores[i])}
            for i in top_k_idx
        ]
# alpha=0.3 weights the BM25 score at 0.3 and the dense score at 0.7.
hybrid = HybridSearch(alpha=0.3)

# Index a five-document demo corpus.
hybrid.index(
    documents=[
        "Introduction to neural networks and deep learning",
        "Python programming for data science",
        "Machine learning algorithms explained",
        "Natural language processing with transformers",
        "Computer vision fundamentals",
    ],
)

# Uses the default k, so at most five results come back.
results = hybrid.search(query="deep learning tutorial")
Vector Database Comparison
| Database | Type | Performance | Scale | Use Case |
|---|---|---|---|---|
| Pinecone | Managed | High | 10B+ vectors | Production |
| Weaviate | Self-hosted | High | 1B+ vectors | Flexible |
| Qdrant | Self-hosted | Very High | 1B+ vectors | Performance |
| Chroma | Embedded | Medium | 10M vectors | Development |
| Milvus | Distributed | Very High | 10B+ vectors | Enterprise |
Search Quality Metrics
class SearchEvaluator:
    """Standard ranking-quality metrics: P@k, R@k, nDCG@k and MRR."""

    def __init__(self):
        # Not used by the metric methods; available for callers to store results.
        self.metrics = {}

    def precision_at_k(self, relevant: set, retrieved: list, k: int) -> float:
        """Return the fraction of the top ``k`` slots filled by relevant ids.

        The denominator is ``k`` itself, so a result list shorter than ``k``
        is penalised. Returns 0.0 when ``k`` is not positive.
        """
        if k <= 0:
            return 0.0
        hits = len(set(retrieved[:k]) & set(relevant))
        return hits / k

    def recall_at_k(self, relevant: set, retrieved: list, k: int) -> float:
        """Return the fraction of relevant ids found in the top ``k`` results.

        Returns 0.0 when there are no relevant ids or ``k`` is not positive.
        """
        relevant = set(relevant)
        if not relevant or k <= 0:
            return 0.0
        hits = len(set(retrieved[:k]) & relevant)
        return hits / len(relevant)

    def ndcg_at_k(self, relevance_scores: list, k: int) -> float:
        """Return nDCG@k for graded relevance scores given in ranked order.

        The ideal ordering is built from ``relevance_scores`` itself, i.e.
        from the retrieved items only. Returns 0.0 when ``k`` is not positive
        or the ideal DCG is zero.
        """
        if k <= 0:
            return 0.0
        # Rank i (0-based) is discounted by log2(i + 2).
        dcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(relevance_scores[:k]))
        ideal_relevance = sorted(relevance_scores, reverse=True)[:k]
        idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal_relevance))
        return float(dcg / idcg) if idcg > 0 else 0.0

    def mean_reciprocal_rank(self, queries_results: list) -> float:
        """Return the mean reciprocal rank over several queries.

        Args:
            queries_results: One ranked list per query; each item is a dict
                whose truthy ``"relevant"`` entry marks a relevant result.

        Returns:
            The mean of ``1 / rank`` of the first relevant result per query,
            counting 0 for queries without one. 0.0 when there are no queries.
        """
        rr_scores = []
        for results in queries_results:
            first_hit = next(
                (
                    rank
                    for rank, doc in enumerate(results, start=1)
                    if doc.get("relevant", False)
                ),
                None,
            )
            rr_scores.append(1 / first_hit if first_hit is not None else 0.0)
        if not rr_scores:
            return 0.0
        return sum(rr_scores) / len(rr_scores)
# Score one ranked result list against a known set of relevant ids.
evaluator = SearchEvaluator()
relevant = {"doc_1", "doc_3", "doc_5"}
retrieved = ["doc_1", "doc_2", "doc_3", "doc_4", "doc_5"]
# Three of the five retrieved ids are relevant, so P@5 is 3/5.
print(f"P@5: {evaluator.precision_at_k(relevant, retrieved, 5):.2f}")
# All three relevant ids appear in the top five, so R@5 is 3/3.
print(f"R@5: {evaluator.recall_at_k(relevant, retrieved, 5):.2f}")
Best Practices
- Choose embedding models trained on your domain
- Use hybrid search combining sparse and dense retrieval
- Implement proper chunking strategies for long documents
- Cache frequent queries to reduce latency
- Monitor search quality with relevance feedback
- Use query understanding to improve recall
- Implement re-ranking for precision optimization