Advanced RAG Techniques
Advanced RAG techniques go beyond basic retrieval-augmented generation by incorporating self-reflection, corrective mechanisms, and adaptive strategies to significantly improve answer quality.
HyDE (Hypothetical Document Embeddings)
HyDE first asks the LLM to write a hypothetical answer to the query, then uses the embedding of that answer (rather than of the query itself) to find real documents:
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
class HyDERetriever:
    """Retrieve documents by embedding an LLM-written hypothetical answer.

    Rather than embedding the raw query, a passage answering the query is
    generated with the LLM and the embedding of that passage drives the
    vector search.
    """

    def __init__(self, vector_store, llm, embeddings):
        """Keep references to the collaborators used during retrieval.

        Args:
            vector_store: Object exposing ``similarity_search_by_vector``.
            llm: Model that can be piped after a ``ChatPromptTemplate``; the
                result of invoking the chain must expose ``.content``.
            embeddings: Object exposing ``embed_query``.
        """
        self.embeddings = embeddings
        self.llm = llm
        self.vector_store = vector_store

    def generate_hypothetical_doc(self, query: str) -> str:
        """Return an LLM-written passage that answers ``query``."""
        template = (
            "Please write a passage that answers this question:\n"
            "Question: {query}\n"
            "Passage:"
        )
        pipeline = ChatPromptTemplate.from_template(template) | self.llm
        return pipeline.invoke({"query": query}).content

    def retrieve(self, query: str, k: int = 5):
        """Return the ``k`` stored documents closest to the hypothetical answer.

        Args:
            query: The user question.
            k: Number of results requested from the vector store.
        """
        passage = self.generate_hypothetical_doc(query)
        vector = self.embeddings.embed_query(passage)
        return self.vector_store.similarity_search_by_vector(vector, k=k)
# Usage: `vector_store`, `llm`, and `embeddings` must already be defined.
retriever = HyDERetriever(
    vector_store=vector_store,
    llm=llm,
    embeddings=embeddings,
)
docs = retriever.retrieve("How does quantum computing work?")
Self-RAG
Self-RAG uses reflection tokens to decide whether retrieval is needed at all and, when it is, to evaluate how relevant each retrieved document is:
from enum import Enum
from dataclasses import dataclass
class ReflectionToken(Enum):
    """Bracketed marker strings for the Self-RAG reflection steps."""

    # Marker for the "is retrieval needed?" decision.
    RETRIEVE = "[Retrieval]"
    # Presumably marks the relevance judgement of a retrieved document.
    REL = "[Relevance]"
    # Presumably marks whether the answer is supported by the documents.
    SUP = "[Support]"
    # Presumably marks the overall usefulness of the answer.
    USE = "[Utility]"
@dataclass
class SelfRAGOutput:
    """Result bundle for one Self-RAG generation pass."""

    # Generated answer text.
    answer: str
    # Documents associated with the answer (empty when nothing was retrieved).
    retrieved_docs: list
    # Reflection tokens recorded for this pass.
    reflection_tokens: list
    # Flag stating whether the answer is considered useful.
    is_useful: bool
class SelfRAG:
def __init__(self, llm, retriever, threshold_rel=0.5):
self.llm = llm
self.retriever = retriever
self.threshold_rel = threshold_rel
def should_retrieve(self, query: str) -> bool:
prompt = f"""Given the query, determine if retrieval is needed.
Query: {query}
Answer with [Retrieval] if needed, [No Retrieval] if not."""
response = self.llm.invoke(prompt)
return "[Retrieval]" in response.content
def assess_relevance(self, query: str, doc: str) -> float:
prompt = f"""Rate relevance of document to query (0-1).
Query: {query}
Document: {doc}
Score:"""
score = float(self.llm.invoke(prompt).content)
return score
def generate(self, query: str) -> SelfRAGOutput:
if not self.should_retrieve(query):
answer = self.llm.invoke(query).content
return SelfRAGOutput(answer, [], [], True)
docs = self.retriever.get_relevant_documents(query)
filtered = [
d for d in docs
if self.assess_relevance(query, d.page_content) >= self.threshold_rel
]
context = "\n".join([d.page_content for d in filtered[:3]])
answer = self.llm.invoke(
f"Context: {context}\nQuestion: {query}"
).content
return SelfRAGOutput(answer, filtered, [], True)
CRAG (Corrective RAG)
CRAG evaluates retrieval quality and takes corrective actions:
class CRAG:
    """Corrective RAG: grade the retrieval, then refine, decompose, or fall back.

    The LLM labels the retrieved documents Correct / Ambiguous / Incorrect:
    Correct keeps the individually relevant documents, Ambiguous re-searches
    with decomposed sub-queries, and anything else uses the knowledge store's
    web search.
    """

    def __init__(self, llm, retriever, knowledge_store):
        """Store collaborators.

        Args:
            llm: Object whose ``invoke(prompt)`` result exposes ``.content``.
            retriever: Object exposing ``get_relevant_documents(query)``.
            knowledge_store: Object exposing ``search_web(query)``.
        """
        self.llm = llm
        self.retriever = retriever
        self.knowledge_store = knowledge_store

    def evaluate_retrieval(self, query: str, docs: list) -> str:
        """Return the LLM's raw assessment of ``docs`` for ``query``."""
        # Only the first 200 characters of each document are shown to the LLM.
        previews = [d.page_content[:200] for d in docs]
        prompt = (
            "Evaluate if documents are relevant (Correct/Ambiguous/Incorrect).\n"
            f"Query: {query}\n"
            f"Documents: {previews}\n"
            "Assessment:"
        )
        return self.llm.invoke(prompt).content

    def decompose_and_search(self, query: str) -> list:
        """Retrieve documents for up to three LLM-generated sub-queries."""
        reply = self.llm.invoke(f"Decompose into sub-queries: {query}").content
        # Drop blank lines so empty strings are never sent to the retriever.
        sub_queries = [line.strip() for line in reply.splitlines() if line.strip()]
        all_docs = []
        for sub_query in sub_queries[:3]:
            all_docs.extend(self.retriever.get_relevant_documents(sub_query))
        return all_docs

    def _verdict(self, assessment: str) -> str:
        """Map a free-text assessment to 'correct', 'ambiguous' or 'incorrect'."""
        import re

        # Whole-word, case-insensitive match; the first label mentioned wins.
        # Unrecognised replies are treated as 'incorrect' (the fallback path).
        match = re.search(
            r"\b(correct|ambiguous|incorrect)\b", assessment, re.IGNORECASE
        )
        return match.group(1).lower() if match else "incorrect"

    def _is_relevant(self, doc, query: str) -> bool:
        """Ask the LLM whether a single document is relevant to ``query``."""
        prompt = (
            "Is this document relevant to the query? Answer Yes or No.\n"
            f"Query: {query}\n"
            f"Document: {doc.page_content}\n"
            "Answer:"
        )
        reply = self.llm.invoke(prompt).content
        return reply.strip().lower().startswith("yes")

    def retrieve_and_correct(self, query: str) -> str:
        """Answer ``query`` from context chosen according to retrieval quality.

        Returns:
            The LLM's answer text, generated from at most five documents.
        """
        docs = self.retriever.get_relevant_documents(query)
        verdict = self._verdict(self.evaluate_retrieval(query, docs))

        if verdict == "correct":
            # One grading call per document; if grading rejects everything,
            # keep the original set since it was judged Correct as a whole.
            filtered = [d for d in docs if self._is_relevant(d, query)] or docs
        elif verdict == "ambiguous":
            filtered = self.decompose_and_search(query)
        else:
            filtered = self.knowledge_store.search_web(query)

        context = "\n".join(d.page_content for d in filtered[:5])
        prompt = (
            "Answer using only this context:\n"
            f"{context}\n"
            f"Question: {query}"
        )
        return self.llm.invoke(prompt).content
Adaptive Query Expansion
class AdaptiveQueryExpansion:
    """Expand a query into several search queries using an LLM."""

    def __init__(self, llm):
        """Store the LLM; its ``invoke(prompt)`` result must expose ``.content``."""
        self.llm = llm

    def _ask_lines(self, prompt: str) -> list:
        """Invoke the LLM and return its non-blank reply lines, stripped."""
        reply = self.llm.invoke(prompt).content
        return [line.strip() for line in reply.splitlines() if line.strip()]

    def expand_query(self, query: str, strategy: str = "hyde") -> list:
        """Return a list of queries derived from ``query``.

        Args:
            query: The original user query.
            strategy: ``"multi_query"`` (original plus up to 3 rephrasings),
                ``"step_back"`` (original plus one broader question), or
                ``"sub_query"`` (up to 4 sub-questions). Any other value,
                including the default ``"hyde"``, returns ``[query]`` unchanged.

        Returns:
            A list of query strings. Blank lines in the LLM reply are dropped
            so that no empty query is ever returned.
        """
        if strategy == "multi_query":
            prompt = (
                "Generate 3 different versions of this query:\n"
                f"{query}\n"
                "Return each on a new line."
            )
            return [query] + self._ask_lines(prompt)[:3]

        if strategy == "step_back":
            prompt = f"Generate a more general, broader question:\n{query}"
            return [query, self.llm.invoke(prompt).content.strip()]

        if strategy == "sub_query":
            prompt = f"Break into 2-3 sub-questions:\n{query}"
            # Fall back to the original query if the LLM returns nothing usable.
            return self._ask_lines(prompt)[:4] or [query]

        return [query]
Reranking with Cross-Encoders
from sentence_transformers import CrossEncoder
class Reranker:
    """Re-order retrieved documents using a cross-encoder's scores."""

    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
        """Load the cross-encoder identified by ``model_name``."""
        self.model = CrossEncoder(model_name)

    def rerank(self, query: str, documents: list, top_k: int = 3) -> list:
        """Return the ``top_k`` documents with the highest model scores.

        Args:
            query: The user query each document is scored against.
            documents: Objects exposing ``page_content``.
            top_k: Maximum number of documents to return.

        Returns:
            Documents ordered from highest to lowest score; an empty list
            when ``documents`` is empty (the model is not called in that case).
        """
        if not documents:
            # Avoid handing an empty batch to the model.
            return []

        pairs = [(query, doc.page_content) for doc in documents]
        scores = self.model.predict(pairs)
        # sorted() is stable, so equally scored documents keep their input order.
        ranked = sorted(
            zip(documents, scores), key=lambda pair: pair[1], reverse=True
        )
        return [doc for doc, _score in ranked[:top_k]]
Key Takeaways
- HyDE improves retrieval by matching on hypothetical document embeddings
- Self-RAG adds reflection tokens for adaptive retrieval decisions
- CRAG handles poor retrievals with corrective actions: query decomposition for ambiguous results and a web search fallback for incorrect ones
- Reranking significantly improves precision of retrieved results
- Combine techniques for maximum effectiveness in production systems