RAG (Retrieval-Augmented Generation) Pattern
Build knowledge-powered AI systems that retrieve relevant context before generating responses.
Overview#
RAG combines retrieval systems with generative AI to answer questions using your own data. It retrieves relevant documents from a knowledge base and provides them as context for the AI to generate accurate, grounded responses.
When to use:
- Knowledge bases and documentation search
- Document Q&A systems
- Support chatbots with company-specific knowledge
- Search result enhancement
- Any AI application requiring factual grounding
Key features:
- Document ingestion and chunking
- Vector similarity search
- Context-aware generation
- Source attribution
- Streaming responses
- Hybrid search (vector + keyword)
Code Example#
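All of the examples in this section assume PostgreSQL with the pgvector extension and a DocumentChunk table whose embedding column is a pgvector vector. The schema itself is not part of this pattern, but a minimal setup sketch is shown below: the table and column names are assumptions that mirror the code in this guide, and the 1536 dimension matches the default output size of text-embedding-3-small. In a real project this SQL would normally live in a Prisma migration rather than a script.
// scripts/setup-pgvector.ts -- hypothetical one-off setup script (illustration only;
// in practice this SQL belongs in a Prisma migration)
import { prisma } from '@/lib/db'

async function setup() {
  // Enable the pgvector extension
  await prisma.$executeRawUnsafe(`CREATE EXTENSION IF NOT EXISTS vector`)

  // Chunk table the ingestion code writes to; ids are assumed to be
  // generated by the application, and 1536 matches text-embedding-3-small
  await prisma.$executeRawUnsafe(`
    CREATE TABLE IF NOT EXISTS "DocumentChunk" (
      id           text PRIMARY KEY,
      "documentId" text NOT NULL,
      content      text NOT NULL,
      embedding    vector(1536),
      metadata     jsonb
    )
  `)

  // Approximate nearest-neighbour index so ORDER BY embedding <=> ... stays fast
  await prisma.$executeRawUnsafe(`
    CREATE INDEX IF NOT EXISTS document_chunk_embedding_idx
    ON "DocumentChunk" USING hnsw (embedding vector_cosine_ops)
  `)
}

setup().finally(() => prisma.$disconnect())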
Document Ingestion#
// lib/rag/ingest.ts
import { OpenAI } from 'openai'
import { prisma } from '@/lib/db'

const openai = new OpenAI()

interface Document {
  id: string
  content: string
  metadata?: Record<string, any>
}

export async function ingestDocument(doc: Document) {
  // Split into chunks
  const chunks = splitIntoChunks(doc.content, 500)

  // Generate embeddings for each chunk
  const embeddingsResponse = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: chunks
  })

  // Store chunks with embeddings
  await prisma.$transaction(
    chunks.map((chunk, i) =>
      prisma.documentChunk.create({
        data: {
          documentId: doc.id,
          content: chunk,
          embedding: embeddingsResponse.data[i].embedding,
          metadata: doc.metadata
        }
      })
    )
  )
}

function splitIntoChunks(text: string, maxChunkSize: number): string[] {
  const sentences = text.split(/(?<=[.!?])\s+/)
  const chunks: string[] = []
  let currentChunk = ''

  for (const sentence of sentences) {
    if ((currentChunk + sentence).length > maxChunkSize && currentChunk) {
      chunks.push(currentChunk.trim())
      currentChunk = ''
    }
    currentChunk += sentence + ' '
  }

  if (currentChunk.trim()) {
    chunks.push(currentChunk.trim())
  }

  return chunks
}
Vector Search#
// lib/rag/search.ts
import { OpenAI } from 'openai'
import { prisma } from '@/lib/db'

const openai = new OpenAI()

export async function searchSimilar(query: string, limit = 5) {
  // Generate query embedding
  const embeddingResponse = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: query
  })

  const queryEmbedding = embeddingResponse.data[0].embedding

  // Search using pgvector
  const results = await prisma.$queryRaw<Array<{
    id: string
    content: string
    similarity: number
  }>>`
    SELECT
      id,
      content,
      1 - (embedding <=> ${queryEmbedding}::vector) as similarity
    FROM "DocumentChunk"
    ORDER BY embedding <=> ${queryEmbedding}::vector
    LIMIT ${limit}
  `

  return results
}
RAG Query#
// lib/rag/query.ts
import Anthropic from '@anthropic-ai/sdk'
import { searchSimilar } from './search'

const client = new Anthropic()

export async function ragQuery(question: string) {
  // Retrieve relevant chunks
  const relevantChunks = await searchSimilar(question, 5)

  // Build context
  const context = relevantChunks
    .map(chunk => chunk.content)
    .join('\n\n---\n\n')

  // Generate answer
  const response = await client.messages.create({
    model: 'claude-sonnet-4-20250514',
    max_tokens: 1024,
    system: `You are a helpful assistant that answers questions based on the provided context.
If the context doesn't contain relevant information, say so.
Always cite which parts of the context you're using.`,
    messages: [
      {
        role: 'user',
        content: `Context:
${context}

Question: ${question}

Answer based on the context above:`
      }
    ]
  })

  const textBlock = response.content.find(b => b.type === 'text')

  return {
    answer: textBlock?.text ?? '',
    sources: relevantChunks.map(c => ({
      content: c.content.slice(0, 200),
      similarity: c.similarity
    }))
  }
}
Streaming RAG#
// lib/rag/streaming.ts
import Anthropic from '@anthropic-ai/sdk'
import { searchSimilar } from './search'

const client = new Anthropic()

export async function* streamingRagQuery(question: string) {
  // Yield search status
  yield { type: 'status', message: 'Searching knowledge base...' }

  const relevantChunks = await searchSimilar(question, 5)

  yield {
    type: 'sources',
    sources: relevantChunks.map(c => ({
      content: c.content.slice(0, 100),
      similarity: c.similarity
    }))
  }

  const context = relevantChunks.map(c => c.content).join('\n\n---\n\n')

  yield { type: 'status', message: 'Generating answer...' }

  const stream = client.messages.stream({
    model: 'claude-sonnet-4-20250514',
    max_tokens: 1024,
    system: `Answer questions based on the provided context.`,
    messages: [
      {
        role: 'user',
        content: `Context:\n${context}\n\nQuestion: ${question}`
      }
    ]
  })

  for await (const event of stream) {
    if (event.type === 'content_block_delta' && event.delta.type === 'text_delta') {
      yield { type: 'text', content: event.delta.text }
    }
  }

  yield { type: 'done' }
}
Hybrid Search#
Combine vector and keyword search for better results:
// lib/rag/hybrid-search.ts
import { searchSimilar } from './search'
import { prisma } from '@/lib/db'

export async function hybridSearch(query: string, limit = 5) {
  // Vector search
  const vectorResults = await searchSimilar(query, limit * 2)

  // Keyword search
  const keywordResults = await prisma.documentChunk.findMany({
    where: {
      content: {
        contains: query,
        mode: 'insensitive'
      }
    },
    take: limit * 2
  })

  // Combine and deduplicate
  const allResults = new Map()

  vectorResults.forEach((r, i) => {
    allResults.set(r.id, {
      ...r,
      vectorRank: i,
      keywordRank: Infinity
    })
  })

  keywordResults.forEach((r, i) => {
    if (allResults.has(r.id)) {
      allResults.get(r.id).keywordRank = i
    } else {
      allResults.set(r.id, {
        ...r,
        vectorRank: Infinity,
        keywordRank: i,
        similarity: 0
      })
    }
  })

  // RRF (Reciprocal Rank Fusion) scoring
  const k = 60
  const scoredResults = Array.from(allResults.values()).map(r => ({
    ...r,
    rrfScore: 1 / (k + r.vectorRank) + 1 / (k + r.keywordRank)
  }))

  return scoredResults
    .sort((a, b) => b.rrfScore - a.rrfScore)
    .slice(0, limit)
}
Usage Instructions#
- Ingest documents: Split documents into chunks and generate embeddings
- Store in vector database: Save chunks with embeddings in pgvector
- Handle queries: Generate query embedding and find similar chunks
- Build context: Concatenate relevant chunks for the AI
- Generate response: Pass context to the LLM with clear instructions
- Return sources: Include retrieved chunks for transparency (an end-to-end sketch follows this list)
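Wiring these steps together in an application is mostly plumbing. As a concrete illustration, a minimal Next.js route handler exposing ragQuery might look like the sketch below; the route path and request shape are assumptions, not part of the pattern.
// app/api/ask/route.ts -- hypothetical endpoint; adjust path and validation to your app
import { NextResponse } from 'next/server'
import { ragQuery } from '@/lib/rag/query'

export async function POST(request: Request) {
  const { question } = await request.json()

  if (typeof question !== 'string' || question.trim() === '') {
    return NextResponse.json({ error: 'question is required' }, { status: 400 })
  }

  // ragQuery handles retrieval, context building, and generation; the response
  // carries both the answer and the chunks it was grounded in.
  const { answer, sources } = await ragQuery(question)
  return NextResponse.json({ answer, sources })
}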
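If you use the streaming variant, the generator's events can be forwarded to the browser as server-sent events so that status updates and sources arrive before the answer text. A hedged sketch, with the endpoint path and event framing as assumptions and input validation omitted:
// app/api/ask/stream/route.ts -- hypothetical SSE endpoint
import { streamingRagQuery } from '@/lib/rag/streaming'

export async function POST(request: Request) {
  const { question } = await request.json()
  const encoder = new TextEncoder()

  // Forward each generator event (status, sources, text, done) as an SSE frame
  const stream = new ReadableStream({
    async start(controller) {
      for await (const event of streamingRagQuery(question)) {
        controller.enqueue(encoder.encode(`data: ${JSON.stringify(event)}\n\n`))
      }
      controller.close()
    }
  })

  return new Response(stream, {
    headers: {
      'Content-Type': 'text/event-stream',
      'Cache-Control': 'no-cache'
    }
  })
}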
Best Practices#
- Chunk wisely - Use sentence boundaries, aim for 300-800 tokens per chunk
- Add overlap - Include 10-20% overlap between chunks (sketched after this list)
- Include metadata - Store source, page numbers, dates for attribution
- Filter by relevance - Set a minimum similarity threshold, e.g. 0.7 (sketched after this list)
- Use hybrid search - Combine vector and keyword search for better recall
- Limit context - Keep context within model limits (aim for 3-5 chunks)
- Stream responses - Show sources first, then stream the answer
- Handle edge cases - Respond gracefully when no relevant content is found
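For the overlap recommendation, one way to extend the simple splitIntoChunks helper from the ingestion example is to seed each new chunk with the trailing sentences of the previous one. A sketch; the 500/80 character budgets are illustrative, not tuned values, and sentences longer than the budgets are carried as-is, just like in the original helper.
// lib/rag/chunking.ts -- hypothetical variant of splitIntoChunks with overlap
function splitIntoChunksWithOverlap(
  text: string,
  maxChunkSize = 500,
  overlapSize = 80
): string[] {
  const sentences = text.split(/(?<=[.!?])\s+/)
  const chunks: string[] = []
  let current: string[] = []
  let currentLength = 0

  for (const sentence of sentences) {
    if (currentLength + sentence.length > maxChunkSize && current.length > 0) {
      chunks.push(current.join(' '))

      // Seed the next chunk with the trailing sentences of this one,
      // up to roughly `overlapSize` characters
      const overlap: string[] = []
      let overlapLength = 0
      for (let i = current.length - 1; i >= 0; i--) {
        if (overlap.length > 0 && overlapLength + current[i].length > overlapSize) break
        overlap.unshift(current[i])
        overlapLength += current[i].length
      }
      current = overlap
      currentLength = overlapLength
    }
    current.push(sentence)
    currentLength += sentence.length
  }

  if (current.length > 0) {
    chunks.push(current.join(' '))
  }

  return chunks
}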
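For relevance filtering, a thin wrapper around searchSimilar can drop weak matches before any context is built, which also makes the "no relevant content" case explicit. A sketch, assuming the cosine-similarity scores returned by the search example and an illustrative 0.7 cutoff:
// lib/rag/filter.ts -- hypothetical helper; the threshold is an illustrative value
import { searchSimilar } from './search'

const MIN_SIMILARITY = 0.7

export async function searchRelevant(query: string, limit = 5) {
  const results = await searchSimilar(query, limit)

  // An empty result signals the caller to answer "I don't know" rather than
  // letting the model improvise from weak context.
  return results.filter(r => r.similarity >= MIN_SIMILARITY)
}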
Related Patterns#
- Embeddings - Vector embedding fundamentals
- Streaming - Real-time AI response streaming
- Full-Text Search - PostgreSQL text search