3:I[4707,[],""]
6:I[6423,[],""]
8:I[2972,["972","static/chunks/972-5c59758923e28d42.js","202","static/chunks/app/%5Blocale%5D/guides/%5Bslug%5D/page-218cc45de949c3d3.js"],""]
4:["locale","es","d"]
5:["slug","rag-python-gdpr-document-search","d"]
0:["WrGklf9BQ3ZZUJK-OObp-",[[["",{"children":[["locale","es","d"],{"children":["guides",{"children":[["slug","rag-python-gdpr-document-search","d"],{"children":["__PAGE__?{\"locale\":\"es\",\"slug\":\"rag-python-gdpr-document-search\"}",{}]}]}]}]},"$undefined","$undefined",true],["",{"children":[["locale","es","d"],{"children":["guides",{"children":[["slug","rag-python-gdpr-document-search","d"],{"children":["__PAGE__",{},[["$L1","$L2",null],null],null]},[null,["$","$L3",null,{"parallelRouterKey":"children","segmentPath":["children","$4","children","guides","children","$5","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L6",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[null,["$","$L3",null,{"parallelRouterKey":"children","segmentPath":["children","$4","children","guides","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L6",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}]],null]},[[[["$","link","0",{"rel":"stylesheet","href":"/_next/static/css/e1c2344b7b83da59.css","precedence":"next","crossOrigin":"$undefined"}]],"$L7"],null],null]},[[null,["$","$L3",null,{"parallelRouterKey":"children","segmentPath":["children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L6",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":["$","html",null,{"lang":"en","children":["$","body",null,{"className":"min-h-screen bg-background text-foreground flex items-center justify-center","children":["$","div",null,{"className":"text-center space-y-6 px-4","children":[["$","h1",null,{"className":"text-6xl font-bold text-primary","children":"404"}],["$","p",null,{"className":"text-xl text-muted-foreground","children":"Page not 
found"}],["$","$L8",null,{"href":"/en","className":"inline-block px-6 py-3 bg-primary text-primary-foreground rounded-lg font-semibold hover:bg-primary/90 transition-colors","children":"Go to Homepage"}]]}]}]}],"notFoundStyles":[]}]],null],null],["$L9",null]]]]
e:I[346,["972","static/chunks/972-5c59758923e28d42.js","203","static/chunks/app/%5Blocale%5D/layout-eaa8629840a87bdd.js"],"ThemeProvider"]
f:I[6932,["972","static/chunks/972-5c59758923e28d42.js","203","static/chunks/app/%5Blocale%5D/layout-eaa8629840a87bdd.js"],"LocaleSwitcher"]
10:I[9783,["972","static/chunks/972-5c59758923e28d42.js","203","static/chunks/app/%5Blocale%5D/layout-eaa8629840a87bdd.js"],"MobileMenu"]
11:I[8003,["972","static/chunks/972-5c59758923e28d42.js","203","static/chunks/app/%5Blocale%5D/layout-eaa8629840a87bdd.js"],""]
a:T44a,{"@context":"https://schema.org","@type":"FAQPage","mainEntity":[{"@type":"Question","name":"Can I use Qdrant with EU-hosted inference?","acceptedAnswer":{"@type":"Answer","text":"Yes. Qdrant runs on your own infrastructure (or Qdrant Cloud in EU regions). The inference call to JuiceFactory only receives the retrieved context — the vector database itself never leaves your environment."}},{"@type":"Question","name":"How does this differ from using OpenAI directly?","acceptedAnswer":{"@type":"Answer","text":"Two lines of code change: base_url and api_key. The SDK, request format, and response format are identical. The difference is where the data goes — EU infrastructure with zero retention instead of US servers with 30-day retention."}},{"@type":"Question","name":"What embedding model should I use for EU-compliant RAG?","acceptedAnswer":{"@type":"Answer","text":"JuiceFactory offers Qwen3-Embed (2560 dimensions) hosted in Stockholm. It outperforms most 1024-dim models on retrieval benchmarks and processes your documents statelessly — no training on your data, no retention."}}]}b:Tdaf,# ingest.py
import fitz  # PyMuPDF
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
import config


def get_openai_client() -> OpenAI:
    """Build an OpenAI SDK client from the settings in ``config``.

    The key comes from ``config.API_KEY`` and the endpoint from
    ``config.API_BASE_URL`` (the Juice Factory EU API in this guide's setup).
    """
    client_kwargs = {
        "api_key": config.API_KEY,
        "base_url": config.API_BASE_URL,
    }
    return OpenAI(**client_kwargs)


def get_qdrant_client() -> QdrantClient:
    """Return a Qdrant client for the host and port set in ``config``."""
    host = config.QDRANT_HOST
    port = config.QDRANT_PORT
    return QdrantClient(host=host, port=port)


def extract_text_from_pdf(pdf_bytes: bytes) -> list[dict]:
    """Extract the plain text of every non-empty page of a PDF.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        One ``{"page": <1-based page number>, "text": <stripped text>}``
        dict per page that yields text. Pages whose extracted text is empty
        after stripping are omitted.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        pages = []
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text("text").strip()
            if text:
                pages.append({"page": page_num, "text": text})
        return pages
    finally:
        # Release the document even when a page fails to extract.
        doc.close()


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks measured in words.

    Words are the whitespace-separated tokens of ``text``. Each chunk holds
    up to ``chunk_size`` words and shares its first ``overlap`` words with
    the end of the previous chunk.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words repeated between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in reading order, each joined with single spaces. Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-advancing window would loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already reached the last word; another step would
            # only emit a tail that is fully contained in this chunk.
            break
    return chunks


def generate_embeddings(texts: list[str], client: OpenAI) -> list[list[float]]:
    """Embed a batch of texts with the model named in ``config``.

    All texts are sent in one embeddings request using
    ``config.EMBEDDING_MODEL``; the vectors are returned in the order the
    response lists them.
    """
    result = client.embeddings.create(
        model=config.EMBEDDING_MODEL,
        input=texts,
    )
    vectors = []
    for entry in result.data:
        vectors.append(entry.embedding)
    return vectors


def ensure_collection(qdrant: QdrantClient):
    """Create the configured Qdrant collection unless it is already listed.

    The collection is named ``config.COLLECTION_NAME`` and, when created,
    uses cosine distance with ``config.EMBEDDING_DIMENSIONS``-sized vectors.
    """
    existing = qdrant.get_collections().collections
    target = config.COLLECTION_NAME
    if any(entry.name == target for entry in existing):
        return

    vector_settings = VectorParams(
        size=config.EMBEDDING_DIMENSIONS,
        distance=Distance.COSINE,
    )
    qdrant.create_collection(
        collection_name=config.COLLECTION_NAME,
        vectors_config=vector_settings,
    )


def ingest_pdf(pdf_bytes: bytes, filename: str) -> int:
    """Run the full ingestion pipeline: PDF -> chunks -> embeddings -> Qdrant.

    Args:
        pdf_bytes: Raw contents of the PDF file.
        filename: Name stored in each point's payload alongside the chunk
            text and its page number.

    Returns:
        The number of points upserted, or 0 when the PDF yields no text.

    Raises:
        RuntimeError: If the embeddings call returns a different number of
            vectors than the number of chunks submitted.
    """
    openai_client = get_openai_client()
    qdrant = get_qdrant_client()
    ensure_collection(qdrant)

    # Extract text from PDF
    pages = extract_text_from_pdf(pdf_bytes)

    # Chunk all pages, remembering which page each chunk came from
    all_chunks = []
    for page_data in pages:
        page_chunks = chunk_text(
            page_data["text"],
            chunk_size=config.CHUNK_SIZE,
            overlap=config.CHUNK_OVERLAP,
        )
        all_chunks.extend(
            {"text": chunk, "page": page_data["page"], "filename": filename}
            for chunk in page_chunks
        )

    if not all_chunks:
        return 0

    # Generate embeddings (single batch)
    texts = [c["text"] for c in all_chunks]
    embeddings = generate_embeddings(texts, openai_client)

    # zip() would silently drop chunks on a short response, so fail loudly
    # instead of indexing only part of the document.
    if len(embeddings) != len(all_chunks):
        raise RuntimeError(
            f"Expected {len(all_chunks)} embeddings, got {len(embeddings)}"
        )

    # Store in Qdrant; each point gets a fresh random UUID as its id
    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "text": chunk["text"],
                "page": chunk["page"],
                "filename": chunk["filename"],
            },
        )
        for chunk, embedding in zip(all_chunks, embeddings)
    ]

    qdrant.upsert(
        collection_name=config.COLLECTION_NAME,
        points=points,
    )

    return len(points)
c:Tb5a,# search.py
from openai import OpenAI
from qdrant_client import QdrantClient
import config
from ingest import get_openai_client, get_qdrant_client, generate_embeddings


def search_documents(query: str, top_k: int | None = None) -> list[dict]:
    """Search the configured collection for chunks relevant to a query.

    Args:
        query: Text to embed and search with.
        top_k: Maximum number of hits to return. ``None`` (the default)
            falls back to ``config.TOP_K``.

    Returns:
        One dict per hit with the stored ``text``, ``page`` and ``filename``
        payload fields plus the hit's ``score``, in the order the search
        returned them.
    """
    if top_k is None:
        top_k = config.TOP_K

    openai_client = get_openai_client()
    qdrant = get_qdrant_client()

    # Embed the query
    query_embedding = generate_embeddings([query], openai_client)[0]

    # Search Qdrant
    results = qdrant.search(
        collection_name=config.COLLECTION_NAME,
        query_vector=query_embedding,
        limit=top_k,
    )

    return [
        {
            "text": hit.payload["text"],
            "page": hit.payload["page"],
            "filename": hit.payload["filename"],
            "score": hit.score,
        }
        for hit in results
    ]


def rag_query(question: str, top_k: int | None = None) -> dict:
    """Run the full RAG pipeline: embed query -> retrieve context -> answer.

    Args:
        question: The user's question.
        top_k: Maximum number of chunks to retrieve. ``None`` (the default)
            lets ``search_documents`` apply its own default.

    Returns:
        A dict with these keys:

        * ``answer``: the generated answer text, or a fixed message when no
          chunks were retrieved.
        * ``sources``: one dict per retrieved chunk with ``filename``,
          ``page``, ``score`` (rounded to 4 decimals) and ``excerpt`` (the
          first 200 characters plus ``"..."`` when the chunk is longer).
        * ``model``: the model name reported by the response, or ``None``
          when no chunks were retrieved.
        * ``usage``: prompt/completion token counts, or ``None`` when the
          response carries no usage data or no chunks were retrieved.
    """
    # Retrieve relevant chunks
    chunks = search_documents(question, top_k=top_k)

    if not chunks:
        return {
            "answer": "No relevant documents found. Please upload documents first.",
            "sources": [],
            "model": None,
            "usage": None,
        }

    # Build context from retrieved chunks, numbering sources from 1
    context = "\n\n".join(
        f"[Source {i}: {chunk['filename']}, page {chunk['page']}]\n{chunk['text']}"
        for i, chunk in enumerate(chunks, 1)
    )

    # Generate answer using EU-hosted LLM
    openai_client = get_openai_client()
    response = openai_client.chat.completions.create(
        model=config.CHAT_MODEL,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a document assistant. Answer questions based on the "
                    "provided context. Always cite which source and page number your "
                    "answer comes from. If the context doesn't contain enough "
                    "information to answer, say so clearly."
                ),
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {question}",
            },
        ],
        temperature=0.1,
        max_tokens=1000,
    )

    # Both the message content and the usage block are optional in the
    # response, so guard against None instead of failing on attribute access.
    answer = response.choices[0].message.content or ""
    usage = response.usage
    usage_summary = None
    if usage is not None:
        usage_summary = {
            "prompt_tokens": usage.prompt_tokens,
            "completion_tokens": usage.completion_tokens,
        }

    return {
        "answer": answer,
        "sources": [
            {
                "filename": c["filename"],
                "page": c["page"],
                "score": round(c["score"], 4),
                "excerpt": (
                    c["text"][:200] + "..." if len(c["text"]) > 200 else c["text"]
                ),
            }
            for c in chunks
        ],
        "model": response.model,
        "usage": usage_summary,
    }
d:T742,# main.py
from fastapi import FastAPI, UploadFile, File, HTTPException
from pydantic import BaseModel
from ingest import ingest_pdf
from search import rag_query, search_documents

# FastAPI application instance; the endpoint decorators in this module
# (/upload, /query, /search, /health) register their handlers on it.
app = FastAPI(
    title="GDPR-Safe Document Search API",
    description="RAG-powered document search with EU-hosted inference",
    version="1.0.0",
)


# Request body accepted by the /query and /search endpoints.
class QueryRequest(BaseModel):
    # Question or search text supplied by the client.
    question: str
    # Number of chunks to retrieve. /search forwards it to search_documents;
    # /query does not read it.
    top_k: int = 5


# Response body of the /query endpoint, built from the dict rag_query returns.
class QueryResponse(BaseModel):
    # Generated answer text (or the "no documents" message).
    answer: str
    # One entry per retrieved chunk backing the answer.
    sources: list[dict]
    # Optional: defaults to None when the result dict omits the key.
    model: str | None = None
    # Optional: defaults to None when the result dict omits the key.
    usage: dict | None = None


@app.post("/upload")
async def upload_document(file: UploadFile = File(...)):
    """Upload a PDF document for indexing.

    Rejects the upload with HTTP 400 when the file name does not end in
    ``.pdf`` (case-insensitive) or the content exceeds 50MB; otherwise the
    bytes are passed to ``ingest_pdf``.

    Returns:
        A dict with the file name, the number of chunks indexed, and a
        fixed ``"indexed"`` status.
    """
    max_bytes = 50 * 1024 * 1024  # 50MB limit

    # The client may omit the file name; treat that like a non-PDF name
    # instead of failing on None.lower().
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without copying an arbitrarily large body into memory.
    pdf_bytes = await file.read(max_bytes + 1)
    if len(pdf_bytes) > max_bytes:
        raise HTTPException(status_code=400, detail="File too large (max 50MB)")

    # NOTE(review): ingest_pdf is a synchronous call made from an async
    # handler, so it occupies the event loop while it runs.
    num_chunks = ingest_pdf(pdf_bytes, filename)

    return {
        "filename": filename,
        "chunks_indexed": num_chunks,
        "status": "indexed",
    }


@app.post("/query", response_model=QueryResponse)
async def query_documents(request: QueryRequest):
    """Answer a question from the indexed documents.

    Responds with HTTP 400 when the question is empty or whitespace-only;
    otherwise wraps the ``rag_query`` result in a ``QueryResponse``.
    """
    question = request.question
    if question.strip() == "":
        raise HTTPException(status_code=400, detail="Question cannot be empty")

    return QueryResponse(**rag_query(question))


@app.post("/search")
async def search_only(request: QueryRequest):
    """Search for relevant chunks without generating an answer.

    Responds with HTTP 400 when the question is empty or whitespace-only,
    or when ``top_k`` is smaller than 1.

    Returns:
        ``{"results": [...]}`` with the hits from ``search_documents``.
    """
    # Same guard as /query: an empty query has nothing to embed.
    if not request.question.strip():
        raise HTTPException(status_code=400, detail="Question cannot be empty")

    # Reject a non-positive limit here rather than passing it to the search.
    if request.top_k < 1:
        raise HTTPException(status_code=400, detail="top_k must be at least 1")

    results = search_documents(request.question, top_k=request.top_k)
    return {"results": results}


@app.get("/health")
async def health():
    """Report service status with a fixed payload."""
    payload = {"status": "ok", "data_residency": "EU"}
    return payload
2:[["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"@type\":\"BreadcrumbList\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https://juicefactory.ai/es/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Guides\",\"item\":\"https://juicefactory.ai/es/guides\"},{\"@type\":\"ListItem\",\"position\":3,\"name\":\"RAG in Python: GDPR-Safe Document Search\",\"item\":\"https://juicefactory.ai/es/guides/rag-python-gdpr-document-search\"}]}"}}],["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"$a"}}],["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"@type\":\"TechArticle\",\"headline\":\"RAG en Python: Crea una API de búsqueda documental conforme al RGPD (2026)\",\"description\":\"Construye un sistema RAG en Python con FastAPI, Qdrant e inferencia alojada en la UE. Búsqueda documental conforme al RGPD con PyMuPDF, embeddings y LLM privado.\",\"author\":{\"@type\":\"Organization\",\"name\":\"Juice Factory\",\"url\":\"https://juicefactory.ai\"},\"publisher\":{\"@type\":\"Organization\",\"name\":\"Juice Factory\",\"logo\":{\"@type\":\"ImageObject\",\"url\":\"https://juicefactory.ai/logo-opengraph.png\"}},\"url\":\"https://juicefactory.ai/es/guides/rag-python-gdpr-document-search\",\"datePublished\":\"2025-01-15\",\"dateModified\":\"2026-05-15\",\"inLanguage\":\"es\",\"mainEntityOfPage\":{\"@type\":\"WebPage\",\"@id\":\"https://juicefactory.ai/es/guides/rag-python-gdpr-document-search\"}}"}}],["$","section",null,{"className":"pt-32 pb-20 px-4","children":["$","div",null,{"className":"container mx-auto max-w-4xl","children":["$","article",null,{"className":"prose prose-neutral dark:prose-invert max-w-none","children":[["$","h1","h1-0",{"children":"RAG in Python: Build a GDPR-Safe Document Search API with EU-Hosted 
Inference"}],"\n",["$","p","p-0",{"children":"Build a production-ready retrieval-augmented generation (RAG) system in Python that keeps all data within the EU. This guide covers document ingestion with PyMuPDF, vector storage with Qdrant, and LLM inference through Juice Factory's private EU API — all wrapped in a FastAPI service."}],"\n",["$","p","p-1",{"children":"By the end, you'll have a working document search API that:"}],"\n",["$","ul","ul-0",{"children":["\n",["$","li","li-0",{"children":"Extracts text from PDFs using PyMuPDF"}],"\n",["$","li","li-1",{"children":"Generates embeddings and stores them in Qdrant"}],"\n",["$","li","li-2",{"children":"Answers questions using retrieved context + EU-hosted LLM inference"}],"\n",["$","li","li-3",{"children":"Never sends user data outside the EU"}],"\n"]}],"\n",["$","hr","hr-0",{}],"\n",["$","h2","h2-0",{"children":"Prerequisites"}],"\n",["$","ul","ul-1",{"children":["\n",["$","li","li-0",{"children":"Python 3.10+"}],"\n",["$","li","li-1",{"children":"Docker (for Qdrant)"}],"\n",["$","li","li-2",{"children":["A Juice Factory API key (",["$","a","a-0",{"href":"https://portal.juicefactory.ai","children":"get one here"}],")"]}],"\n"]}],"\n",["$","hr","hr-1",{}],"\n",["$","h2","h2-1",{"children":"Architecture Overview"}],"\n",["$","pre","pre-0",{"children":["$","code","code-0",{"children":"┌──────────────┐     ┌───────────────┐     ┌──────────────────┐\n│  PDF Upload  │────▶│  PyMuPDF      │────▶│  Qdrant          │\n│  (FastAPI)   │     │  Text Extract │     │  Vector Store    │\n└──────────────┘     └───────────────┘     └──────────────────┘\n                                                    │\n┌──────────────┐     ┌───────────────┐              │\n│  User Query  │────▶│  Embedding    │──── search ──┘\n│  (FastAPI)   │     │  (EU API)     │\n└──────────────┘     └───────┬───────┘\n                             │\n                     ┌───────▼───────┐     ┌──────────────────┐\n                     │  Context +    
│────▶│  LLM Inference   │\n                     │  Query        │     │  (EU-hosted)     │\n                     └───────────────┘     └──────────────────┘\n"}]}],"\n",["$","p","p-2",{"children":"The system follows a standard RAG pipeline, but every component that touches user data runs within EU infrastructure. Qdrant is self-hosted, and both embeddings and LLM inference route through Juice Factory's EU endpoints."}],"\n",["$","hr","hr-2",{}],"\n",["$","h2","h2-2",{"children":"Step 1: Project Setup"}],"\n",["$","p","p-3",{"children":"Create the project directory and install dependencies:"}],"\n",["$","pre","pre-1",{"children":["$","code","code-0",{"className":"language-bash","children":"mkdir rag-document-search && cd rag-document-search\npython -m venv .venv\nsource .venv/bin/activate\n"}]}],"\n",["$","p","p-4",{"children":"Install the required packages:"}],"\n",["$","pre","pre-2",{"children":["$","code","code-0",{"className":"language-bash","children":"pip install fastapi uvicorn pymupdf qdrant-client openai python-multipart\n"}]}],"\n",["$","p","p-5",{"children":"Create the project structure:"}],"\n",["$","pre","pre-3",{"children":["$","code","code-0",{"children":"rag-document-search/\n├── main.py              # FastAPI application\n├── ingest.py            # Document ingestion pipeline\n├── search.py            # Query and retrieval logic\n├── config.py            # Configuration\n└── requirements.txt\n"}]}],"\n",["$","p","p-6",{"children":[["$","strong","strong-0",{"children":"requirements.txt"}],":"]}],"\n",["$","pre","pre-4",{"children":["$","code","code-0",{"children":"fastapi==0.115.0\nuvicorn==0.30.0\npymupdf==1.24.0\nqdrant-client==1.11.0\nopenai==1.50.0\npython-multipart==0.0.9\n"}]}],"\n",["$","hr","hr-3",{}],"\n",["$","h2","h2-3",{"children":"Step 2: Configuration"}],"\n",["$","p","p-7",{"children":"Set up the configuration with your Juice Factory API 
credentials:"}],"\n",["$","pre","pre-5",{"children":["$","code","code-0",{"className":"language-python","children":"# config.py\nimport os\n\n# Juice Factory EU API (OpenAI-compatible)\nAPI_BASE_URL = \"https://api.juicefactory.ai/v1\"\nAPI_KEY = os.environ.get(\"JUICEFACTORY_API_KEY\", \"your-api-key\")\n\n# Embedding model\nEMBEDDING_MODEL = \"text-embedding-3-small\"\nEMBEDDING_DIMENSIONS = 1536\n\n# Chat model for RAG responses\nCHAT_MODEL = \"gpt-4\"\n\n# Qdrant configuration (self-hosted in EU)\nQDRANT_HOST = os.environ.get(\"QDRANT_HOST\", \"localhost\")\nQDRANT_PORT = int(os.environ.get(\"QDRANT_PORT\", \"6333\"))\nCOLLECTION_NAME = \"documents\"\n\n# Chunk settings\nCHUNK_SIZE = 500       # tokens per chunk (approximate)\nCHUNK_OVERLAP = 50     # overlap between chunks\nTOP_K = 5              # number of chunks to retrieve\n"}]}],"\n",["$","hr","hr-4",{}],"\n",["$","h2","h2-4",{"children":"Step 3: Start Qdrant with Docker"}],"\n",["$","p","p-8",{"children":"Run Qdrant locally (or on your EU server):"}],"\n",["$","pre","pre-6",{"children":["$","code","code-0",{"className":"language-bash","children":"docker run -d \\\n  --name qdrant \\\n  -p 6333:6333 \\\n  -p 6334:6334 \\\n  -v qdrant_storage:/qdrant/storage \\\n  qdrant/qdrant:latest\n"}]}],"\n",["$","p","p-9",{"children":"Qdrant stores all data locally — no external calls, no telemetry, full control over data location."}],"\n",["$","hr","hr-5",{}],"\n",["$","h2","h2-5",{"children":"Step 4: Document Ingestion with PyMuPDF"}],"\n",["$","p","p-10",{"children":"The ingestion pipeline extracts text from PDFs, splits it into chunks, generates embeddings via the EU API, and stores everything in Qdrant."}],"\n",["$","pre","pre-7",{"children":["$","code","code-0",{"className":"language-python","children":"$b"}]}],"\n",["$","p","p-11",{"children":"Key points:"}],"\n",["$","ul","ul-2",{"children":["\n",["$","li","li-0",{"children":[["$","strong","strong-0",{"children":"PyMuPDF"}]," 
(",["$","code","code-0",{"children":"fitz"}],") extracts text without external dependencies or cloud calls"]}],"\n",["$","li","li-1",{"children":[["$","strong","strong-0",{"children":"Embeddings"}]," are generated through Juice Factory's EU API — same OpenAI SDK, EU endpoint"]}],"\n",["$","li","li-2",{"children":[["$","strong","strong-0",{"children":"Qdrant"}]," stores vectors locally with no telemetry"]}],"\n"]}],"\n",["$","hr","hr-6",{}],"\n",["$","h2","h2-6",{"children":"Step 5: Search and RAG Query"}],"\n",["$","p","p-12",{"children":"The search module embeds the user query, retrieves relevant chunks, and sends them with the question to the LLM."}],"\n",["$","pre","pre-8",{"children":["$","code","code-0",{"className":"language-python","children":"$c"}]}],"\n",["$","p","p-13",{"children":["The ",["$","code","code-0",{"children":"rag_query"}]," function is the core of the system:"]}],"\n",["$","ol","ol-0",{"children":["\n",["$","li","li-0",{"children":"Embeds the user question via EU API"}],"\n",["$","li","li-1",{"children":"Retrieves the top-K most relevant chunks from Qdrant"}],"\n",["$","li","li-2",{"children":"Sends context + question to the EU-hosted LLM"}],"\n",["$","li","li-3",{"children":"Returns the answer with source citations"}],"\n"]}],"\n",["$","hr","hr-7",{}],"\n",["$","h2","h2-7",{"children":"Step 6: FastAPI Application"}],"\n",["$","p","p-14",{"children":"Wire everything together with a FastAPI service:"}],"\n",["$","pre","pre-9",{"children":["$","code","code-0",{"className":"language-python","children":"$d"}]}],"\n",["$","hr","hr-8",{}],"\n",["$","h2","h2-8",{"children":"Step 7: Run and Test"}],"\n",["$","p","p-15",{"children":"Start the API server:"}],"\n",["$","pre","pre-10",{"children":["$","code","code-0",{"className":"language-bash","children":"export JUICEFACTORY_API_KEY=\"your-api-key\"\nuvicorn main:app --host 0.0.0.0 --port 8000 --reload\n"}]}],"\n",["$","h3","h3-0",{"children":"Upload a 
document"}],"\n",["$","pre","pre-11",{"children":["$","code","code-0",{"className":"language-bash","children":"curl -X POST http://localhost:8000/upload \\\n  -F \"file=@contract.pdf\"\n"}]}],"\n",["$","p","p-16",{"children":"Response:"}],"\n",["$","pre","pre-12",{"children":["$","code","code-0",{"className":"language-json","children":"{\n  \"filename\": \"contract.pdf\",\n  \"chunks_indexed\": 47,\n  \"status\": \"indexed\"\n}\n"}]}],"\n",["$","h3","h3-1",{"children":"Ask a question"}],"\n",["$","pre","pre-13",{"children":["$","code","code-0",{"className":"language-bash","children":"curl -X POST http://localhost:8000/query \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"question\": \"What are the payment terms in the contract?\"}'\n"}]}],"\n",["$","p","p-17",{"children":"Response:"}],"\n",["$","pre","pre-14",{"children":["$","code","code-0",{"className":"language-json","children":"{\n  \"answer\": \"According to the contract (Source 1, page 4), payment terms are Net 30 from the date of invoice. Late payments accrue interest at 1.5% per month as specified in Section 5.2.\",\n  \"sources\": [\n    {\n      \"filename\": \"contract.pdf\",\n      \"page\": 4,\n      \"score\": 0.9234,\n      \"excerpt\": \"Payment Terms. 
The Client shall pay all invoices within thirty (30) days...\"\n    }\n  ],\n  \"model\": \"gpt-4-0125-preview\",\n  \"usage\": {\n    \"prompt_tokens\": 847,\n    \"completion_tokens\": 89\n  }\n}\n"}]}],"\n",["$","hr","hr-9",{}],"\n",["$","h2","h2-9",{"children":"GDPR Compliance Checklist"}],"\n",["$","p","p-18",{"children":"This architecture satisfies GDPR requirements at each layer:"}],"\n",["$","table","table-0",{"children":[["$","thead","thead-0",{"children":["$","tr","tr-0",{"children":[["$","th","th-0",{"children":"Component"}],["$","th","th-1",{"children":"Data Handling"}],["$","th","th-2",{"children":"GDPR Compliance"}]]}]}],["$","tbody","tbody-0",{"children":[["$","tr","tr-0",{"children":[["$","td","td-0",{"children":["$","strong","strong-0",{"children":"PDF Upload"}]}],["$","td","td-1",{"children":"Files processed in memory, text extracted locally"}],["$","td","td-2",{"children":"No external data transfer"}]]}],["$","tr","tr-1",{"children":[["$","td","td-0",{"children":["$","strong","strong-0",{"children":"Embeddings"}]}],["$","td","td-1",{"children":"Generated via Juice Factory EU API"}],["$","td","td-2",{"children":"EU data residency, no retention"}]]}],["$","tr","tr-2",{"children":[["$","td","td-0",{"children":["$","strong","strong-0",{"children":"Vector Store"}]}],["$","td","td-1",{"children":"Self-hosted Qdrant, EU infrastructure"}],["$","td","td-2",{"children":"Full control over data location"}]]}],["$","tr","tr-3",{"children":[["$","td","td-0",{"children":["$","strong","strong-0",{"children":"LLM Inference"}]}],["$","td","td-1",{"children":"Juice Factory EU API, stateless processing"}],["$","td","td-2",{"children":"No query storage, no training use"}]]}],["$","tr","tr-4",{"children":[["$","td","td-0",{"children":["$","strong","strong-0",{"children":"API Server"}]}],["$","td","td-1",{"children":"Your infrastructure, your logging policy"}],["$","td","td-2",{"children":"Application-level 
control"}]]}]]}]]}],"\n",["$","p","p-19",{"children":["$","strong","strong-0",{"children":"Key guarantees:"}]}],"\n",["$","ul","ul-3",{"children":["\n",["$","li","li-0",{"children":"User queries never leave the EU"}],"\n",["$","li","li-1",{"children":"No data is used for model training"}],"\n",["$","li","li-2",{"children":"Qdrant stores only embeddings (not raw queries)"}],"\n",["$","li","li-3",{"children":"LLM inference is stateless — queries are not retained"}],"\n",["$","li","li-4",{"children":"You control all logging and data retention policies"}],"\n"]}],"\n",["$","hr","hr-10",{}],"\n",["$","h2","h2-10",{"children":"Production Considerations"}],"\n",["$","h3","h3-2",{"children":"Scaling Qdrant"}],"\n",["$","p","p-20",{"children":"For production deployments with large document collections:"}],"\n",["$","pre","pre-15",{"children":["$","code","code-0",{"className":"language-bash","children":"# Run Qdrant with persistent storage and resource limits\ndocker run -d \\\n  --name qdrant \\\n  -p 6333:6333 \\\n  --memory=4g \\\n  -v /data/qdrant:/qdrant/storage \\\n  qdrant/qdrant:latest\n"}]}],"\n",["$","p","p-21",{"children":"For collections exceeding 10M vectors, consider Qdrant's distributed mode with sharding across multiple EU-hosted nodes."}],"\n",["$","h3","h3-3",{"children":"Chunking Strategy"}],"\n",["$","p","p-22",{"children":"The simple word-count chunking in this guide works for most documents. 
For better results with structured documents:"}],"\n",["$","ul","ul-4",{"children":["\n",["$","li","li-0",{"children":[["$","strong","strong-0",{"children":"Semantic chunking"}],": Split on paragraph or section boundaries"]}],"\n",["$","li","li-1",{"children":[["$","strong","strong-0",{"children":"Sliding window"}],": Use overlapping chunks to avoid splitting context"]}],"\n",["$","li","li-2",{"children":[["$","strong","strong-0",{"children":"Metadata enrichment"}],": Include section headers, document titles, and dates in chunk metadata"]}],"\n"]}],"\n",["$","h3","h3-4",{"children":"Error Handling"}],"\n",["$","p","p-23",{"children":"Add retry logic for API calls and handle Qdrant connection failures:"}],"\n",["$","pre","pre-16",{"children":["$","code","code-0",{"className":"language-python","children":"from tenacity import retry, stop_after_attempt, wait_exponential\n\n@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, max=10))\ndef generate_embeddings_with_retry(texts, client):\n    return generate_embeddings(texts, client)\n"}]}],"\n",["$","h3","h3-5",{"children":"Authentication"}],"\n",["$","p","p-24",{"children":"Add API key authentication to your FastAPI endpoints for production:"}],"\n",["$","pre","pre-17",{"children":["$","code","code-0",{"className":"language-python","children":"from fastapi import Depends, Security\nfrom fastapi.security import APIKeyHeader\n\napi_key_header = APIKeyHeader(name=\"X-API-Key\")\n\nasync def verify_api_key(api_key: str = Security(api_key_header)):\n    if api_key != os.environ.get(\"APP_API_KEY\"):\n        raise HTTPException(status_code=403, detail=\"Invalid API key\")\n    return api_key\n\n@app.post(\"/query\", dependencies=[Depends(verify_api_key)])\nasync def query_documents(request: QueryRequest):\n    ...\n"}]}],"\n",["$","hr","hr-11",{}],"\n",["$","h2","h2-11",{"children":"Summary"}],"\n",["$","p","p-25",{"children":"This guide demonstrates a complete RAG pipeline that maintains GDPR compliance 
throughout:"}],"\n",["$","ol","ol-1",{"children":["\n",["$","li","li-0",{"children":[["$","strong","strong-0",{"children":"Document ingestion"}],": PyMuPDF extracts text locally, no cloud dependencies"]}],"\n",["$","li","li-1",{"children":[["$","strong","strong-0",{"children":"Embeddings"}],": Generated through Juice Factory's EU API with no data retention"]}],"\n",["$","li","li-2",{"children":[["$","strong","strong-0",{"children":"Vector storage"}],": Self-hosted Qdrant keeps all indexed data under your control"]}],"\n",["$","li","li-3",{"children":[["$","strong","strong-0",{"children":"LLM inference"}],": EU-hosted, stateless processing with no query storage"]}],"\n",["$","li","li-4",{"children":[["$","strong","strong-0",{"children":"API layer"}],": FastAPI gives you full control over access, logging, and data handling"]}],"\n"]}],"\n",["$","p","p-26",{"children":"The entire system can be deployed on EU infrastructure with no data leaving the region. Switching from a non-compliant setup is straightforward — replace the API base URL, point embeddings at the EU endpoint, and self-host your vector store."}],"\n",["$","hr","hr-12",{}],"\n",["$","h2","h2-12",{"children":"Related Guides"}],"\n",["$","ul","ul-5",{"children":["\n",["$","li","li-0",{"children":[["$","a","a-0",{"href":"/en/guides/gdpr-safe-ai-inference","children":"GDPR-Safe AI Inference"}]," — Architecture guide for compliant AI applications with RAG"]}],"\n",["$","li","li-1",{"children":[["$","a","a-0",{"href":"/en/replace-openai","children":"Replacing OpenAI with EU Infrastructure"}]," — Migration guide for switching API providers"]}],"\n",["$","li","li-2",{"children":[["$","a","a-0",{"href":"/en/automation/n8n-private-ai","children":"n8n + Private AI Automation"}]," — Workflow automation with EU-hosted inference"]}],"\n",["$","li","li-3",{"children":[["$","a","a-0",{"href":"/en/guides/cursor-byok-setup","children":"Cursor AI BYOK Setup"}]," — Use Juice Factory as your BYOK provider in 
Cursor"]}],"\n"]}]]}]}]}],["$","section",null,{"className":"py-12 px-4","children":["$","div",null,{"className":"container mx-auto max-w-4xl","children":[["$","h2",null,{"className":"text-2xl font-bold mb-6 text-foreground","children":"Related Guides"}],["$","div",null,{"className":"grid gap-4 sm:grid-cols-2 lg:grid-cols-3","children":[["$","$L8","gdpr-safe-ai-inference",{"href":"/es/guides/gdpr-safe-ai-inference","className":"block rounded-lg border border-border bg-card p-5 transition-colors hover:bg-accent hover:text-accent-foreground","children":["$","h3",null,{"className":"text-sm font-semibold leading-snug text-foreground","children":"GDPR-Safe AI Inference"}]}],["$","$L8","implementing-gdpr-compliant-ai",{"href":"/es/guides/implementing-gdpr-compliant-ai","className":"block rounded-lg border border-border bg-card p-5 transition-colors hover:bg-accent hover:text-accent-foreground","children":["$","h3",null,{"className":"text-sm font-semibold leading-snug text-foreground","children":"GDPR-Compliant AI Infrastructure"}]}],["$","$L8","rag-with-qwen",{"href":"/es/guides/rag-with-qwen","className":"block rounded-lg border border-border bg-card p-5 transition-colors hover:bg-accent hover:text-accent-foreground","children":["$","h3",null,{"className":"text-sm font-semibold leading-snug text-foreground","children":"RAG with Qwen: Private Document Search"}]}]]}]]}]}],["$","section",null,{"className":"py-16 px-4","children":["$","div",null,{"className":"container mx-auto max-w-4xl","children":["$","div",null,{"className":"bg-gradient-to-br from-primary/5 to-secondary/5 backdrop-blur-sm border border-primary/30 rounded-lg p-12 space-y-6 text-center","children":[["$","h2",null,{"className":"text-3xl md:text-4xl font-bold","children":"Ship GDPR-Compliant AI Today"}],["$","p",null,{"className":"text-xl text-muted-foreground max-w-2xl mx-auto","children":"Zero-retention inference in Stockholm. DPA included. 
Same OpenAI SDK, two lines change."}],["$","div",null,{"className":"flex flex-col sm:flex-row gap-4 justify-center pt-4","children":[["$","a",null,{"href":"https://portal.juicefactory.ai/auth/signup","className":"px-8 py-4 bg-primary text-primary-foreground rounded-lg text-lg font-semibold hover:bg-primary/90 transition-colors inline-block","children":"Get a free API key"}],["$","$L8",null,{"href":"/es/guides/implementing-gdpr-compliant-ai","className":"px-8 py-4 border border-border rounded-lg text-lg font-semibold hover:bg-accent transition-colors inline-block","children":"Read the GDPR implementation guide"}]]}]]}]}]}]]
7:["$","html",null,{"lang":"es","suppressHydrationWarning":true,"children":[["$","head",null,{}],["$","body",null,{"className":"min-h-screen bg-background text-foreground antialiased","children":[["$","$Le",null,{"attribute":"class","defaultTheme":"dark","enableSystem":true,"disableTransitionOnChange":true,"children":[["$","nav",null,{"className":"fixed top-0 left-0 right-0 z-50 bg-background/80 backdrop-blur-md border-b border-border","children":["$","div",null,{"className":"container mx-auto px-4 h-16 flex items-center justify-between","children":[["$","$L8",null,{"href":"/es","className":"flex items-center gap-2","children":[["$","svg",null,{"className":"w-8 h-8 text-primary animate-pulse-glow","fill":"none","viewBox":"0 0 24 24","stroke":"currentColor","children":["$","path",null,{"strokeLinecap":"round","strokeLinejoin":"round","strokeWidth":2,"d":"M13 10V3L4 14h7v7l9-11h-7z"}]}],["$","span",null,{"className":"text-2xl font-bold bg-gradient-primary bg-clip-text text-transparent","children":"Juice Factory"}]]}],["$","div",null,{"className":"hidden md:flex items-center gap-6","children":[["$","$L8",null,{"href":"/es/defense","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Defensa"}],["$","$L8",null,{"href":"/es/tech","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Tecnología"}],["$","$L8",null,{"href":"/es/byok","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"BYOK"}],["$","$L8",null,{"href":"/es/private-ai-for-business","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"IA Privada"}],["$","$L8",null,{"href":"/es/guides","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Guías"}],["$","$L8",null,{"href":"/es/trust","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Trust 
Center"}],["$","$Lf",null,{"currentLocale":"es"}],["$","a",null,{"href":"https://portal.juicefactory.ai/auth/login","className":"px-4 py-2 bg-primary text-primary-foreground rounded-md text-sm font-medium hover:bg-primary/90 transition-colors","children":"Iniciar sesión"}]]}],["$","$L10",null,{"locale":"es","nav":{"home":"Inicio","defense":"Defensa","tech":"Tecnología","byok":"BYOK","privateAi":"IA Privada","guides":"Guías","trust":"Trust Center","login":"Iniciar sesión"}}]]}]}],["$","$L3",null,{"parallelRouterKey":"children","segmentPath":["children","$4","children"],"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L6",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined"}],["$","footer",null,{"className":"py-12 px-4 border-t border-border","children":["$","div",null,{"className":"container mx-auto flex flex-col md:flex-row justify-between items-center gap-4","children":[["$","p",null,{"className":"text-sm text-muted-foreground","children":["© ",2026," ","Juice Factory. 
Todos los derechos reservados."]}],["$","div",null,{"className":"flex gap-6","children":[["$","$L8",null,{"href":"/en/trust","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Trust Center"}],["$","$L8",null,{"href":"/en/security","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Security"}],["$","$L8",null,{"href":"/en/data-processing","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Data Processing"}],["$","$L8",null,{"href":"/es/privacy","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Política de privacidad"}],["$","$L8",null,{"href":"/es/terms","className":"text-sm text-muted-foreground hover:text-foreground transition-colors","children":"Términos de servicio"}]]}]]}]}]]}],["$","$L11",null,{"src":"https://www.googletagmanager.com/gtag/js?id=G-HGZMPNZK5F","strategy":"afterInteractive"}],["$","$L11",null,{"id":"ga4-init","strategy":"afterInteractive","children":"\n            window.dataLayer = window.dataLayer || [];\n            function gtag(){dataLayer.push(arguments);}\n            gtag('js', new Date());\n            gtag('config', 'G-HGZMPNZK5F');\n          "}],["$","$L11",null,{"id":"matomo-init","strategy":"afterInteractive","children":"\n            var _paq = window._paq = window._paq || [];\n            _paq.push(['trackPageView']);\n            _paq.push(['enableLinkTracking']);\n            (function() {\n              var u=\"https://matomo.manprogroup.com/\";\n              _paq.push(['setTrackerUrl', u+'matomo.php']);\n              _paq.push(['setSiteId', '14']);\n              var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];\n              g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);\n            })();\n          "}]]}]]}]
9:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"RAG en Python: Crea una API de búsqueda documental conforme al RGPD (2026)"}],["$","meta","3",{"name":"description","content":"Construye un sistema RAG en Python con FastAPI, Qdrant e inferencia alojada en la UE. Búsqueda documental conforme al RGPD con PyMuPDF, embeddings y LLM privado."}],["$","meta","4",{"name":"robots","content":"noindex, follow"}],["$","link","5",{"rel":"canonical","href":"https://juicefactory.ai/en/guides/rag-python-gdpr-document-search"}],["$","meta","6",{"property":"og:title","content":"RAG en Python: Crea una API de búsqueda documental conforme al RGPD (2026)"}],["$","meta","7",{"property":"og:description","content":"Construye un sistema RAG en Python con FastAPI, Qdrant e inferencia alojada en la UE. Búsqueda documental conforme al RGPD con PyMuPDF, embeddings y LLM privado."}],["$","meta","8",{"property":"og:url","content":"https://juicefactory.ai/es/guides/rag-python-gdpr-document-search"}],["$","meta","9",{"property":"og:site_name","content":"Juice Factory"}],["$","meta","10",{"property":"og:locale","content":"es"}],["$","meta","11",{"property":"og:image:alt","content":"Juice Factory AI Guide"}],["$","meta","12",{"property":"og:image:type","content":"image/png"}],["$","meta","13",{"property":"og:image","content":"http://localhost:3000/es/guides/rag-python-gdpr-document-search/opengraph-image?b184df05bd4ad81a"}],["$","meta","14",{"property":"og:image:width","content":"1200"}],["$","meta","15",{"property":"og:image:height","content":"630"}],["$","meta","16",{"property":"og:type","content":"article"}],["$","meta","17",{"name":"twitter:card","content":"summary_large_image"}],["$","meta","18",{"name":"twitter:title","content":"RAG en Python: Crea una API de búsqueda documental conforme al RGPD (2026)"}],["$","meta","19",{"name":"twitter:description","content":"Construye un sistema RAG en 
Python con FastAPI, Qdrant e inferencia alojada en la UE. Búsqueda documental conforme al RGPD con PyMuPDF, embeddings y LLM privado."}],["$","meta","20",{"name":"twitter:image","content":"https://juicefactory.ai/logo-opengraph.png"}],["$","link","21",{"rel":"icon","href":"/favicon.png"}]]
1:null
