From 0d83d3177eba0ed5a8848b8073162cfdb6c06931 Mon Sep 17 00:00:00 2001
From: Christian Gick <service@agiliton.eu>
Date: Thu, 5 Mar 2026 16:43:13 +0200
Subject: [PATCH] fix: instruct LLM to trust title/summary over garbled OCR
 content

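Scanned passport PDFs often produce completely garbled OCR text, which
makes the LLM conclude they are not passports even though the
AI-generated title and summary identify them correctly. Add an explicit
instruction to the document-response prompt telling the LLM to trust
the title/summary fields over the Content field, reword instruction 1
to refer to the documents as a whole, and renumber the remaining
instructions.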

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 bot.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/bot.py b/bot.py
index a638761..1c58b69 100644
--- a/bot.py
+++ b/bot.py
@@ -425,15 +425,19 @@ class DocumentRAG:
             parts.append("")  # blank line between docs
 
         parts.append("IMPORTANT INSTRUCTIONS FOR DOCUMENT RESPONSES:\n"
-                      "1. Answer the user's question using ALL the document content above.\n"
+                      "1. Answer the user's question using ALL the documents above.\n"
                       "2. These are FRESH search results — they override anything from chat history.\n"
                       "   If previous messages said 'only one passport' but documents show more, trust the documents.\n"
-                      "3. You MUST include a source link for EVERY document you reference.\n"
-                      "4. Format links as markdown: [Document Title](url)\n"
-                      "5. Place the link right after mentioning or quoting the document.\n"
-                      "6. If a document has no link, skip the link but still reference the title.\n"
-                      "7. Never show raw URLs without markdown formatting.\n"
-                      "8. List ALL matching documents, not just the first one.")
+                      "3. TRUST the document TITLE and SUMMARY — they are AI-generated and accurate.\n"
+                      "   The Content field may be garbled OCR from scanned PDFs (random characters, broken text).\n"
+                      "   If the title says 'Christian's Passport' and summary says 'passport belonging to Christian',\n"
+                      "   then it IS a passport — even if the content looks like gibberish.\n"
+                      "4. You MUST include a source link for EVERY document you reference.\n"
+                      "5. Format links as markdown: [Document Title](url)\n"
+                      "6. Place the link right after mentioning or quoting the document.\n"
+                      "7. If a document has no link, skip the link but still reference the title.\n"
+                      "8. Never show raw URLs without markdown formatting.\n"
+                      "9. List ALL matching documents, not just the first one.")
         return "\n".join(parts)