From 0d83d3177eba0ed5a8848b8073162cfdb6c06931 Mon Sep 17 00:00:00 2001 From: Christian Gick Date: Thu, 5 Mar 2026 16:43:13 +0200 Subject: [PATCH] fix: instruct LLM to trust title/summary over garbled OCR content Scanned passport PDFs have completely garbled OCR text that makes the LLM think they're not passports, even though the AI-generated title and summary correctly identify them. Added explicit instruction to trust title/summary fields. Co-Authored-By: Claude Opus 4.6 --- bot.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/bot.py b/bot.py index a638761..1c58b69 100644 --- a/bot.py +++ b/bot.py @@ -425,15 +425,19 @@ class DocumentRAG: parts.append("") # blank line between docs parts.append("IMPORTANT INSTRUCTIONS FOR DOCUMENT RESPONSES:\n" - "1. Answer the user's question using ALL the document content above.\n" + "1. Answer the user's question using ALL the documents above.\n" "2. These are FRESH search results — they override anything from chat history.\n" " If previous messages said 'only one passport' but documents show more, trust the documents.\n" - "3. You MUST include a source link for EVERY document you reference.\n" - "4. Format links as markdown: [Document Title](url)\n" - "5. Place the link right after mentioning or quoting the document.\n" - "6. If a document has no link, skip the link but still reference the title.\n" - "7. Never show raw URLs without markdown formatting.\n" - "8. List ALL matching documents, not just the first one.") + "3. TRUST the document TITLE and SUMMARY — they are AI-generated and accurate.\n" + " The Content field may be garbled OCR from scanned PDFs (random characters, broken text).\n" + " If the title says 'Christian's Passport' and summary says 'passport belonging to Christian',\n" + " then it IS a passport — even if the content looks like gibberish.\n" + "4. You MUST include a source link for EVERY document you reference.\n" + "5. Format links as markdown: [Document Title](url)\n" + "6. Place the link right after mentioning or quoting the document.\n" + "7. If a document has no link, skip the link but still reference the title.\n" + "8. Never show raw URLs without markdown formatting.\n" + "9. List ALL matching documents, not just the first one.") return "\n".join(parts)