feat: scanned PDF fallback via vision for both plain and E2EE rooms (MAT-156)

When text extraction returns nothing for a PDF, render its scanned/image-based pages (up to 5) to PNG at 200 DPI and send the first rendered page to the AI model as image content.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 11:35:47 +02:00
parent 0c7070ebc4
commit f73de35fd4
1 changed files with 58 additions and 0 deletions
--- a/bot.py
+++ b/bot.py
@@ -2212,6 +2212,24 @@ class Bot:
            extracted = self._extract_text_file(file_bytes)
            doc_type = "text"

+        # Scanned PDF fallback: render pages as images for vision analysis
+        if not extracted and is_pdf:
+            page_images = self._render_pdf_pages_as_images(file_bytes)
+            if page_images:
+                await self.client.room_typing(room.room_id, typing_state=True)
+                try:
+                    user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
+                    reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
+                    if reply:
+                        docs = self._room_document_context.setdefault(room.room_id, [])
+                        docs.append({"type": "pdf", "filename": filename,
+                                     "text": reply, "timestamp": time.time()})
+                        if len(docs) > 5:
+                            del docs[:-5]
+                finally:
+                    await self.client.room_typing(room.room_id, typing_state=False)
+                return
+
        if not extracted:
            await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
            return
@@ -2303,6 +2321,24 @@ class Bot:
            extracted = self._extract_text_file(file_bytes)
            doc_type = "text"

+        # Scanned PDF fallback: render pages as images for vision analysis
+        if not extracted and is_pdf:
+            page_images = self._render_pdf_pages_as_images(file_bytes)
+            if page_images:
+                await self.client.room_typing(room.room_id, typing_state=True)
+                try:
+                    user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
+                    reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
+                    if reply:
+                        docs = self._room_document_context.setdefault(room.room_id, [])
+                        docs.append({"type": "pdf", "filename": filename,
+                                     "text": reply, "timestamp": time.time()})
+                        if len(docs) > 5:
+                            del docs[:-5]
+                finally:
+                    await self.client.room_typing(room.room_id, typing_state=False)
+                return
+
        if not extracted:
            await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
            return
@@ -2345,6 +2381,28 @@ class Bot:
            logger.exception("PDF text extraction failed")
            return ""

+    @staticmethod
+    def _render_pdf_pages_as_images(pdf_bytes: bytes, max_pages: int = 5) -> list[tuple[str, str]]:
+        """Render up to ``max_pages`` PDF pages to PNG for vision fallback (scanned PDFs).
+
+        Returns a list of (base64_data, mime_type) tuples, one per rendered page;
+        returns [] (after logging) if the PDF cannot be opened or rendered.
+        """
+        try:
+            # Context manager closes the document even when rendering raises.
+            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+                images = []
+                for i, page in enumerate(doc):
+                    if i >= max_pages:
+                        break
+                    png_bytes = page.get_pixmap(dpi=200).tobytes("png")
+                    b64 = base64.b64encode(png_bytes).decode("utf-8")
+                    images.append((b64, "image/png"))
+                return images
+        except Exception:
+            logger.exception("PDF page rendering failed")
+            return []
+
    @staticmethod
    def _extract_docx_text(docx_bytes: bytes) -> str:
        """Extract text from .docx bytes using python-docx."""