feat: scanned PDF fallback via vision for both plain and E2EE rooms (MAT-156)

Render scanned/image-based PDF pages to PNG at 200 DPI and send them to the AI model as image content when text extraction returns no text. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 11:35:47 +02:00
parent 0c7070ebc4
commit f73de35fd4
1 changed files with 58 additions and 0 deletions
--- a/bot.py
+++ b/bot.py
@@ -2212,6 +2212,24 @@ class Bot:
            extracted = self._extract_text_file(file_bytes)
            doc_type = "text"
        # Scanned PDF fallback: render pages as images for vision analysis
        if not extracted and is_pdf:
            page_images = self._render_pdf_pages_as_images(file_bytes)
            if page_images:
                await self.client.room_typing(room.room_id, typing_state=True)
                try:
                    user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
                    reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
                    if reply:
                        docs = self._room_document_context.setdefault(room.room_id, [])
                        docs.append({"type": "pdf", "filename": filename,
                                     "text": reply, "timestamp": time.time()})
                        if len(docs) > 5:
                            del docs[:-5]
                finally:
                    await self.client.room_typing(room.room_id, typing_state=False)
                return
        if not extracted:
            await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
            return
@@ -2303,6 +2321,24 @@ class Bot:
            extracted = self._extract_text_file(file_bytes)
            doc_type = "text"
        # Scanned PDF fallback: render pages as images for vision analysis
        if not extracted and is_pdf:
            page_images = self._render_pdf_pages_as_images(file_bytes)
            if page_images:
                await self.client.room_typing(room.room_id, typing_state=True)
                try:
                    user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
                    reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
                    if reply:
                        docs = self._room_document_context.setdefault(room.room_id, [])
                        docs.append({"type": "pdf", "filename": filename,
                                     "text": reply, "timestamp": time.time()})
                        if len(docs) > 5:
                            del docs[:-5]
                finally:
                    await self.client.room_typing(room.room_id, typing_state=False)
                return
        if not extracted:
            await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
            return
@@ -2345,6 +2381,28 @@ class Bot:
            logger.exception("PDF text extraction failed")
            return ""
    @staticmethod
    def _render_pdf_pages_as_images(
        pdf_bytes: bytes, max_pages: int = 5, dpi: int = 200
    ) -> list[tuple[str, str]]:
        """Render PDF pages to PNG images for vision fallback (scanned PDFs).

        Args:
            pdf_bytes: Raw bytes of the PDF document.
            max_pages: Upper bound on the number of leading pages rendered;
                pages beyond this count are ignored.
            dpi: Resolution passed to the page renderer for each page.

        Returns:
            A list of (base64_data, mime_type) tuples, one per rendered page,
            in page order. The mime type is always "image/png". An empty list
            is returned if the document cannot be opened or any page fails to
            render (the failure is logged, not raised).
        """
        try:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                images: list[tuple[str, str]] = []
                for i, page in enumerate(doc):
                    if i >= max_pages:
                        break
                    pix = page.get_pixmap(dpi=dpi)
                    png_bytes = pix.tobytes("png")
                    b64 = base64.b64encode(png_bytes).decode("utf-8")
                    images.append((b64, "image/png"))
                return images
            finally:
                # Close the document even when a page fails to render, so the
                # handle is not left open on the error path.
                doc.close()
        except Exception:
            # Best-effort fallback: callers treat an empty list as
            # "no images available" rather than handling an exception.
            logger.exception("PDF page rendering failed")
            return []
    @staticmethod
    def _extract_docx_text(docx_bytes: bytes) -> str:
        """Extract text from .docx bytes using python-docx."""