feat: scanned PDF fallback via vision for both plain and E2EE rooms (MAT-156)

Render scanned/image-based PDF pages to PNG at 200 DPI and send them to the
AI model as image content when text extraction returns an empty result.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-03-10 11:35:47 +02:00
parent 0c7070ebc4
commit f73de35fd4

58
bot.py
View File

@@ -2212,6 +2212,24 @@ class Bot:
extracted = self._extract_text_file(file_bytes) extracted = self._extract_text_file(file_bytes)
doc_type = "text" doc_type = "text"
# Scanned PDF fallback: render pages as images for vision analysis
if not extracted and is_pdf:
page_images = self._render_pdf_pages_as_images(file_bytes)
if page_images:
await self.client.room_typing(room.room_id, typing_state=True)
try:
user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
if reply:
docs = self._room_document_context.setdefault(room.room_id, [])
docs.append({"type": "pdf", "filename": filename,
"text": reply, "timestamp": time.time()})
if len(docs) > 5:
del docs[:-5]
finally:
await self.client.room_typing(room.room_id, typing_state=False)
return
if not extracted: if not extracted:
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).") await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
return return
@@ -2303,6 +2321,24 @@ class Bot:
extracted = self._extract_text_file(file_bytes) extracted = self._extract_text_file(file_bytes)
doc_type = "text" doc_type = "text"
# Scanned PDF fallback: render pages as images for vision analysis
if not extracted and is_pdf:
page_images = self._render_pdf_pages_as_images(file_bytes)
if page_images:
await self.client.room_typing(room.room_id, typing_state=True)
try:
user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
if reply:
docs = self._room_document_context.setdefault(room.room_id, [])
docs.append({"type": "pdf", "filename": filename,
"text": reply, "timestamp": time.time()})
if len(docs) > 5:
del docs[:-5]
finally:
await self.client.room_typing(room.room_id, typing_state=False)
return
if not extracted: if not extracted:
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).") await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
return return
@@ -2345,6 +2381,28 @@ class Bot:
logger.exception("PDF text extraction failed") logger.exception("PDF text extraction failed")
return "" return ""
@staticmethod
def _render_pdf_pages_as_images(pdf_bytes: bytes, max_pages: int = 5) -> list[tuple[str, str]]:
    """Render PDF pages to PNG images for vision fallback (scanned PDFs).

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        max_pages: Maximum number of leading pages to render.

    Returns:
        List of (base64_data, mime_type) tuples, one per rendered page, in
        page order. An empty list is returned (and the error is logged) if
        the document cannot be opened or any page fails to render.
    """
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            images = []
            for i, page in enumerate(doc):
                if i >= max_pages:
                    break
                pix = page.get_pixmap(dpi=200)
                png_bytes = pix.tobytes("png")
                b64 = base64.b64encode(png_bytes).decode("utf-8")
                images.append((b64, "image/png"))
            return images
        finally:
            # Always release the document, even when rendering a page raises;
            # previously close() was skipped on the error path.
            doc.close()
    except Exception:
        # Best-effort fallback: callers treat an empty list as "no images".
        logger.exception("PDF page rendering failed")
        return []
@staticmethod @staticmethod
def _extract_docx_text(docx_bytes: bytes) -> str: def _extract_docx_text(docx_bytes: bytes) -> str:
"""Extract text from .docx bytes using python-docx.""" """Extract text from .docx bytes using python-docx."""