feat: scanned PDF fallback via vision for both plain and E2EE rooms (MAT-156)
Render scanned/image-based PDF pages to PNG at 200 DPI and send them to the AI model as image content when text extraction returns nothing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
58
bot.py
58
bot.py
@@ -2212,6 +2212,24 @@ class Bot:
|
||||
extracted = self._extract_text_file(file_bytes)
|
||||
doc_type = "text"
|
||||
|
||||
# Scanned PDF fallback: render pages as images for vision analysis
|
||||
if not extracted and is_pdf:
|
||||
page_images = self._render_pdf_pages_as_images(file_bytes)
|
||||
if page_images:
|
||||
await self.client.room_typing(room.room_id, typing_state=True)
|
||||
try:
|
||||
user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
|
||||
reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
|
||||
if reply:
|
||||
docs = self._room_document_context.setdefault(room.room_id, [])
|
||||
docs.append({"type": "pdf", "filename": filename,
|
||||
"text": reply, "timestamp": time.time()})
|
||||
if len(docs) > 5:
|
||||
del docs[:-5]
|
||||
finally:
|
||||
await self.client.room_typing(room.room_id, typing_state=False)
|
||||
return
|
||||
|
||||
if not extracted:
|
||||
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
|
||||
return
|
||||
@@ -2303,6 +2321,24 @@ class Bot:
|
||||
extracted = self._extract_text_file(file_bytes)
|
||||
doc_type = "text"
|
||||
|
||||
# Scanned PDF fallback: render pages as images for vision analysis
|
||||
if not extracted and is_pdf:
|
||||
page_images = self._render_pdf_pages_as_images(file_bytes)
|
||||
if page_images:
|
||||
await self.client.room_typing(room.room_id, typing_state=True)
|
||||
try:
|
||||
user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
|
||||
reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
|
||||
if reply:
|
||||
docs = self._room_document_context.setdefault(room.room_id, [])
|
||||
docs.append({"type": "pdf", "filename": filename,
|
||||
"text": reply, "timestamp": time.time()})
|
||||
if len(docs) > 5:
|
||||
del docs[:-5]
|
||||
finally:
|
||||
await self.client.room_typing(room.room_id, typing_state=False)
|
||||
return
|
||||
|
||||
if not extracted:
|
||||
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
|
||||
return
|
||||
@@ -2345,6 +2381,28 @@ class Bot:
|
||||
logger.exception("PDF text extraction failed")
|
||||
return ""
|
||||
|
||||
@staticmethod
def _render_pdf_pages_as_images(pdf_bytes: bytes, max_pages: int = 5, dpi: int = 200) -> list[tuple[str, str]]:
    """Render PDF pages to PNG images for vision fallback (scanned PDFs).

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        max_pages: Upper bound on the number of leading pages rendered.
        dpi: Resolution handed to the page renderer.

    Returns:
        List of (base64_data, mime_type) tuples, one per rendered page, in
        page order. An empty list is returned when rendering fails for any
        reason; the failure is logged instead of raised.
    """
    try:
        # Open via a context manager so the document is closed even when a
        # page fails to render part-way through; a trailing close() call
        # would be skipped on that path.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            images: list[tuple[str, str]] = []
            for i, page in enumerate(doc):
                if i >= max_pages:
                    break
                png_bytes = page.get_pixmap(dpi=dpi).tobytes("png")
                b64 = base64.b64encode(png_bytes).decode("utf-8")
                images.append((b64, "image/png"))
            return images
    except Exception:
        # Deliberately best-effort: callers treat an empty list as
        # "no images available" and fall through to their own handling.
        logger.exception("PDF page rendering failed")
        return []
|
||||
|
||||
@staticmethod
|
||||
def _extract_docx_text(docx_bytes: bytes) -> str:
|
||||
"""Extract text from .docx bytes using python-docx."""
|
||||
|
||||
Reference in New Issue
Block a user