feat: scanned PDF fallback via vision for both plain and E2EE rooms (MAT-156)
Render scanned/image-based PDF pages to PNG at 200 DPI and send them to the AI model as image content when text extraction returns empty.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
58
bot.py
58
bot.py
@@ -2212,6 +2212,24 @@ class Bot:
|
|||||||
extracted = self._extract_text_file(file_bytes)
|
extracted = self._extract_text_file(file_bytes)
|
||||||
doc_type = "text"
|
doc_type = "text"
|
||||||
|
|
||||||
|
# Scanned PDF fallback: render pages as images for vision analysis
|
||||||
|
if not extracted and is_pdf:
|
||||||
|
page_images = self._render_pdf_pages_as_images(file_bytes)
|
||||||
|
if page_images:
|
||||||
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
|
try:
|
||||||
|
user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
|
||||||
|
reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
|
||||||
|
if reply:
|
||||||
|
docs = self._room_document_context.setdefault(room.room_id, [])
|
||||||
|
docs.append({"type": "pdf", "filename": filename,
|
||||||
|
"text": reply, "timestamp": time.time()})
|
||||||
|
if len(docs) > 5:
|
||||||
|
del docs[:-5]
|
||||||
|
finally:
|
||||||
|
await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
return
|
||||||
|
|
||||||
if not extracted:
|
if not extracted:
|
||||||
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
|
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
|
||||||
return
|
return
|
||||||
@@ -2303,6 +2321,24 @@ class Bot:
|
|||||||
extracted = self._extract_text_file(file_bytes)
|
extracted = self._extract_text_file(file_bytes)
|
||||||
doc_type = "text"
|
doc_type = "text"
|
||||||
|
|
||||||
|
# Scanned PDF fallback: render pages as images for vision analysis
|
||||||
|
if not extracted and is_pdf:
|
||||||
|
page_images = self._render_pdf_pages_as_images(file_bytes)
|
||||||
|
if page_images:
|
||||||
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
|
try:
|
||||||
|
user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
|
||||||
|
reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
|
||||||
|
if reply:
|
||||||
|
docs = self._room_document_context.setdefault(room.room_id, [])
|
||||||
|
docs.append({"type": "pdf", "filename": filename,
|
||||||
|
"text": reply, "timestamp": time.time()})
|
||||||
|
if len(docs) > 5:
|
||||||
|
del docs[:-5]
|
||||||
|
finally:
|
||||||
|
await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
return
|
||||||
|
|
||||||
if not extracted:
|
if not extracted:
|
||||||
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
|
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
|
||||||
return
|
return
|
||||||
@@ -2345,6 +2381,28 @@ class Bot:
|
|||||||
logger.exception("PDF text extraction failed")
|
logger.exception("PDF text extraction failed")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
@staticmethod
def _render_pdf_pages_as_images(pdf_bytes: bytes, max_pages: int = 5) -> list[tuple[str, str]]:
    """Render PDF pages to PNG images for vision fallback (scanned PDFs).

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        max_pages: Upper bound on the number of leading pages rendered;
            a value <= 0 renders nothing.

    Returns:
        List of (base64_data, mime_type) tuples, one per rendered page.
        An empty list is returned when opening or rendering fails; the
        failure is logged rather than raised.
    """
    images: list[tuple[str, str]] = []
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            for i, page in enumerate(doc):
                if i >= max_pages:
                    break
                # 200 DPI rasterization, encoded as PNG then base64 text.
                pix = page.get_pixmap(dpi=200)
                png_bytes = pix.tobytes("png")
                b64 = base64.b64encode(png_bytes).decode("utf-8")
                images.append((b64, "image/png"))
        finally:
            # Close the document even when a page fails to render, so a
            # bad page does not leak the open document handle.
            doc.close()
    except Exception:
        logger.exception("PDF page rendering failed")
        return []
    return images
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_docx_text(docx_bytes: bytes) -> str:
|
def _extract_docx_text(docx_bytes: bytes) -> str:
|
||||||
"""Extract text from .docx bytes using python-docx."""
|
"""Extract text from .docx bytes using python-docx."""
|
||||||
|
|||||||
Reference in New Issue
Block a user