diff --git a/bot.py b/bot.py
index 069991a..90c9b4b 100644
--- a/bot.py
+++ b/bot.py
@@ -2212,6 +2212,24 @@ class Bot:
             extracted = self._extract_text_file(file_bytes)
             doc_type = "text"
 
+        # Scanned PDF fallback: render pages as images for vision analysis
+        if not extracted and is_pdf:
+            page_images = self._render_pdf_pages_as_images(file_bytes)
+            if page_images:
+                await self.client.room_typing(room.room_id, typing_state=True)
+                try:
+                    user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
+                    reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
+                    if reply:
+                        docs = self._room_document_context.setdefault(room.room_id, [])
+                        docs.append({"type": "pdf", "filename": filename,
+                                     "text": reply, "timestamp": time.time()})
+                        if len(docs) > 5:
+                            del docs[:-5]
+                finally:
+                    await self.client.room_typing(room.room_id, typing_state=False)
+                return
+
         if not extracted:
             await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
             return
@@ -2303,6 +2321,24 @@ class Bot:
             extracted = self._extract_text_file(file_bytes)
             doc_type = "text"
 
+        # Scanned PDF fallback: render pages as images for vision analysis
+        if not extracted and is_pdf:
+            page_images = self._render_pdf_pages_as_images(file_bytes)
+            if page_images:
+                await self.client.room_typing(room.room_id, typing_state=True)
+                try:
+                    user_message = f'The user sent a scanned PDF named "{filename}" ({len(page_images)} page(s)). Analyze the document content and summarize it.'
+                    reply = await self._respond_with_ai(room, user_message, sender=event.sender, image_data=page_images[0])
+                    if reply:
+                        docs = self._room_document_context.setdefault(room.room_id, [])
+                        docs.append({"type": "pdf", "filename": filename,
+                                     "text": reply, "timestamp": time.time()})
+                        if len(docs) > 5:
+                            del docs[:-5]
+                finally:
+                    await self.client.room_typing(room.room_id, typing_state=False)
+                return
+
         if not extracted:
             await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
             return
@@ -2345,6 +2381,29 @@ class Bot:
             logger.exception("PDF text extraction failed")
             return ""
 
+    @staticmethod
+    def _render_pdf_pages_as_images(pdf_bytes: bytes, max_pages: int = 5) -> list[tuple[str, str]]:
+        """Render PDF pages to PNG images for vision fallback (scanned PDFs).
+
+        Renders at most ``max_pages`` pages, starting from the first, at 200 DPI.
+
+        Returns list of (base64_data, mime_type) tuples, one per rendered page,
+        or an empty list if the PDF cannot be opened or rendered.
+        """
+        try:
+            # Context manager closes the document even if a page fails to render.
+            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+                images = []
+                for page_number in range(min(max_pages, doc.page_count)):
+                    pix = doc.load_page(page_number).get_pixmap(dpi=200)
+                    png_bytes = pix.tobytes("png")
+                    b64 = base64.b64encode(png_bytes).decode("utf-8")
+                    images.append((b64, "image/png"))
+                return images
+        except Exception:
+            logger.exception("PDF page rendering failed")
+            return []
+
     @staticmethod
     def _extract_docx_text(docx_bytes: bytes) -> str:
         """Extract text from .docx bytes using python-docx."""