feat: Add PDF reading support to Matrix AI bot (MAT-10)

- Register RoomMessageFile callback, filter for application/pdf
- Extract text from PDFs using pymupdf (fitz)
- Send extracted text as context to LLM for summarization/Q&A
- Truncate at 50k chars to avoid token limits
- Add pymupdf to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-18 22:09:24 +02:00
parent 9b509e899f
commit 5c5f442a74
2 changed files with 92 additions and 0 deletions
--- a/bot.py
+++ b/bot.py
@@ -8,6 +8,7 @@ import re
 import time
 import uuid
 import fitz  # pymupdf
 import httpx
 from openai import AsyncOpenAI
 from olm import sas as olm_sas
@@ -18,6 +19,7 @@ from nio import (
    LoginResponse,
    InviteMemberEvent,
    MegolmEvent,
    RoomMessageFile,
    RoomMessageImage,
    RoomMessageText,
    RoomMessageUnknown,
@@ -68,6 +70,7 @@ IMPORTANT RULES — FOLLOW THESE STRICTLY:
 - NEVER ask follow-up questions about document storage or file locations.
 - If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere.
 - You can see and analyze images that users send. Describe what you see when asked about an image.
 - You can read and analyze PDF documents that users send. Summarize content and answer questions about them.
 - You can generate images when asked — use the generate_image tool for any image creation, drawing, or illustration requests."""
 IMAGE_GEN_TOOLS = [{
@@ -256,6 +259,7 @@ class Bot:
        self.client.add_event_callback(self.on_unknown, UnknownEvent)
        self.client.add_event_callback(self.on_text_message, RoomMessageText)
        self.client.add_event_callback(self.on_image_message, RoomMessageImage)
        self.client.add_event_callback(self.on_file_message, RoomMessageFile)
        self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
        self.client.add_response_callback(self.on_sync, SyncResponse)
        self.client.add_to_device_callback(self.on_key_verification, KeyVerificationStart)
@@ -494,6 +498,93 @@ class Bot:
        finally:
            await self.client.room_typing(room.room_id, typing_state=False)
    async def on_file_message(self, room, event: RoomMessageFile):
        """Handle a file message: extract text from a PDF and send it to the AI.

        The event is ignored when it was sent by ``BOT_USER``, when
        ``self._sync_token_received`` is falsy, when its server timestamp is
        more than 30 seconds old, when it is not a PDF (judged by the declared
        MIME type or a ``.pdf`` filename suffix), or, in rooms whose member
        count is not exactly 2, when the message body does not mention the bot.

        Args:
            room: Room the event arrived in; ``room_id`` and ``member_count``
                are read from it.
            event: The file message event. Its raw ``source`` dict, ``body``,
                ``url``, ``sender`` and ``server_timestamp`` are used.
        """
        if event.sender == BOT_USER:
            return
        if not self._sync_token_received:
            return
        server_ts = event.server_timestamp / 1000  # milliseconds -> seconds
        if time.time() - server_ts > 30:
            return

        # Only handle PDFs. The raw content is supplied by the sender, so do
        # not trust its shape: a null or non-object "info" (or a non-string
        # "body") must not raise inside the event callback.
        source = event.source or {}
        content = source.get("content", {})
        if not isinstance(content, dict):
            content = {}
        info = content.get("info", {})
        if not isinstance(info, dict):
            info = {}
        mime_type = info.get("mimetype", "")
        filename = content.get("body", "file")
        if not isinstance(filename, str):
            filename = "file"
        if mime_type != "application/pdf" and not filename.lower().endswith(".pdf"):
            return

        await self._load_room_settings(room.room_id)

        # In DMs respond to all files; in groups only if the body of this
        # file message mentions the bot.
        is_dm = room.member_count == 2
        if not is_dm:
            body = (event.body or "").strip()
            bot_display = self.client.user_id.split(":")[0].lstrip("@")
            # Case-insensitive match on the localpart; this also matches the
            # "@localpart" form, so no separate check for it is needed.
            mentioned = BOT_USER in body or bot_display.lower() in body.lower()
            if not mentioned:
                return

        if not self.llm:
            await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
            return

        # Download PDF
        mxc_url = event.url
        if not mxc_url:
            return
        try:
            resp = await self.client.download(mxc=mxc_url)
            # A response without a body is treated as a failed download.
            if not hasattr(resp, "body"):
                logger.warning("File download failed for %s", mxc_url)
                return
            pdf_bytes = resp.body
        except Exception:
            logger.exception("Failed to download file %s", mxc_url)
            return

        # Extract text from PDF
        pdf_text = self._extract_pdf_text(pdf_bytes)
        if not pdf_text:
            await self._send_text(room.room_id, "I couldn't extract any text from that PDF.")
            return

        # Truncate to avoid token limits (roughly 50k chars ≈ 12k tokens)
        if len(pdf_text) > 50000:
            pdf_text = pdf_text[:50000] + "\n\n[... truncated, PDF too long ...]"

        user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'

        await self.client.room_typing(room.room_id, typing_state=True)
        try:
            await self._respond_with_ai(room, user_message, sender=event.sender)
        finally:
            # Always clear the typing indicator, even if the AI call fails.
            await self.client.room_typing(room.room_id, typing_state=False)
    @staticmethod
    def _extract_pdf_text(pdf_bytes: bytes) -> str:
        """Extract text from PDF bytes using pymupdf.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            The stripped text of every page that has any, each prefixed with a
            ``--- Page N ---`` header (1-based) and joined by blank lines.
            An empty string when no page yields text or when extraction
            fails; failures are logged, not raised.
        """
        try:
            # The context manager closes the document even if a page raises
            # part-way through extraction.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                pages = []
                for page_number, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        pages.append(f"--- Page {page_number} ---\n{text}")
            return "\n\n".join(pages)
        except Exception:
            logger.exception("PDF text extraction failed")
            return ""
    async def _handle_command(self, room, cmd: str, event=None):
        if cmd == "help":
            await self._send_text(room.room_id, HELP_TEXT)
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@ matrix-nio[e2e]>=0.25,<1.0
 canonicaljson>=2.0,<3.0
 httpx>=0.27,<1.0
 openai>=2.0,<3.0
 pymupdf>=1.24,<2.0