feat: Add PDF reading support to Matrix AI bot (MAT-10)

- Register RoomMessageFile callback, filter for application/pdf
- Extract text from PDFs using pymupdf (fitz)
- Send extracted text as context to LLM for summarization/Q&A
- Truncate at 50k chars to avoid token limits
- Add pymupdf to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-18 22:09:24 +02:00
parent 9b509e899f
commit 5c5f442a74
2 changed files with 92 additions and 0 deletions

91
bot.py
View File

@@ -8,6 +8,7 @@ import re
import time import time
import uuid import uuid
import fitz # pymupdf
import httpx import httpx
from openai import AsyncOpenAI from openai import AsyncOpenAI
from olm import sas as olm_sas from olm import sas as olm_sas
@@ -18,6 +19,7 @@ from nio import (
LoginResponse, LoginResponse,
InviteMemberEvent, InviteMemberEvent,
MegolmEvent, MegolmEvent,
RoomMessageFile,
RoomMessageImage, RoomMessageImage,
RoomMessageText, RoomMessageText,
RoomMessageUnknown, RoomMessageUnknown,
@@ -68,6 +70,7 @@ IMPORTANT RULES — FOLLOW THESE STRICTLY:
- NEVER ask follow-up questions about document storage or file locations. - NEVER ask follow-up questions about document storage or file locations.
- If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere. - If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere.
- You can see and analyze images that users send. Describe what you see when asked about an image. - You can see and analyze images that users send. Describe what you see when asked about an image.
- You can read and analyze PDF documents that users send. Summarize content and answer questions about them.
- You can generate images when asked — use the generate_image tool for any image creation, drawing, or illustration requests.""" - You can generate images when asked — use the generate_image tool for any image creation, drawing, or illustration requests."""
IMAGE_GEN_TOOLS = [{ IMAGE_GEN_TOOLS = [{
@@ -256,6 +259,7 @@ class Bot:
self.client.add_event_callback(self.on_unknown, UnknownEvent) self.client.add_event_callback(self.on_unknown, UnknownEvent)
self.client.add_event_callback(self.on_text_message, RoomMessageText) self.client.add_event_callback(self.on_text_message, RoomMessageText)
self.client.add_event_callback(self.on_image_message, RoomMessageImage) self.client.add_event_callback(self.on_image_message, RoomMessageImage)
self.client.add_event_callback(self.on_file_message, RoomMessageFile)
self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown) self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
self.client.add_response_callback(self.on_sync, SyncResponse) self.client.add_response_callback(self.on_sync, SyncResponse)
self.client.add_to_device_callback(self.on_key_verification, KeyVerificationStart) self.client.add_to_device_callback(self.on_key_verification, KeyVerificationStart)
@@ -494,6 +498,93 @@ class Bot:
finally: finally:
await self.client.room_typing(room.room_id, typing_state=False) await self.client.room_typing(room.room_id, typing_state=False)
async def on_file_message(self, room, event: RoomMessageFile):
    """Handle a file message: extract the text of a PDF and send it to the AI.

    The event is ignored when it was sent by the bot itself, when the
    initial sync token has not been received yet, when it is older than
    30 seconds, or when it is not a PDF (judged by the declared MIME type
    or a ``.pdf`` filename suffix).  In rooms with more than two members
    the file is only processed when the event body names the bot.

    Args:
        room: Room the event arrived in; ``room_id`` and ``member_count``
            are read from it.
        event: The file event; ``sender``, ``server_timestamp``,
            ``source``, ``body`` and ``url`` are read from it.
    """
    max_event_age_s = 30
    # Roughly 50k chars ≈ 12k tokens; keeps the prompt below token limits.
    max_pdf_chars = 50000

    if event.sender == BOT_USER:
        return
    if not self._sync_token_received:
        return
    # server_timestamp is divided by 1000 to compare against time.time().
    if time.time() - event.server_timestamp / 1000 > max_event_age_s:
        return

    # Only handle PDFs.  The raw content is supplied by the sending client,
    # so every level is type-checked: a payload such as ``"info": null``
    # must not raise inside the event callback.
    source = event.source if isinstance(event.source, dict) else {}
    content = source.get("content")
    if not isinstance(content, dict):
        content = {}
    info = content.get("info")
    if not isinstance(info, dict):
        info = {}
    mime_type = info.get("mimetype", "")
    filename = content.get("body", "file")
    if not isinstance(filename, str):
        filename = "file"
    if mime_type != "application/pdf" and not filename.lower().endswith(".pdf"):
        return

    await self._load_room_settings(room.room_id)

    # In DMs respond to all files; in groups only if the body of this file
    # event mentions the bot.
    is_dm = room.member_count == 2
    if not is_dm:
        body = (event.body or "").strip()
        # Local part of the bot's user id ("@name:server" -> "name"),
        # lower-cased once so the comparison is case-insensitive on both
        # sides.  A bare-name match also covers the "@name" form.
        bot_name = self.client.user_id.split(":")[0].lstrip("@").lower()
        mentioned = BOT_USER in body or bot_name in body.lower()
        if not mentioned:
            return

    if not self.llm:
        await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
        return

    # Download PDF
    mxc_url = event.url
    if not mxc_url:
        return
    try:
        resp = await self.client.download(mxc=mxc_url)
        # NOTE(review): presumably a failed download yields a response
        # object without a ``body`` attribute - confirm against matrix-nio.
        if not hasattr(resp, "body"):
            logger.warning("File download failed for %s", mxc_url)
            return
        pdf_bytes = resp.body
    except Exception:
        logger.exception("Failed to download file %s", mxc_url)
        return

    # Extract text from PDF
    pdf_text = self._extract_pdf_text(pdf_bytes)
    if not pdf_text:
        await self._send_text(room.room_id, "I couldn't extract any text from that PDF.")
        return

    # Truncate to avoid token limits
    if len(pdf_text) > max_pdf_chars:
        pdf_text = pdf_text[:max_pdf_chars] + "\n\n[... truncated, PDF too long ...]"

    user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'

    await self.client.room_typing(room.room_id, typing_state=True)
    try:
        await self._respond_with_ai(room, user_message, sender=event.sender)
    finally:
        # Always clear the typing indicator, even if the AI call fails.
        await self.client.room_typing(room.room_id, typing_state=False)
@staticmethod
def _extract_pdf_text(pdf_bytes: bytes) -> str:
    """Extract text from PDF bytes using pymupdf.

    Every page whose stripped text is non-empty contributes a section of
    the form ``--- Page N ---`` (1-based page number) followed by the
    page text; sections are separated by a blank line.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The combined text, or ``""`` when the input is empty, no page
        yields text, or extraction fails (the failure is logged).
    """
    if not pdf_bytes:
        # Nothing to parse; avoid logging a traceback for an empty download.
        return ""
    try:
        # The context manager closes the document even when a page fails
        # to render its text part-way through the loop.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            pages = []
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                if text:
                    pages.append(f"--- Page {page_number} ---\n{text}")
    except Exception:
        logger.exception("PDF text extraction failed")
        return ""
    return "\n\n".join(pages)
async def _handle_command(self, room, cmd: str, event=None): async def _handle_command(self, room, cmd: str, event=None):
if cmd == "help": if cmd == "help":
await self._send_text(room.room_id, HELP_TEXT) await self._send_text(room.room_id, HELP_TEXT)

View File

@@ -8,3 +8,4 @@ matrix-nio[e2e]>=0.25,<1.0
canonicaljson>=2.0,<3.0 canonicaljson>=2.0,<3.0
httpx>=0.27,<1.0 httpx>=0.27,<1.0
openai>=2.0,<3.0 openai>=2.0,<3.0
pymupdf>=1.24,<2.0