feat: Add PDF reading support to Matrix AI bot (MAT-10)
- Register RoomMessageFile callback, filter for application/pdf
- Extract text from PDFs using pymupdf (fitz)
- Send extracted text as context to LLM for summarization/Q&A
- Truncate at 50k chars to avoid token limits
- Add pymupdf to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
91
bot.py
91
bot.py
@@ -8,6 +8,7 @@ import re
|
|||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
import fitz # pymupdf
|
||||||
import httpx
|
import httpx
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI
|
||||||
from olm import sas as olm_sas
|
from olm import sas as olm_sas
|
||||||
@@ -18,6 +19,7 @@ from nio import (
|
|||||||
LoginResponse,
|
LoginResponse,
|
||||||
InviteMemberEvent,
|
InviteMemberEvent,
|
||||||
MegolmEvent,
|
MegolmEvent,
|
||||||
|
RoomMessageFile,
|
||||||
RoomMessageImage,
|
RoomMessageImage,
|
||||||
RoomMessageText,
|
RoomMessageText,
|
||||||
RoomMessageUnknown,
|
RoomMessageUnknown,
|
||||||
@@ -68,6 +70,7 @@ IMPORTANT RULES — FOLLOW THESE STRICTLY:
|
|||||||
- NEVER ask follow-up questions about document storage or file locations.
|
- NEVER ask follow-up questions about document storage or file locations.
|
||||||
- If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere.
|
- If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere.
|
||||||
- You can see and analyze images that users send. Describe what you see when asked about an image.
|
- You can see and analyze images that users send. Describe what you see when asked about an image.
|
||||||
|
- You can read and analyze PDF documents that users send. Summarize content and answer questions about them.
|
||||||
- You can generate images when asked — use the generate_image tool for any image creation, drawing, or illustration requests."""
|
- You can generate images when asked — use the generate_image tool for any image creation, drawing, or illustration requests."""
|
||||||
|
|
||||||
IMAGE_GEN_TOOLS = [{
|
IMAGE_GEN_TOOLS = [{
|
||||||
@@ -256,6 +259,7 @@ class Bot:
|
|||||||
self.client.add_event_callback(self.on_unknown, UnknownEvent)
|
self.client.add_event_callback(self.on_unknown, UnknownEvent)
|
||||||
self.client.add_event_callback(self.on_text_message, RoomMessageText)
|
self.client.add_event_callback(self.on_text_message, RoomMessageText)
|
||||||
self.client.add_event_callback(self.on_image_message, RoomMessageImage)
|
self.client.add_event_callback(self.on_image_message, RoomMessageImage)
|
||||||
|
self.client.add_event_callback(self.on_file_message, RoomMessageFile)
|
||||||
self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
|
self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
|
||||||
self.client.add_response_callback(self.on_sync, SyncResponse)
|
self.client.add_response_callback(self.on_sync, SyncResponse)
|
||||||
self.client.add_to_device_callback(self.on_key_verification, KeyVerificationStart)
|
self.client.add_to_device_callback(self.on_key_verification, KeyVerificationStart)
|
||||||
@@ -494,6 +498,93 @@ class Bot:
|
|||||||
finally:
|
finally:
|
||||||
await self.client.room_typing(room.room_id, typing_state=False)
|
await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
|
||||||
|
async def on_file_message(self, room, event: RoomMessageFile):
    """Handle a file message: extract text from a PDF and send it to the AI.

    The event is dropped without a reply when it was sent by the bot itself,
    arrives before the first sync token, is older than 30 seconds, is not a
    PDF, was posted in a group room without mentioning the bot in its body,
    or carries no download URL.

    Args:
        room: Room the event was received in; ``room_id`` and
            ``member_count`` are read.
        event: The file event; ``sender``, ``server_timestamp``, ``source``,
            ``body`` and ``url`` are read.
    """
    if event.sender == BOT_USER:
        return
    if not self._sync_token_received:
        return
    server_ts = event.server_timestamp / 1000
    if time.time() - server_ts > 30:
        return

    # Only handle PDFs. The payload is supplied by the sender, so tolerate
    # explicit nulls or wrong types instead of raising AttributeError from
    # inside the event callback.
    source = event.source or {}
    content = source.get("content")
    if not isinstance(content, dict):
        content = {}
    info = content.get("info")
    if not isinstance(info, dict):
        info = {}
    mime_type = info.get("mimetype", "")
    filename = content.get("body", "file")
    if not isinstance(filename, str):
        filename = "file"

    if mime_type != "application/pdf" and not filename.lower().endswith(".pdf"):
        return

    await self._load_room_settings(room.room_id)

    # In DMs respond to every PDF; in group rooms only when the body of this
    # file event mentions the bot. NOTE: only event.body is inspected, so a
    # mention in an earlier message is not taken into account.
    is_dm = room.member_count == 2
    if not is_dm:
        body = (event.body or "").strip()
        bot_display = self.client.user_id.split(":")[0].lstrip("@")
        # A bare localpart match also covers the "@localpart" form.
        mentioned = BOT_USER in body or bot_display.lower() in body.lower()
        if not mentioned:
            return

    if not self.llm:
        await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
        return

    # Download PDF
    mxc_url = event.url
    if not mxc_url:
        return
    try:
        resp = await self.client.download(mxc=mxc_url)
        # A response without a ``body`` attribute is treated as a failed download.
        if not hasattr(resp, "body"):
            logger.warning("File download failed for %s", mxc_url)
            return
        pdf_bytes = resp.body
    except Exception:
        logger.exception("Failed to download file %s", mxc_url)
        return

    # Extract text from PDF
    pdf_text = self._extract_pdf_text(pdf_bytes)
    if not pdf_text:
        await self._send_text(room.room_id, "I couldn't extract any text from that PDF.")
        return

    # Truncate to avoid token limits (roughly 50k chars ≈ 12k tokens)
    max_chars = 50000
    if len(pdf_text) > max_chars:
        pdf_text = pdf_text[:max_chars] + "\n\n[... truncated, PDF too long ...]"

    user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'

    await self.client.room_typing(room.room_id, typing_state=True)
    try:
        await self._respond_with_ai(room, user_message, sender=event.sender)
    finally:
        # Always clear the typing indicator, even if the AI call fails.
        await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
|
||||||
|
@staticmethod
def _extract_pdf_text(pdf_bytes: bytes) -> str:
    """Extract text from PDF bytes using pymupdf.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The text of every page that contains text, each prefixed with a
        ``--- Page N ---`` header (1-based) and joined by blank lines.
        An empty string when no page has text or when extraction fails
        (the failure is logged).
    """
    try:
        # The context manager closes the document even when a page raises
        # during extraction, so the handle is never leaked.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            pages = []
            for number, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                if text:
                    pages.append(f"--- Page {number} ---\n{text}")
        return "\n\n".join(pages)
    except Exception:
        logger.exception("PDF text extraction failed")
        return ""
|
||||||
|
|
||||||
async def _handle_command(self, room, cmd: str, event=None):
|
async def _handle_command(self, room, cmd: str, event=None):
|
||||||
if cmd == "help":
|
if cmd == "help":
|
||||||
await self._send_text(room.room_id, HELP_TEXT)
|
await self._send_text(room.room_id, HELP_TEXT)
|
||||||
|
|||||||
@@ -8,3 +8,4 @@ matrix-nio[e2e]>=0.25,<1.0
|
|||||||
canonicaljson>=2.0,<3.0
|
canonicaljson>=2.0,<3.0
|
||||||
httpx>=0.27,<1.0
|
httpx>=0.27,<1.0
|
||||||
openai>=2.0,<3.0
|
openai>=2.0,<3.0
|
||||||
|
pymupdf>=1.24,<2.0
|
||||||
|
|||||||
Reference in New Issue
Block a user