feat(voice): all file types + images in voice context (MAT-10)
Generalize PDF-only voice context to support all document types:
- Rename _room_pdf_context → _room_document_context (list-based, capped at 5 entries per room)
- Handle .docx (python-docx), .txt, .md, .csv, .json, .xml, .html, .yaml, .yml, .log
- Store AI image descriptions for voice context
- Multi-document context building with type labels and per-type truncation
- _respond_with_ai now returns reply text for caller use

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
125
bot.py
125
bot.py
@@ -9,6 +9,7 @@ import re
|
|||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
import docx
|
||||||
import fitz # pymupdf
|
import fitz # pymupdf
|
||||||
import httpx
|
import httpx
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI
|
||||||
@@ -281,7 +282,7 @@ class Bot:
|
|||||||
self._pending_connects: dict[str, str] = {} # matrix_user_id -> device_code
|
self._pending_connects: dict[str, str] = {} # matrix_user_id -> device_code
|
||||||
self._pending_translate: dict[str, dict] = {} # sender -> {text, detected_lang, room_id}
|
self._pending_translate: dict[str, dict] = {} # sender -> {text, detected_lang, room_id}
|
||||||
self._pending_reply: dict[str, dict] = {} # sender -> {target_lang}
|
self._pending_reply: dict[str, dict] = {} # sender -> {target_lang}
|
||||||
self._room_pdf_context: dict[str, dict] = {} # room_id -> {filename, text, timestamp}
|
self._room_document_context: dict[str, list[dict]] = {} # room_id -> [{type, filename, text, timestamp}, ...]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _load_user_keys() -> dict[str, str]:
|
def _load_user_keys() -> dict[str, str]:
|
||||||
@@ -463,13 +464,19 @@ class Bot:
|
|||||||
import secrets
|
import secrets
|
||||||
bot_key = secrets.token_bytes(16)
|
bot_key = secrets.token_bytes(16)
|
||||||
|
|
||||||
# Get PDF context if recently uploaded (within 1 hour)
|
# Collect all recent document contexts (< 1 hour)
|
||||||
pdf_ctx = self._room_pdf_context.get(room_id, {})
|
doc_entries = [e for e in self._room_document_context.get(room_id, [])
|
||||||
pdf_text = None
|
if time.time() - e["timestamp"] < 3600]
|
||||||
if pdf_ctx and time.time() - pdf_ctx.get("timestamp", 0) < 3600:
|
document_context = None
|
||||||
pdf_text = pdf_ctx.get("text")
|
if doc_entries:
|
||||||
logger.info("Passing PDF context to voice session: %s (%d chars)",
|
parts = []
|
||||||
pdf_ctx.get("filename", "?"), len(pdf_text) if pdf_text else 0)
|
for e in doc_entries:
|
||||||
|
label = {"pdf": "PDF", "image": "Bild", "text": "Datei"}.get(e["type"], "Dokument")
|
||||||
|
text = e["text"][:10000] if e["type"] != "image" else e["text"][:2000]
|
||||||
|
parts.append(f"[{label}: {e['filename']}]\n{text}")
|
||||||
|
document_context = "\n\n".join(parts)
|
||||||
|
logger.info("Passing %d document context(s) to voice session (%d chars total)",
|
||||||
|
len(doc_entries), len(document_context))
|
||||||
|
|
||||||
vs = VoiceSession(
|
vs = VoiceSession(
|
||||||
nio_client=self.client,
|
nio_client=self.client,
|
||||||
@@ -482,7 +489,7 @@ class Bot:
|
|||||||
self._publish_encryption_key(rid, key)),
|
self._publish_encryption_key(rid, key)),
|
||||||
memory=self.memory,
|
memory=self.memory,
|
||||||
caller_user_id=event.sender,
|
caller_user_id=event.sender,
|
||||||
document_context=pdf_text,
|
document_context=document_context,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check timeline for caller's key
|
# Check timeline for caller's key
|
||||||
@@ -858,7 +865,13 @@ class Bot:
|
|||||||
|
|
||||||
await self.client.room_typing(room.room_id, typing_state=True)
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
try:
|
try:
|
||||||
await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
|
reply = await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
|
||||||
|
if reply:
|
||||||
|
docs = self._room_document_context.setdefault(room.room_id, [])
|
||||||
|
docs.append({"type": "image", "filename": caption or "image",
|
||||||
|
"text": reply, "timestamp": time.time()})
|
||||||
|
if len(docs) > 5:
|
||||||
|
del docs[:-5]
|
||||||
finally:
|
finally:
|
||||||
await self.client.room_typing(room.room_id, typing_state=False)
|
await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
|
||||||
@@ -916,12 +929,23 @@ class Bot:
|
|||||||
|
|
||||||
await self.client.room_typing(room.room_id, typing_state=True)
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
try:
|
try:
|
||||||
await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
|
reply = await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
|
||||||
|
if reply:
|
||||||
|
docs = self._room_document_context.setdefault(room.room_id, [])
|
||||||
|
docs.append({"type": "image", "filename": caption or "image",
|
||||||
|
"text": reply, "timestamp": time.time()})
|
||||||
|
if len(docs) > 5:
|
||||||
|
del docs[:-5]
|
||||||
finally:
|
finally:
|
||||||
await self.client.room_typing(room.room_id, typing_state=False)
|
await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
|
||||||
|
# Supported text-based file extensions
|
||||||
|
_TEXT_EXTENSIONS = frozenset({
|
||||||
|
".txt", ".md", ".csv", ".json", ".xml", ".html", ".yaml", ".yml", ".log",
|
||||||
|
})
|
||||||
|
|
||||||
async def on_file_message(self, room, event: RoomMessageFile):
|
async def on_file_message(self, room, event: RoomMessageFile):
|
||||||
"""Handle file messages: extract text from PDFs and send to AI."""
|
"""Handle file messages: extract text from PDFs, docx, and text files."""
|
||||||
if event.sender == BOT_USER:
|
if event.sender == BOT_USER:
|
||||||
return
|
return
|
||||||
if not self._sync_token_received:
|
if not self._sync_token_received:
|
||||||
@@ -930,14 +954,19 @@ class Bot:
|
|||||||
if time.time() - server_ts > 30:
|
if time.time() - server_ts > 30:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Only handle PDFs
|
|
||||||
source = event.source or {}
|
source = event.source or {}
|
||||||
content = source.get("content", {})
|
content = source.get("content", {})
|
||||||
info = content.get("info", {})
|
info = content.get("info", {})
|
||||||
mime_type = info.get("mimetype", "")
|
mime_type = info.get("mimetype", "")
|
||||||
filename = content.get("body", "file")
|
filename = content.get("body", "file")
|
||||||
|
ext = os.path.splitext(filename.lower())[1]
|
||||||
|
|
||||||
if mime_type != "application/pdf" and not filename.lower().endswith(".pdf"):
|
# Determine file type
|
||||||
|
is_pdf = mime_type == "application/pdf" or ext == ".pdf"
|
||||||
|
is_docx = mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or ext == ".docx"
|
||||||
|
is_text = ext in self._TEXT_EXTENSIONS or mime_type.startswith("text/")
|
||||||
|
|
||||||
|
if not (is_pdf or is_docx or is_text):
|
||||||
return
|
return
|
||||||
|
|
||||||
await self._load_room_settings(room.room_id)
|
await self._load_room_settings(room.room_id)
|
||||||
@@ -959,7 +988,7 @@ class Bot:
|
|||||||
await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
|
await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Download PDF
|
# Download file
|
||||||
mxc_url = event.url
|
mxc_url = event.url
|
||||||
if not mxc_url:
|
if not mxc_url:
|
||||||
return
|
return
|
||||||
@@ -968,29 +997,43 @@ class Bot:
|
|||||||
if not hasattr(resp, "body"):
|
if not hasattr(resp, "body"):
|
||||||
logger.warning("File download failed for %s", mxc_url)
|
logger.warning("File download failed for %s", mxc_url)
|
||||||
return
|
return
|
||||||
pdf_bytes = resp.body
|
file_bytes = resp.body
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("Failed to download file %s", mxc_url)
|
logger.exception("Failed to download file %s", mxc_url)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Extract text from PDF
|
# Extract text based on file type
|
||||||
pdf_text = self._extract_pdf_text(pdf_bytes)
|
if is_pdf:
|
||||||
if not pdf_text:
|
extracted = self._extract_pdf_text(file_bytes)
|
||||||
await self._send_text(room.room_id, "I couldn't extract any text from that PDF.")
|
doc_type = "pdf"
|
||||||
|
elif is_docx:
|
||||||
|
extracted = self._extract_docx_text(file_bytes)
|
||||||
|
doc_type = "text"
|
||||||
|
else:
|
||||||
|
extracted = self._extract_text_file(file_bytes)
|
||||||
|
doc_type = "text"
|
||||||
|
|
||||||
|
if not extracted:
|
||||||
|
await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Truncate to avoid token limits (roughly 50k chars ≈ 12k tokens)
|
# Truncate to avoid token limits (roughly 50k chars ≈ 12k tokens)
|
||||||
if len(pdf_text) > 50000:
|
if len(extracted) > 50000:
|
||||||
pdf_text = pdf_text[:50000] + "\n\n[... truncated, PDF too long ...]"
|
extracted = extracted[:50000] + "\n\n[... truncated, file too long ...]"
|
||||||
|
|
||||||
# Store PDF context for voice session pickup
|
# Store document context for voice session pickup
|
||||||
self._room_pdf_context[room.room_id] = {
|
docs = self._room_document_context.setdefault(room.room_id, [])
|
||||||
|
docs.append({
|
||||||
|
"type": doc_type,
|
||||||
"filename": filename,
|
"filename": filename,
|
||||||
"text": pdf_text,
|
"text": extracted,
|
||||||
"timestamp": time.time(),
|
"timestamp": time.time(),
|
||||||
}
|
})
|
||||||
|
if len(docs) > 5:
|
||||||
|
del docs[:-5]
|
||||||
|
|
||||||
user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'
|
label = "PDF" if is_pdf else "Word document" if is_docx else "file"
|
||||||
|
user_message = f'The user sent a {label} named "{filename}". Here is the extracted text:\n\n{extracted}\n\nPlease summarize or answer questions about this document.'
|
||||||
|
|
||||||
await self.client.room_typing(room.room_id, typing_state=True)
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
try:
|
try:
|
||||||
@@ -1014,6 +1057,28 @@ class Bot:
|
|||||||
logger.exception("PDF text extraction failed")
|
logger.exception("PDF text extraction failed")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
@staticmethod
def _extract_docx_text(docx_bytes: bytes) -> str:
    """Return the text of a .docx file given as raw bytes.

    The document is parsed with python-docx and the text of every paragraph
    that is not blank is joined with newlines. Any failure while parsing or
    reading is logged and reported to the caller as an empty string.
    """
    try:
        stream = io.BytesIO(docx_bytes)
        paragraphs = docx.Document(stream).paragraphs
        # Skip paragraphs that are empty or whitespace-only.
        non_blank = [para.text for para in paragraphs if para.text.strip()]
        return "\n".join(non_blank)
    except Exception:
        logger.exception("DOCX text extraction failed")
        return ""
|
||||||
|
|
||||||
|
@staticmethod
def _extract_text_file(file_bytes: bytes) -> str:
    """Decode the raw bytes of a text file into a string.

    UTF-8 is tried first. The ``utf-8-sig`` codec is used so that a leading
    byte-order mark (written by many Windows editors) is dropped instead of
    ending up as a stray invisible character at the start of the text.
    Input that is not valid UTF-8 falls back to latin-1.

    Args:
        file_bytes: Raw file content.

    Returns:
        The decoded text; an empty string for empty input.
    """
    try:
        return file_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # latin-1 maps every possible byte value to a code point, so this
        # decode cannot fail and needs no error handling of its own.
        return file_bytes.decode("latin-1")
|
||||||
|
|
||||||
async def _handle_command(self, room, cmd: str, event=None):
|
async def _handle_command(self, room, cmd: str, event=None):
|
||||||
if cmd == "help":
|
if cmd == "help":
|
||||||
await self._send_text(room.room_id, HELP_TEXT)
|
await self._send_text(room.room_id, HELP_TEXT)
|
||||||
@@ -1239,7 +1304,8 @@ class Bot:
|
|||||||
finally:
|
finally:
|
||||||
self._pending_connects.pop(sender, None)
|
self._pending_connects.pop(sender, None)
|
||||||
|
|
||||||
async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None):
|
async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None) -> str | None:
|
||||||
|
"""Send AI response and return the reply text (or None on failure)."""
|
||||||
model = self.room_models.get(room.room_id, DEFAULT_MODEL)
|
model = self.room_models.get(room.room_id, DEFAULT_MODEL)
|
||||||
|
|
||||||
# Fetch conversation history FIRST (needed for query rewriting)
|
# Fetch conversation history FIRST (needed for query rewriting)
|
||||||
@@ -1333,9 +1399,12 @@ class Bot:
|
|||||||
gap_seconds = time.time() - last_rename if last_rename else float("inf")
|
gap_seconds = time.time() - last_rename if last_rename else float("inf")
|
||||||
if gap_seconds > 300:
|
if gap_seconds > 300:
|
||||||
await self._auto_rename_room(room, user_message, reply)
|
await self._auto_rename_room(room, user_message, reply)
|
||||||
|
|
||||||
|
return reply
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("LLM call failed")
|
logger.exception("LLM call failed")
|
||||||
await self._send_text(room.room_id, "Sorry, I couldn't generate a response.")
|
await self._send_text(room.room_id, "Sorry, I couldn't generate a response.")
|
||||||
|
return None
|
||||||
|
|
||||||
async def _rewrite_query(self, user_message: str, history: list[dict], model: str) -> str:
|
async def _rewrite_query(self, user_message: str, history: list[dict], model: str) -> str:
|
||||||
"""Rewrite user message into a standalone search query using conversation context."""
|
"""Rewrite user message into a standalone search query using conversation context."""
|
||||||
|
|||||||
@@ -9,3 +9,4 @@ canonicaljson>=2.0,<3.0
|
|||||||
httpx>=0.27,<1.0
|
httpx>=0.27,<1.0
|
||||||
openai>=2.0,<3.0
|
openai>=2.0,<3.0
|
||||||
pymupdf>=1.24,<2.0
|
pymupdf>=1.24,<2.0
|
||||||
|
python-docx>=1.0,<2.0
|
||||||
|
|||||||
Reference in New Issue
Block a user