diff --git a/bot.py b/bot.py
index 69569f0..85438f2 100644
--- a/bot.py
+++ b/bot.py
@@ -9,6 +9,7 @@ import re
 import time
 import uuid
 
+import docx
 import fitz  # pymupdf
 import httpx
 from openai import AsyncOpenAI
@@ -281,7 +282,7 @@ class Bot:
         self._pending_connects: dict[str, str] = {}  # matrix_user_id -> device_code
         self._pending_translate: dict[str, dict] = {}  # sender -> {text, detected_lang, room_id}
         self._pending_reply: dict[str, dict] = {}  # sender -> {target_lang}
-        self._room_pdf_context: dict[str, dict] = {}  # room_id -> {filename, text, timestamp}
+        self._room_document_context: dict[str, list[dict]] = {}  # room_id -> [{type, filename, text, timestamp}, ...]
 
     @staticmethod
     def _load_user_keys() -> dict[str, str]:
@@ -463,13 +464,19 @@ class Bot:
             import secrets
             bot_key = secrets.token_bytes(16)
 
-            # Get PDF context if recently uploaded (within 1 hour)
-            pdf_ctx = self._room_pdf_context.get(room_id, {})
-            pdf_text = None
-            if pdf_ctx and time.time() - pdf_ctx.get("timestamp", 0) < 3600:
-                pdf_text = pdf_ctx.get("text")
-                logger.info("Passing PDF context to voice session: %s (%d chars)",
-                            pdf_ctx.get("filename", "?"), len(pdf_text) if pdf_text else 0)
+            # Collect all recent document contexts (< 1 hour)
+            doc_entries = [e for e in self._room_document_context.get(room_id, [])
+                           if time.time() - e["timestamp"] < 3600]
+            document_context = None
+            if doc_entries:
+                parts = []
+                for e in doc_entries:
+                    label = {"pdf": "PDF", "image": "Bild", "text": "Datei"}.get(e["type"], "Dokument")
+                    text = e["text"][:10000] if e["type"] != "image" else e["text"][:2000]
+                    parts.append(f"[{label}: {e['filename']}]\n{text}")
+                document_context = "\n\n".join(parts)
+                logger.info("Passing %d document context(s) to voice session (%d chars total)",
+                            len(doc_entries), len(document_context))
 
             vs = VoiceSession(
                 nio_client=self.client,
@@ -482,7 +489,7 @@ class Bot:
                     self._publish_encryption_key(rid, key)),
                 memory=self.memory,
                 caller_user_id=event.sender,
-                document_context=pdf_text,
+                document_context=document_context,
             )
 
             # Check timeline for caller's key
@@ -858,7 +865,13 @@ class Bot:
 
         await self.client.room_typing(room.room_id, typing_state=True)
         try:
-            await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
+            reply = await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
+            if reply:
+                docs = self._room_document_context.setdefault(room.room_id, [])
+                docs.append({"type": "image", "filename": caption or "image",
+                             "text": reply, "timestamp": time.time()})
+                if len(docs) > 5:
+                    del docs[:-5]
         finally:
             await self.client.room_typing(room.room_id, typing_state=False)
 
@@ -916,12 +929,23 @@ class Bot:
 
         await self.client.room_typing(room.room_id, typing_state=True)
         try:
-            await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
+            reply = await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
+            if reply:
+                docs = self._room_document_context.setdefault(room.room_id, [])
+                docs.append({"type": "image", "filename": caption or "image",
+                             "text": reply, "timestamp": time.time()})
+                if len(docs) > 5:
+                    del docs[:-5]
         finally:
             await self.client.room_typing(room.room_id, typing_state=False)
 
+    # Supported text-based file extensions
+    _TEXT_EXTENSIONS = frozenset({
+        ".txt", ".md", ".csv", ".json", ".xml", ".html", ".yaml", ".yml", ".log",
+    })
+
     async def on_file_message(self, room, event: RoomMessageFile):
-        """Handle file messages: extract text from PDFs and send to AI."""
+        """Handle file messages: extract text from PDFs, docx, and text files."""
         if event.sender == BOT_USER:
             return
         if not self._sync_token_received:
@@ -930,14 +954,19 @@ class Bot:
         if time.time() - server_ts > 30:
             return
 
-        # Only handle PDFs
         source = event.source or {}
         content = source.get("content", {})
         info = content.get("info", {})
         mime_type = info.get("mimetype", "")
         filename = content.get("body", "file")
+        ext = os.path.splitext(filename.lower())[1]
 
-        if mime_type != "application/pdf" and not filename.lower().endswith(".pdf"):
+        # Determine file type
+        is_pdf = mime_type == "application/pdf" or ext == ".pdf"
+        is_docx = mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or ext == ".docx"
+        is_text = ext in self._TEXT_EXTENSIONS or mime_type.startswith("text/")
+
+        if not (is_pdf or is_docx or is_text):
             return
 
         await self._load_room_settings(room.room_id)
@@ -959,7 +988,7 @@ class Bot:
             await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
             return
 
-        # Download PDF
+        # Download file
         mxc_url = event.url
         if not mxc_url:
             return
@@ -968,29 +997,43 @@ class Bot:
             if not hasattr(resp, "body"):
                 logger.warning("File download failed for %s", mxc_url)
                 return
-            pdf_bytes = resp.body
+            file_bytes = resp.body
         except Exception:
             logger.exception("Failed to download file %s", mxc_url)
             return
 
-        # Extract text from PDF
-        pdf_text = self._extract_pdf_text(pdf_bytes)
-        if not pdf_text:
-            await self._send_text(room.room_id, "I couldn't extract any text from that PDF.")
+        # Extract text based on file type
+        if is_pdf:
+            extracted = self._extract_pdf_text(file_bytes)
+            doc_type = "pdf"
+        elif is_docx:
+            extracted = self._extract_docx_text(file_bytes)
+            doc_type = "text"
+        else:
+            extracted = self._extract_text_file(file_bytes)
+            doc_type = "text"
+
+        if not extracted:
+            await self._send_text(room.room_id, f"I couldn't extract any text from that file ({filename}).")
             return
 
         # Truncate to avoid token limits (roughly 50k chars ≈ 12k tokens)
-        if len(pdf_text) > 50000:
-            pdf_text = pdf_text[:50000] + "\n\n[... truncated, PDF too long ...]"
+        if len(extracted) > 50000:
+            extracted = extracted[:50000] + "\n\n[... truncated, file too long ...]"
 
-        # Store PDF context for voice session pickup
-        self._room_pdf_context[room.room_id] = {
+        # Store document context for voice session pickup
+        docs = self._room_document_context.setdefault(room.room_id, [])
+        docs.append({
+            "type": doc_type,
             "filename": filename,
-            "text": pdf_text,
+            "text": extracted,
             "timestamp": time.time(),
-        }
+        })
+        if len(docs) > 5:
+            del docs[:-5]
 
-        user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'
+        label = "PDF" if is_pdf else "Word document" if is_docx else "file"
+        user_message = f'The user sent a {label} named "{filename}". Here is the extracted text:\n\n{extracted}\n\nPlease summarize or answer questions about this document.'
 
         await self.client.room_typing(room.room_id, typing_state=True)
         try:
@@ -1014,6 +1057,25 @@ class Bot:
             logger.exception("PDF text extraction failed")
             return ""
 
+    @staticmethod
+    def _extract_docx_text(docx_bytes: bytes) -> str:
+        """Extract text from .docx bytes using python-docx."""
+        try:
+            doc = docx.Document(io.BytesIO(docx_bytes))
+            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+        except Exception:
+            logger.exception("DOCX text extraction failed")
+            return ""
+
+    @staticmethod
+    def _extract_text_file(file_bytes: bytes) -> str:
+        """Decode text file bytes as UTF-8 with fallback to latin-1."""
+        try:
+            return file_bytes.decode("utf-8")
+        except UnicodeDecodeError:
+            # latin-1 maps all 256 byte values, so this fallback cannot raise.
+            return file_bytes.decode("latin-1")
+
     async def _handle_command(self, room, cmd: str, event=None):
         if cmd == "help":
             await self._send_text(room.room_id, HELP_TEXT)
@@ -1239,7 +1301,8 @@ class Bot:
         finally:
             self._pending_connects.pop(sender, None)
 
-    async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None):
+    async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None) -> str | None:
+        """Send AI response and return the reply text (or None on failure)."""
         model = self.room_models.get(room.room_id, DEFAULT_MODEL)
 
         # Fetch conversation history FIRST (needed for query rewriting)
@@ -1333,9 +1396,12 @@ class Bot:
                 gap_seconds = time.time() - last_rename if last_rename else float("inf")
                 if gap_seconds > 300:
                     await self._auto_rename_room(room, user_message, reply)
+
+            return reply
         except Exception:
             logger.exception("LLM call failed")
             await self._send_text(room.room_id, "Sorry, I couldn't generate a response.")
+            return None
 
     async def _rewrite_query(self, user_message: str, history: list[dict], model: str) -> str:
         """Rewrite user message into a standalone search query using conversation context."""
diff --git a/requirements.txt b/requirements.txt
index c42f1b1..f46830b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ canonicaljson>=2.0,<3.0
 httpx>=0.27,<1.0
 openai>=2.0,<3.0
 pymupdf>=1.24,<2.0
+python-docx>=1.0,<2.0