fix: Handle encrypted images + link text to recent images

- Add RoomEncryptedImage callback with decrypt_attachment for E2E rooms
- Cache recent images per room (60s TTL) so follow-up text messages like "was ist das" get the image context instead of hallucinating
- Treat filenames (containing dots) as no-caption, default to "What's in this image?"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 07:11:07 +02:00
parent 8fa6b7a49c
commit eef850f7ac
1 changed files with 78 additions and 2 deletions
--- a/bot.py
+++ b/bot.py
@@ -19,6 +19,7 @@ from nio import (
    LoginResponse,
    InviteMemberEvent,
    MegolmEvent,
    RoomEncryptedImage,
    RoomMessageFile,
    RoomMessageImage,
    RoomMessageText,
@@ -31,6 +32,7 @@ from nio import (
    KeyVerificationMac,
    ToDeviceError,
 )
 from nio.crypto.attachments import decrypt_attachment
 from livekit import api
 BOT_DEVICE_ID = "AIBOT"
@@ -200,6 +202,7 @@ class Bot:
        self.user_keys: dict[str, str] = self._load_user_keys()  # matrix_user_id -> api_key
        self.room_models: dict[str, str] = {}  # room_id -> model name
        self.auto_rename_rooms: set[str] = set()  # rooms with auto-rename enabled
        self._recent_images: dict[str, tuple[str, str, float]] = {}  # room_id -> (b64, mime, timestamp)
        self.renamed_rooms: dict[str, float] = {}  # room_id -> timestamp of last rename
        self._loaded_rooms: set[str] = set()  # rooms where we've loaded state
        self._sync_token_received = False
@@ -259,6 +262,7 @@ class Bot:
        self.client.add_event_callback(self.on_unknown, UnknownEvent)
        self.client.add_event_callback(self.on_text_message, RoomMessageText)
        self.client.add_event_callback(self.on_image_message, RoomMessageImage)
        self.client.add_event_callback(self.on_encrypted_image_message, RoomEncryptedImage)
        self.client.add_event_callback(self.on_file_message, RoomMessageFile)
        self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
        self.client.add_response_callback(self.on_sync, SyncResponse)
@@ -435,9 +439,18 @@ class Bot:
            await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
            return
        # Check if a recent image was sent in this room (within 60s)
        image_data = None
        cached = self._recent_images.get(room.room_id)
        if cached:
            b64, mime, ts = cached
            if time.time() - ts < 60:
                image_data = (b64, mime)
                del self._recent_images[room.room_id]
        await self.client.room_typing(room.room_id, typing_state=True)
        try:
-            await self._respond_with_ai(room, body, sender=event.sender)
+            await self._respond_with_ai(room, body, sender=event.sender, image_data=image_data)
        finally:
            await self.client.room_typing(room.room_id, typing_state=False)
@@ -490,7 +503,70 @@ class Bot:
        b64_data = base64.b64encode(img_bytes).decode("utf-8")
        caption = (event.body or "").strip()
-        text = caption if caption and caption != "image" else "What's in this image?"
+        # Treat filenames (contain dots or are very long) as no caption
        is_filename = not caption or caption == "image" or "." in caption or len(caption) > 100
        text = "What's in this image?" if is_filename else caption
        # Cache image for follow-up text messages
        self._recent_images[room.room_id] = (b64_data, mime_type, time.time())
        await self.client.room_typing(room.room_id, typing_state=True)
        try:
            await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
        finally:
            await self.client.room_typing(room.room_id, typing_state=False)
    async def on_encrypted_image_message(self, room, event: RoomEncryptedImage):
        """Handle encrypted image messages: decrypt, encode, and send to AI."""
        if event.sender == BOT_USER:
            return
        if not self._sync_token_received:
            return
        server_ts = event.server_timestamp / 1000
        if time.time() - server_ts > 30:
            return
        await self._load_room_settings(room.room_id)
        is_dm = room.member_count == 2
        if not is_dm:
            body = (event.body or "").strip()
            bot_display = self.client.user_id.split(":")[0].lstrip("@")
            mentioned = (
                BOT_USER in body
                or f"@{bot_display}" in body.lower()
                or bot_display.lower() in body.lower()
            )
            if not mentioned:
                return
        if not self.llm:
            await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
            return
        mxc_url = event.url
        if not mxc_url:
            return
        try:
            resp = await self.client.download(mxc=mxc_url)
            if not hasattr(resp, "body"):
                logger.warning("Encrypted image download failed for %s", mxc_url)
                return
            # Decrypt the attachment
            img_bytes = decrypt_attachment(resp.body, event.key["k"], event.hashes["sha256"], event.iv)
        except Exception:
            logger.exception("Failed to download/decrypt encrypted image %s", mxc_url)
            return
        mime_type = getattr(event, "mimetype", None) or "image/png"
        b64_data = base64.b64encode(img_bytes).decode("utf-8")
        caption = (event.body or "").strip()
        is_filename = not caption or caption == "image" or "." in caption or len(caption) > 100
        text = "What's in this image?" if is_filename else caption
        # Cache image for follow-up text messages
        self._recent_images[room.room_id] = (b64_data, mime_type, time.time())
        await self.client.room_typing(room.room_id, typing_state=True)
        try: