fix: Handle encrypted images + link text to recent images

- Add RoomEncryptedImage callback with decrypt_attachment for E2E rooms
- Cache recent images per room (60s TTL) so follow-up text messages
  like "was ist das" get the image context instead of hallucinating
- Treat filename-like captions (containing a dot or longer than 100
  characters) as no caption and default to "What's in this image?"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-19 07:11:07 +02:00
parent 8fa6b7a49c
commit eef850f7ac

80
bot.py
View File

@@ -19,6 +19,7 @@ from nio import (
LoginResponse,
InviteMemberEvent,
MegolmEvent,
RoomEncryptedImage,
RoomMessageFile,
RoomMessageImage,
RoomMessageText,
@@ -31,6 +32,7 @@ from nio import (
KeyVerificationMac,
ToDeviceError,
)
from nio.crypto.attachments import decrypt_attachment
from livekit import api
BOT_DEVICE_ID = "AIBOT"
@@ -200,6 +202,7 @@ class Bot:
self.user_keys: dict[str, str] = self._load_user_keys() # matrix_user_id -> api_key
self.room_models: dict[str, str] = {} # room_id -> model name
self.auto_rename_rooms: set[str] = set() # rooms with auto-rename enabled
self._recent_images: dict[str, tuple[str, str, float]] = {} # room_id -> (b64, mime, timestamp)
self.renamed_rooms: dict[str, float] = {} # room_id -> timestamp of last rename
self._loaded_rooms: set[str] = set() # rooms where we've loaded state
self._sync_token_received = False
@@ -259,6 +262,7 @@ class Bot:
self.client.add_event_callback(self.on_unknown, UnknownEvent)
self.client.add_event_callback(self.on_text_message, RoomMessageText)
self.client.add_event_callback(self.on_image_message, RoomMessageImage)
self.client.add_event_callback(self.on_encrypted_image_message, RoomEncryptedImage)
self.client.add_event_callback(self.on_file_message, RoomMessageFile)
self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
self.client.add_response_callback(self.on_sync, SyncResponse)
@@ -435,9 +439,18 @@ class Bot:
await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
return
# Check if a recent image was sent in this room (within 60s)
image_data = None
cached = self._recent_images.get(room.room_id)
if cached:
b64, mime, ts = cached
if time.time() - ts < 60:
image_data = (b64, mime)
del self._recent_images[room.room_id]
await self.client.room_typing(room.room_id, typing_state=True)
try:
await self._respond_with_ai(room, body, sender=event.sender)
await self._respond_with_ai(room, body, sender=event.sender, image_data=image_data)
finally:
await self.client.room_typing(room.room_id, typing_state=False)
@@ -490,7 +503,70 @@ class Bot:
b64_data = base64.b64encode(img_bytes).decode("utf-8")
caption = (event.body or "").strip()
text = caption if caption and caption != "image" else "What's in this image?"
# Treat filenames (contain dots or are very long) as no caption
is_filename = not caption or caption == "image" or "." in caption or len(caption) > 100
text = "What's in this image?" if is_filename else caption
# Cache image for follow-up text messages
self._recent_images[room.room_id] = (b64_data, mime_type, time.time())
await self.client.room_typing(room.room_id, typing_state=True)
try:
await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
finally:
await self.client.room_typing(room.room_id, typing_state=False)
async def on_encrypted_image_message(self, room, event: RoomEncryptedImage):
"""Handle encrypted image messages: decrypt, encode, and send to AI."""
if event.sender == BOT_USER:
return
if not self._sync_token_received:
return
server_ts = event.server_timestamp / 1000
if time.time() - server_ts > 30:
return
await self._load_room_settings(room.room_id)
is_dm = room.member_count == 2
if not is_dm:
body = (event.body or "").strip()
bot_display = self.client.user_id.split(":")[0].lstrip("@")
mentioned = (
BOT_USER in body
or f"@{bot_display}" in body.lower()
or bot_display.lower() in body.lower()
)
if not mentioned:
return
if not self.llm:
await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
return
mxc_url = event.url
if not mxc_url:
return
try:
resp = await self.client.download(mxc=mxc_url)
if not hasattr(resp, "body"):
logger.warning("Encrypted image download failed for %s", mxc_url)
return
# Decrypt the attachment
img_bytes = decrypt_attachment(resp.body, event.key["k"], event.hashes["sha256"], event.iv)
except Exception:
logger.exception("Failed to download/decrypt encrypted image %s", mxc_url)
return
mime_type = getattr(event, "mimetype", None) or "image/png"
b64_data = base64.b64encode(img_bytes).decode("utf-8")
caption = (event.body or "").strip()
is_filename = not caption or caption == "image" or "." in caption or len(caption) > 100
text = "What's in this image?" if is_filename else caption
# Cache image for follow-up text messages
self._recent_images[room.room_id] = (b64_data, mime_type, time.time())
await self.client.room_typing(room.room_id, typing_state=True)
try: