fix: Handle encrypted images + link text to recent images
- Add RoomEncryptedImage callback with decrypt_attachment for E2E rooms
- Cache recent images per room (60s TTL) so follow-up text messages like "was ist das" get the image context instead of hallucinating
- Treat filenames (captions containing dots or longer than 100 characters) as no-caption, default to "What's in this image?"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
80
bot.py
80
bot.py
@@ -19,6 +19,7 @@ from nio import (
|
|||||||
LoginResponse,
|
LoginResponse,
|
||||||
InviteMemberEvent,
|
InviteMemberEvent,
|
||||||
MegolmEvent,
|
MegolmEvent,
|
||||||
|
RoomEncryptedImage,
|
||||||
RoomMessageFile,
|
RoomMessageFile,
|
||||||
RoomMessageImage,
|
RoomMessageImage,
|
||||||
RoomMessageText,
|
RoomMessageText,
|
||||||
@@ -31,6 +32,7 @@ from nio import (
|
|||||||
KeyVerificationMac,
|
KeyVerificationMac,
|
||||||
ToDeviceError,
|
ToDeviceError,
|
||||||
)
|
)
|
||||||
|
from nio.crypto.attachments import decrypt_attachment
|
||||||
from livekit import api
|
from livekit import api
|
||||||
|
|
||||||
BOT_DEVICE_ID = "AIBOT"
|
BOT_DEVICE_ID = "AIBOT"
|
||||||
@@ -200,6 +202,7 @@ class Bot:
|
|||||||
self.user_keys: dict[str, str] = self._load_user_keys() # matrix_user_id -> api_key
|
self.user_keys: dict[str, str] = self._load_user_keys() # matrix_user_id -> api_key
|
||||||
self.room_models: dict[str, str] = {} # room_id -> model name
|
self.room_models: dict[str, str] = {} # room_id -> model name
|
||||||
self.auto_rename_rooms: set[str] = set() # rooms with auto-rename enabled
|
self.auto_rename_rooms: set[str] = set() # rooms with auto-rename enabled
|
||||||
|
self._recent_images: dict[str, tuple[str, str, float]] = {} # room_id -> (b64, mime, timestamp)
|
||||||
self.renamed_rooms: dict[str, float] = {} # room_id -> timestamp of last rename
|
self.renamed_rooms: dict[str, float] = {} # room_id -> timestamp of last rename
|
||||||
self._loaded_rooms: set[str] = set() # rooms where we've loaded state
|
self._loaded_rooms: set[str] = set() # rooms where we've loaded state
|
||||||
self._sync_token_received = False
|
self._sync_token_received = False
|
||||||
@@ -259,6 +262,7 @@ class Bot:
|
|||||||
self.client.add_event_callback(self.on_unknown, UnknownEvent)
|
self.client.add_event_callback(self.on_unknown, UnknownEvent)
|
||||||
self.client.add_event_callback(self.on_text_message, RoomMessageText)
|
self.client.add_event_callback(self.on_text_message, RoomMessageText)
|
||||||
self.client.add_event_callback(self.on_image_message, RoomMessageImage)
|
self.client.add_event_callback(self.on_image_message, RoomMessageImage)
|
||||||
|
self.client.add_event_callback(self.on_encrypted_image_message, RoomEncryptedImage)
|
||||||
self.client.add_event_callback(self.on_file_message, RoomMessageFile)
|
self.client.add_event_callback(self.on_file_message, RoomMessageFile)
|
||||||
self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
|
self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
|
||||||
self.client.add_response_callback(self.on_sync, SyncResponse)
|
self.client.add_response_callback(self.on_sync, SyncResponse)
|
||||||
@@ -435,9 +439,18 @@ class Bot:
|
|||||||
await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
|
await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Check if a recent image was sent in this room (within 60s)
|
||||||
|
image_data = None
|
||||||
|
cached = self._recent_images.get(room.room_id)
|
||||||
|
if cached:
|
||||||
|
b64, mime, ts = cached
|
||||||
|
if time.time() - ts < 60:
|
||||||
|
image_data = (b64, mime)
|
||||||
|
del self._recent_images[room.room_id]
|
||||||
|
|
||||||
await self.client.room_typing(room.room_id, typing_state=True)
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
try:
|
try:
|
||||||
await self._respond_with_ai(room, body, sender=event.sender)
|
await self._respond_with_ai(room, body, sender=event.sender, image_data=image_data)
|
||||||
finally:
|
finally:
|
||||||
await self.client.room_typing(room.room_id, typing_state=False)
|
await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
|
||||||
@@ -490,7 +503,70 @@ class Bot:
|
|||||||
b64_data = base64.b64encode(img_bytes).decode("utf-8")
|
b64_data = base64.b64encode(img_bytes).decode("utf-8")
|
||||||
|
|
||||||
caption = (event.body or "").strip()
|
caption = (event.body or "").strip()
|
||||||
text = caption if caption and caption != "image" else "What's in this image?"
|
# Treat filenames (contain dots or are very long) as no caption
|
||||||
|
is_filename = not caption or caption == "image" or "." in caption or len(caption) > 100
|
||||||
|
text = "What's in this image?" if is_filename else caption
|
||||||
|
|
||||||
|
# Cache image for follow-up text messages
|
||||||
|
self._recent_images[room.room_id] = (b64_data, mime_type, time.time())
|
||||||
|
|
||||||
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
|
try:
|
||||||
|
await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
|
||||||
|
finally:
|
||||||
|
await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
|
||||||
|
async def on_encrypted_image_message(self, room, event: RoomEncryptedImage):
|
||||||
|
"""Handle encrypted image messages: decrypt, encode, and send to AI."""
|
||||||
|
if event.sender == BOT_USER:
|
||||||
|
return
|
||||||
|
if not self._sync_token_received:
|
||||||
|
return
|
||||||
|
server_ts = event.server_timestamp / 1000
|
||||||
|
if time.time() - server_ts > 30:
|
||||||
|
return
|
||||||
|
|
||||||
|
await self._load_room_settings(room.room_id)
|
||||||
|
|
||||||
|
is_dm = room.member_count == 2
|
||||||
|
if not is_dm:
|
||||||
|
body = (event.body or "").strip()
|
||||||
|
bot_display = self.client.user_id.split(":")[0].lstrip("@")
|
||||||
|
mentioned = (
|
||||||
|
BOT_USER in body
|
||||||
|
or f"@{bot_display}" in body.lower()
|
||||||
|
or bot_display.lower() in body.lower()
|
||||||
|
)
|
||||||
|
if not mentioned:
|
||||||
|
return
|
||||||
|
|
||||||
|
if not self.llm:
|
||||||
|
await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
|
||||||
|
return
|
||||||
|
|
||||||
|
mxc_url = event.url
|
||||||
|
if not mxc_url:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
resp = await self.client.download(mxc=mxc_url)
|
||||||
|
if not hasattr(resp, "body"):
|
||||||
|
logger.warning("Encrypted image download failed for %s", mxc_url)
|
||||||
|
return
|
||||||
|
# Decrypt the attachment
|
||||||
|
img_bytes = decrypt_attachment(resp.body, event.key["k"], event.hashes["sha256"], event.iv)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Failed to download/decrypt encrypted image %s", mxc_url)
|
||||||
|
return
|
||||||
|
|
||||||
|
mime_type = getattr(event, "mimetype", None) or "image/png"
|
||||||
|
b64_data = base64.b64encode(img_bytes).decode("utf-8")
|
||||||
|
|
||||||
|
caption = (event.body or "").strip()
|
||||||
|
is_filename = not caption or caption == "image" or "." in caption or len(caption) > 100
|
||||||
|
text = "What's in this image?" if is_filename else caption
|
||||||
|
|
||||||
|
# Cache image for follow-up text messages
|
||||||
|
self._recent_images[room.room_id] = (b64_data, mime_type, time.time())
|
||||||
|
|
||||||
await self.client.room_typing(room.room_id, typing_state=True)
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user