feat: Add image reading and generation to Matrix AI bot (MAT-9)

- Register RoomMessageImage callback to handle incoming images
- Download and base64-encode images, send as multimodal content to LLM
- Add LLM tool calling with generate_image tool for natural image generation
- Upload generated images back to Matrix via m.image events
- Update system prompt to inform LLM about vision and image gen capabilities

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-18 21:54:45 +02:00
parent 4477c9d68f
commit 8b08056e0a

137
bot.py
View File

@@ -1,6 +1,8 @@
import os
import json
import asyncio
import base64
import io
import logging
import re
import time
@@ -16,6 +18,7 @@ from nio import (
LoginResponse,
InviteMemberEvent,
MegolmEvent,
RoomMessageImage,
RoomMessageText,
RoomMessageUnknown,
SyncResponse,
@@ -63,7 +66,24 @@ IMPORTANT RULES — FOLLOW THESE STRICTLY:
- NEVER ask the user where documents are stored, how they were uploaded, or under what filename.
- NEVER suggest contacting an administrator, using a web interface, or checking another system.
- NEVER ask follow-up questions about document storage or file locations.
- If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere."""
- If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere.
- You can see and analyze images that users send. Describe what you see when asked about an image.
- You can generate images when asked — use the generate_image tool for any image creation, drawing, or illustration requests."""
# OpenAI-style tool schema advertised to the LLM so the model can request
# image generation through function calling instead of the bot having to
# pattern-match user text for phrases like "draw me a ...".
_GENERATE_IMAGE_PARAMETERS = {
    "type": "object",
    "properties": {
        "prompt": {
            "type": "string",
            "description": "Detailed image generation prompt",
        },
    },
    "required": ["prompt"],
}

_GENERATE_IMAGE_FUNCTION = {
    "name": "generate_image",
    "description": (
        "Generate an image from a text description. Use when the user asks "
        "to create, draw, generate, design, or make an "
        "image/picture/photo/illustration."
    ),
    "parameters": _GENERATE_IMAGE_PARAMETERS,
}

IMAGE_GEN_TOOLS = [{"type": "function", "function": _GENERATE_IMAGE_FUNCTION}]
HELP_TEXT = """**AI Bot Commands**
- `!ai help` — Show this help
@@ -235,6 +255,7 @@ class Bot:
self.client.add_event_callback(self.on_megolm, MegolmEvent)
self.client.add_event_callback(self.on_unknown, UnknownEvent)
self.client.add_event_callback(self.on_text_message, RoomMessageText)
self.client.add_event_callback(self.on_image_message, RoomMessageImage)
self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
self.client.add_response_callback(self.on_sync, SyncResponse)
self.client.add_to_device_callback(self.on_key_verification, KeyVerificationStart)
@@ -416,6 +437,63 @@ class Bot:
finally:
await self.client.room_typing(room.room_id, typing_state=False)
async def on_image_message(self, room, event: RoomMessageImage):
    """Handle an incoming m.image event: download, base64-encode, and
    forward the image to the LLM for multimodal analysis.

    Skips the bot's own messages, events seen before the first sync
    completes (history replay), and events older than 30 seconds. In
    group rooms the image is only handled when the bot is mentioned in
    the image caption; in DMs every image is handled.
    """
    if event.sender == BOT_USER:
        return  # never react to our own uploads
    if not self._sync_token_received:
        return  # initial sync still replaying old events
    # Matrix server timestamps are milliseconds; ignore stale events so a
    # reconnect doesn't trigger analysis of old images.
    if time.time() - event.server_timestamp / 1000 > 30:
        return
    await self._load_room_settings(room.room_id)
    # A two-member room is treated as a DM: always respond there.
    is_dm = room.member_count == 2
    if not is_dm:
        # Group room: only respond if the bot is mentioned in the caption.
        caption = (event.body or "").strip()
        caption_lower = caption.lower()
        bot_display = self.client.user_id.split(":")[0].lstrip("@")
        # BUGFIX: previously the "@<localpart>" pattern was matched
        # case-sensitively against the lowercased body, so it could never
        # match when the localpart contains uppercase characters.
        # Lowercase both sides of every substring comparison.
        mentioned = (
            BOT_USER in caption
            or f"@{bot_display}".lower() in caption_lower
            or bot_display.lower() in caption_lower
        )
        if not mentioned:
            return
    if not self.llm:
        await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
        return
    # Download the image from the homeserver via its mxc:// URI.
    mxc_url = event.url
    if not mxc_url:
        return  # no plain-text URL on the event; nothing we can fetch
    try:
        resp = await self.client.download(mxc=mxc_url)
        # nio returns a DownloadError (no .body attribute) on failure.
        if not hasattr(resp, "body"):
            logger.warning("Image download failed for %s", mxc_url)
            return
        img_bytes = resp.body
    except Exception:
        logger.exception("Failed to download image %s", mxc_url)
        return
    # Fall back to PNG when the event carries no MIME type.
    mime_type = getattr(event, "mimetype", None) or "image/png"
    b64_data = base64.b64encode(img_bytes).decode("utf-8")
    caption = (event.body or "").strip()
    # Clients often set body to a filename/placeholder; treat the bare word
    # "image" as "no caption" and ask a generic question instead.
    text = caption if caption and caption != "image" else "What's in this image?"
    await self.client.room_typing(room.room_id, typing_state=True)
    try:
        await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
    finally:
        # Always clear the typing indicator, even if the LLM call fails.
        await self.client.room_typing(room.room_id, typing_state=False)
async def _handle_command(self, room, cmd: str, event=None):
if cmd == "help":
await self._send_text(room.room_id, HELP_TEXT)
@@ -618,7 +696,7 @@ class Bot:
finally:
self._pending_connects.pop(sender, None)
async def _respond_with_ai(self, room, user_message: str, sender: str = None):
async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None):
model = self.room_models.get(room.room_id, DEFAULT_MODEL)
# Fetch conversation history FIRST (needed for query rewriting)
@@ -654,7 +732,15 @@ class Bot:
messages.append({"role": "system", "content": doc_context})
messages.extend(history)
# Add current user message
# Add current user message (multimodal if image provided)
if image_data:
b64_str, mime_type = image_data
user_content = [
{"type": "text", "text": user_message},
{"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{b64_str}"}}
]
messages.append({"role": "user", "content": user_content})
else:
messages.append({"role": "user", "content": user_message})
try:
@@ -662,8 +748,19 @@ class Bot:
model=model,
messages=messages,
max_tokens=2048,
tools=IMAGE_GEN_TOOLS if not image_data else None,
)
reply = resp.choices[0].message.content
choice = resp.choices[0]
if choice.message.tool_calls:
for tc in choice.message.tool_calls:
if tc.function.name == "generate_image":
args = json.loads(tc.function.arguments)
await self._generate_and_send_image(room.room_id, args["prompt"])
if choice.message.content:
await self._send_text(room.room_id, choice.message.content)
else:
reply = choice.message.content
await self._send_text(room.room_id, reply)
# Auto-rename: only for group rooms with explicit opt-in (not DMs)
if room.room_id in self.auto_rename_rooms:
@@ -770,6 +867,38 @@ class Bot:
safe = safe.replace("\n", "<br/>")
return safe
async def _generate_and_send_image(self, room_id: str, prompt: str,
                                   model: str = "dall-e-3",
                                   size: str = "1024x1024"):
    """Generate an image via the LiteLLM images API and post it to the room.

    Args:
        room_id: Matrix room to send the result to.
        prompt: Text prompt forwarded to the image model.
        model: Image model name (generalized from the previously hard-coded
            "dall-e-3"; the default preserves existing behavior).
        size: Requested image dimensions, e.g. "1024x1024".

    Any failure is caught, logged, and reported to the user as a chat
    message instead of propagating (best-effort behavior).
    """
    try:
        # Request base64 output so no temporary file or extra HTTP fetch
        # is needed before re-uploading to Matrix.
        resp = await self.llm.images.generate(
            model=model, prompt=prompt, n=1, size=size,
            response_format="b64_json",
        )
        img_b64 = resp.data[0].b64_json
        img_bytes = base64.b64decode(img_b64)
        # NOTE(review): the generated image is assumed to be PNG — confirm
        # the backend's output format before relying on the MIME type.
        await self._send_image(room_id, img_bytes, "image/png", "generated.png")
    except Exception:
        logger.exception("Image generation failed")
        await self._send_text(room_id, "Sorry, I couldn't generate that image.")
async def _send_image(self, room_id: str, image_bytes: bytes, mime_type: str, filename: str):
    """Upload raw image bytes to the homeserver and post an m.image event.

    Args:
        room_id: Target Matrix room.
        image_bytes: Raw image file contents.
        mime_type: MIME type reported in the event info (e.g. "image/png").
        filename: Body/filename shown by Matrix clients.

    Raises:
        RuntimeError: if the media upload fails. (Previously a failed
        upload surfaced as a context-free AttributeError because nio
        returns an UploadError without a content_uri; callers wrap this
        method in ``except Exception`` and report the failure to the user.)
    """
    upload_resp, _ = await self.client.upload(
        data_provider=io.BytesIO(image_bytes),
        content_type=mime_type,
        filename=filename,
    )
    # Fail loudly with context when the homeserver rejected the upload.
    content_uri = getattr(upload_resp, "content_uri", None)
    if not content_uri:
        raise RuntimeError(f"Matrix media upload failed for {filename}")
    # NOTE(review): encrypted rooms require the image under "file" with
    # encryption keys rather than a plain "url" — confirm this bot only
    # posts to unencrypted rooms or handle MegolmEvent rooms separately.
    await self.client.room_send(
        room_id,
        message_type="m.room.message",
        content={
            "msgtype": "m.image",
            "body": filename,
            "url": content_uri,
            "info": {"mimetype": mime_type, "size": len(image_bytes)},
        },
    )
async def _send_text(self, room_id: str, text: str):
await self.client.room_send(
room_id,