feat: Add image reading and generation to Matrix AI bot (MAT-9)

- Register RoomMessageImage callback to handle incoming images
- Download and base64-encode images, send as multimodal content to LLM
- Add LLM tool calling with generate_image tool for natural image generation
- Upload generated images back to Matrix via m.image events
- Update system prompt to inform LLM about vision and image gen capabilities

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-18 21:54:45 +02:00
parent 4477c9d68f
commit 8b08056e0a

137
bot.py
View File

@@ -1,6 +1,8 @@
import os
import json
import asyncio
import base64
import io
import logging
import re
import time
@@ -16,6 +18,7 @@ from nio import (
LoginResponse,
InviteMemberEvent,
MegolmEvent,
RoomMessageImage,
RoomMessageText,
RoomMessageUnknown,
SyncResponse,
@@ -63,7 +66,24 @@ IMPORTANT RULES — FOLLOW THESE STRICTLY:
- NEVER ask the user where documents are stored, how they were uploaded, or under what filename.
- NEVER suggest contacting an administrator, using a web interface, or checking another system.
- NEVER ask follow-up questions about document storage or file locations.
- If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere."""
- If no relevant documents were found, simply say you don't have information on that topic and ask if you can help with something else. Do NOT speculate about why or suggest the user look elsewhere.
- You can see and analyze images that users send. Describe what you see when asked about an image.
- You can generate images when asked — use the generate_image tool for any image creation, drawing, or illustration requests."""
# OpenAI-style tool schema advertised to the LLM so the model can request
# image generation through function calling instead of the bot having to
# pattern-match user text for phrases like "draw me a ...".
_GENERATE_IMAGE_PARAMETERS = {
    "type": "object",
    "properties": {
        "prompt": {
            "type": "string",
            "description": "Detailed image generation prompt",
        },
    },
    "required": ["prompt"],
}

_GENERATE_IMAGE_FUNCTION = {
    "name": "generate_image",
    "description": (
        "Generate an image from a text description. Use when the user asks "
        "to create, draw, generate, design, or make an "
        "image/picture/photo/illustration."
    ),
    "parameters": _GENERATE_IMAGE_PARAMETERS,
}

IMAGE_GEN_TOOLS = [{"type": "function", "function": _GENERATE_IMAGE_FUNCTION}]
HELP_TEXT = """**AI Bot Commands**
- `!ai help` — Show this help
@@ -235,6 +255,7 @@ class Bot:
self.client.add_event_callback(self.on_megolm, MegolmEvent)
self.client.add_event_callback(self.on_unknown, UnknownEvent)
self.client.add_event_callback(self.on_text_message, RoomMessageText)
self.client.add_event_callback(self.on_image_message, RoomMessageImage)
self.client.add_event_callback(self.on_room_unknown, RoomMessageUnknown)
self.client.add_response_callback(self.on_sync, SyncResponse)
self.client.add_to_device_callback(self.on_key_verification, KeyVerificationStart)
@@ -416,6 +437,63 @@ class Bot:
finally:
await self.client.room_typing(room.room_id, typing_state=False)
async def on_image_message(self, room, event: RoomMessageImage):
    """Handle an incoming m.image event: download, base64-encode, and
    forward the image to the LLM for multimodal analysis.

    Skips the bot's own messages, events seen before the first sync
    completes (history replay), and events older than 30 seconds. In
    group rooms the image is only handled when the bot is mentioned in
    the image caption; in DMs every image is handled.
    """
    if event.sender == BOT_USER:
        return  # never react to our own uploads
    if not self._sync_token_received:
        return  # initial sync still replaying old events
    # Matrix server timestamps are milliseconds; ignore stale events so a
    # reconnect doesn't trigger analysis of old images.
    if time.time() - event.server_timestamp / 1000 > 30:
        return
    await self._load_room_settings(room.room_id)
    # A two-member room is treated as a DM: always respond there.
    is_dm = room.member_count == 2
    if not is_dm:
        # Group room: only respond if the bot is mentioned in the caption.
        caption = (event.body or "").strip()
        caption_lower = caption.lower()
        bot_display = self.client.user_id.split(":")[0].lstrip("@")
        # BUGFIX: previously the "@<localpart>" pattern was matched
        # case-sensitively against the lowercased body, so it could never
        # match when the localpart contains uppercase characters.
        # Lowercase both sides of every substring comparison.
        mentioned = (
            BOT_USER in caption
            or f"@{bot_display}".lower() in caption_lower
            or bot_display.lower() in caption_lower
        )
        if not mentioned:
            return
    if not self.llm:
        await self._send_text(room.room_id, "LLM not configured (LITELLM_BASE_URL not set).")
        return
    # Download the image from the homeserver via its mxc:// URI.
    mxc_url = event.url
    if not mxc_url:
        return  # no plain-text URL on the event; nothing we can fetch
    try:
        resp = await self.client.download(mxc=mxc_url)
        # nio returns a DownloadError (no .body attribute) on failure.
        if not hasattr(resp, "body"):
            logger.warning("Image download failed for %s", mxc_url)
            return
        img_bytes = resp.body
    except Exception:
        logger.exception("Failed to download image %s", mxc_url)
        return
    # Fall back to PNG when the event carries no MIME type.
    mime_type = getattr(event, "mimetype", None) or "image/png"
    b64_data = base64.b64encode(img_bytes).decode("utf-8")
    caption = (event.body or "").strip()
    # Clients often set body to a filename/placeholder; treat the bare word
    # "image" as "no caption" and ask a generic question instead.
    text = caption if caption and caption != "image" else "What's in this image?"
    await self.client.room_typing(room.room_id, typing_state=True)
    try:
        await self._respond_with_ai(room, text, sender=event.sender, image_data=(b64_data, mime_type))
    finally:
        # Always clear the typing indicator, even if the LLM call fails.
        await self.client.room_typing(room.room_id, typing_state=False)
async def _handle_command(self, room, cmd: str, event=None):
if cmd == "help":
await self._send_text(room.room_id, HELP_TEXT)
@@ -618,7 +696,7 @@ class Bot:
finally:
self._pending_connects.pop(sender, None)
async def _respond_with_ai(self, room, user_message: str, sender: str = None):
async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None):
model = self.room_models.get(room.room_id, DEFAULT_MODEL)
# Fetch conversation history FIRST (needed for query rewriting)
@@ -654,7 +732,15 @@ class Bot:
messages.append({"role": "system", "content": doc_context})
messages.extend(history)
# Add current user message
# Add current user message (multimodal if image provided)
if image_data:
b64_str, mime_type = image_data
user_content = [
{"type": "text", "text": user_message},
{"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{b64_str}"}}
]
messages.append({"role": "user", "content": user_content})
else:
messages.append({"role": "user", "content": user_message})
try:
@@ -662,8 +748,19 @@ class Bot:
model=model,
messages=messages,
max_tokens=2048,
tools=IMAGE_GEN_TOOLS if not image_data else None,
)
reply = resp.choices[0].message.content
choice = resp.choices[0]
if choice.message.tool_calls:
for tc in choice.message.tool_calls:
if tc.function.name == "generate_image":
args = json.loads(tc.function.arguments)
await self._generate_and_send_image(room.room_id, args["prompt"])
if choice.message.content:
await self._send_text(room.room_id, choice.message.content)
else:
reply = choice.message.content
await self._send_text(room.room_id, reply)
# Auto-rename: only for group rooms with explicit opt-in (not DMs)
if room.room_id in self.auto_rename_rooms:
@@ -770,6 +867,38 @@ class Bot:
safe = safe.replace("\n", "<br/>")
return safe
async def _generate_and_send_image(self, room_id: str, prompt: str,
                                   model: str = "dall-e-3",
                                   size: str = "1024x1024"):
    """Generate an image via the LiteLLM images API and post it to the room.

    Args:
        room_id: Matrix room to send the result to.
        prompt: Text prompt forwarded to the image model.
        model: Image model name (generalized from the previously hard-coded
            "dall-e-3"; the default preserves existing behavior).
        size: Requested image dimensions, e.g. "1024x1024".

    Any failure is caught, logged, and reported to the user as a chat
    message instead of propagating (best-effort behavior).
    """
    try:
        # Request base64 output so no temporary file or extra HTTP fetch
        # is needed before re-uploading to Matrix.
        resp = await self.llm.images.generate(
            model=model, prompt=prompt, n=1, size=size,
            response_format="b64_json",
        )
        img_b64 = resp.data[0].b64_json
        img_bytes = base64.b64decode(img_b64)
        # NOTE(review): the generated image is assumed to be PNG — confirm
        # the backend's output format before relying on the MIME type.
        await self._send_image(room_id, img_bytes, "image/png", "generated.png")
    except Exception:
        logger.exception("Image generation failed")
        await self._send_text(room_id, "Sorry, I couldn't generate that image.")
async def _send_image(self, room_id: str, image_bytes: bytes, mime_type: str, filename: str):
    """Upload raw image bytes to the homeserver and post an m.image event.

    Args:
        room_id: Target Matrix room.
        image_bytes: Raw image file contents.
        mime_type: MIME type reported in the event info (e.g. "image/png").
        filename: Body/filename shown by Matrix clients.

    Raises:
        RuntimeError: if the media upload fails. (Previously a failed
        upload surfaced as a context-free AttributeError because nio
        returns an UploadError without a content_uri; callers wrap this
        method in ``except Exception`` and report the failure to the user.)
    """
    upload_resp, _ = await self.client.upload(
        data_provider=io.BytesIO(image_bytes),
        content_type=mime_type,
        filename=filename,
    )
    # Fail loudly with context when the homeserver rejected the upload.
    content_uri = getattr(upload_resp, "content_uri", None)
    if not content_uri:
        raise RuntimeError(f"Matrix media upload failed for {filename}")
    # NOTE(review): encrypted rooms require the image under "file" with
    # encryption keys rather than a plain "url" — confirm this bot only
    # posts to unencrypted rooms or handle MegolmEvent rooms separately.
    await self.client.room_send(
        room_id,
        message_type="m.room.message",
        content={
            "msgtype": "m.image",
            "body": filename,
            "url": content_uri,
            "info": {"mimetype": mime_type, "size": len(image_bytes)},
        },
    )
async def _send_text(self, room_id: str, text: str):
await self.client.room_send(
room_id,