feat: Blinkist-style audio summary bot (MAT-74)

Add interactive article summary feature: user pastes URL → bot asks
language/duration/topics → generates audio summary via LLM + ElevenLabs
TTS → posts MP3 inline with transcript and follow-up Q&A.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-03-04 17:39:09 +02:00
parent 1000891a97
commit 4ec4054db4
6 changed files with 789 additions and 0 deletions

74
bot.py
View File

@@ -40,6 +40,7 @@ from nio import (
from nio.crypto.attachments import decrypt_attachment
from livekit import api
from voice import VoiceSession
from article_summary import ArticleSummaryHandler
BOT_DEVICE_ID = "AIBOT"
CALL_MEMBER_TYPE = "org.matrix.msc3401.call.member"
@@ -77,6 +78,9 @@ BOT_API_KEY = os.environ.get("BOT_API_KEY", "")
RAG_ENDPOINT = os.environ.get("RAG_ENDPOINT", "") # Customer-VM RAG service (e.g. http://127.0.0.1:8765)
RAG_AUTH_TOKEN = os.environ.get("RAG_AUTH_TOKEN", "") # Bearer token for local RAG
BRAVE_API_KEY = os.environ.get("BRAVE_API_KEY", "")
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
ELEVENLABS_VOICE_ID = os.environ.get("ELEVENLABS_VOICE_ID", "ML23UVoFL5mI6APbRAeR")
FIRECRAWL_URL = os.environ.get("FIRECRAWL_URL", "")
MAX_TOOL_ITERATIONS = 5
SYSTEM_PROMPT = """You are a helpful AI assistant in a Matrix chat room.
@@ -962,6 +966,17 @@ class Bot:
self._sync_token_received = False
self._verifications: dict[str, dict] = {} # txn_id -> verification state
self._room_document_context: dict[str, list[dict]] = {} # room_id -> [{type, filename, text, timestamp}, ...]
# Article summary handler (Blinkist-style audio summaries)
if self.llm and ELEVENLABS_API_KEY:
self.article_handler = ArticleSummaryHandler(
llm_client=self.llm,
model=DEFAULT_MODEL,
elevenlabs_key=ELEVENLABS_API_KEY,
voice_id=ELEVENLABS_VOICE_ID,
firecrawl_url=FIRECRAWL_URL or None,
)
else:
self.article_handler = None
async def _has_documents(self, matrix_user_id: str) -> bool:
"""Check if user has documents via local RAG or MatrixHost portal API.
@@ -1530,6 +1545,24 @@ class Bot:
logger.info("Confluence page %s detected in room %s",
confluence_page_id, room.room_id)
# Check article summary FSM (Blinkist-style audio summaries)
if self.article_handler:
summary_response = await self.article_handler.handle_message(
room.room_id, sender, body
)
if summary_response is not None:
if summary_response == "__GENERATE__":
await self.client.room_typing(room.room_id, typing_state=True)
try:
await self.article_handler.generate_and_post(
self, room.room_id, sender
)
finally:
await self.client.room_typing(room.room_id, typing_state=False)
elif summary_response:
await self._send_text(room.room_id, summary_response)
return
await self.client.room_typing(room.room_id, typing_state=True)
try:
await self._respond_with_ai(room, body, sender=sender, image_data=image_data)
@@ -2331,6 +2364,47 @@ class Bot:
content=content,
)
async def _send_audio(self, room_id: str, audio_bytes: bytes, filename: str, duration_seconds: float):
    """Upload an MP3 to the homeserver and post it to *room_id* as an m.audio event.

    Falls back to a plain-text apology in the room when the upload fails.
    Encrypted uploads are referenced via a ``file`` object (EncryptedFile),
    unencrypted ones via a plain ``url`` (mxc:// URI).
    """
    from nio import UploadResponse

    response, encryption_keys = await self.client.upload(
        data_provider=io.BytesIO(audio_bytes),
        content_type="audio/mpeg",
        filename=filename,
        filesize=len(audio_bytes),
        encrypt=True,
    )

    # Guard clause: anything other than UploadResponse is an error object.
    if not isinstance(response, UploadResponse):
        logger.error("Audio upload failed: %s", response)
        await self._send_text(room_id, "Sorry, I couldn't upload the audio file.")
        return

    event_content = {
        "msgtype": "m.audio",
        "body": filename,
        "info": {
            "mimetype": "audio/mpeg",
            "size": len(audio_bytes),
            # Matrix expresses duration in milliseconds.
            "duration": int(duration_seconds * 1000),
        },
    }

    if encryption_keys:
        # Encrypted room: attach the decryption material alongside the URI.
        event_content["file"] = {
            "url": response.content_uri,
            **{field: encryption_keys[field] for field in ("key", "iv", "hashes", "v")},
        }
    else:
        # Unencrypted room: a bare content URI suffices.
        event_content["url"] = response.content_uri

    await self.client.room_send(
        room_id,
        message_type="m.room.message",
        content=event_content,
    )
async def _summarize_call(self, transcript: list[dict], room_id: str) -> str:
"""Generate a concise summary of a voice call transcript via LLM."""
# Format transcript for the LLM