feat: Blinkist-style audio summary bot (MAT-74)

Add interactive article summary feature: user pastes URL → bot asks
language/duration/topics → generates audio summary via LLM + ElevenLabs
TTS → posts MP3 inline with transcript and follow-up Q&A.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-03-04 17:39:09 +02:00
parent 1000891a97
commit 4ec4054db4
6 changed files with 789 additions and 0 deletions

74
bot.py
View File

@@ -40,6 +40,7 @@ from nio import (
from nio.crypto.attachments import decrypt_attachment
from livekit import api
from voice import VoiceSession
from article_summary import ArticleSummaryHandler
BOT_DEVICE_ID = "AIBOT"
CALL_MEMBER_TYPE = "org.matrix.msc3401.call.member"
@@ -77,6 +78,9 @@ BOT_API_KEY = os.environ.get("BOT_API_KEY", "")
RAG_ENDPOINT = os.environ.get("RAG_ENDPOINT", "") # Customer-VM RAG service (e.g. http://127.0.0.1:8765)
RAG_AUTH_TOKEN = os.environ.get("RAG_AUTH_TOKEN", "") # Bearer token for local RAG
BRAVE_API_KEY = os.environ.get("BRAVE_API_KEY", "")
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
ELEVENLABS_VOICE_ID = os.environ.get("ELEVENLABS_VOICE_ID", "ML23UVoFL5mI6APbRAeR")
FIRECRAWL_URL = os.environ.get("FIRECRAWL_URL", "")
MAX_TOOL_ITERATIONS = 5
SYSTEM_PROMPT = """You are a helpful AI assistant in a Matrix chat room.
@@ -962,6 +966,17 @@ class Bot:
self._sync_token_received = False
self._verifications: dict[str, dict] = {} # txn_id -> verification state
self._room_document_context: dict[str, list[dict]] = {} # room_id -> [{type, filename, text, timestamp}, ...]
# Article summary handler (Blinkist-style audio summaries)
if self.llm and ELEVENLABS_API_KEY:
self.article_handler = ArticleSummaryHandler(
llm_client=self.llm,
model=DEFAULT_MODEL,
elevenlabs_key=ELEVENLABS_API_KEY,
voice_id=ELEVENLABS_VOICE_ID,
firecrawl_url=FIRECRAWL_URL or None,
)
else:
self.article_handler = None
async def _has_documents(self, matrix_user_id: str) -> bool:
"""Check if user has documents via local RAG or MatrixHost portal API.
@@ -1530,6 +1545,24 @@ class Bot:
logger.info("Confluence page %s detected in room %s",
confluence_page_id, room.room_id)
# Check article summary FSM (Blinkist-style audio summaries)
if self.article_handler:
summary_response = await self.article_handler.handle_message(
room.room_id, sender, body
)
if summary_response is not None:
if summary_response == "__GENERATE__":
await self.client.room_typing(room.room_id, typing_state=True)
try:
await self.article_handler.generate_and_post(
self, room.room_id, sender
)
finally:
await self.client.room_typing(room.room_id, typing_state=False)
elif summary_response:
await self._send_text(room.room_id, summary_response)
return
await self.client.room_typing(room.room_id, typing_state=True)
try:
await self._respond_with_ai(room, body, sender=sender, image_data=image_data)
@@ -2331,6 +2364,47 @@ class Bot:
content=content,
)
async def _send_audio(self, room_id: str, audio_bytes: bytes, filename: str, duration_seconds: float):
    """Upload an MP3 to the homeserver and post it to *room_id* as an m.audio event.

    Falls back to a plain-text apology in the room when the upload fails.
    Encrypted uploads are referenced via a ``file`` object (EncryptedFile),
    unencrypted ones via a plain ``url`` (mxc:// URI).
    """
    from nio import UploadResponse

    response, encryption_keys = await self.client.upload(
        data_provider=io.BytesIO(audio_bytes),
        content_type="audio/mpeg",
        filename=filename,
        filesize=len(audio_bytes),
        encrypt=True,
    )

    # Guard clause: anything other than UploadResponse is an error object.
    if not isinstance(response, UploadResponse):
        logger.error("Audio upload failed: %s", response)
        await self._send_text(room_id, "Sorry, I couldn't upload the audio file.")
        return

    event_content = {
        "msgtype": "m.audio",
        "body": filename,
        "info": {
            "mimetype": "audio/mpeg",
            "size": len(audio_bytes),
            # Matrix expresses duration in milliseconds.
            "duration": int(duration_seconds * 1000),
        },
    }

    if encryption_keys:
        # Encrypted room: attach the decryption material alongside the URI.
        event_content["file"] = {
            "url": response.content_uri,
            **{field: encryption_keys[field] for field in ("key", "iv", "hashes", "v")},
        }
    else:
        # Unencrypted room: a bare content URI suffices.
        event_content["url"] = response.content_uri

    await self.client.room_send(
        room_id,
        message_type="m.room.message",
        content=event_content,
    )
async def _summarize_call(self, transcript: list[dict], room_id: str) -> str:
"""Generate a concise summary of a voice call transcript via LLM."""
# Format transcript for the LLM