feat(voice): add cross-call memory and Brave Search tool

- Query user memories at call start and inject into agent system prompt
- Extract new facts after each exchange using claude-haiku via LiteLLM
- Add Brave Search tool (@function_tool) for current data queries
- Pass memory client and caller_user_id through VoiceSession constructor
- Pre-compute 8 HMAC-ratcheted EC keys for reliable E2EE decryption

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-22 15:27:59 +02:00
parent 2b8744de6e
commit 52f8cb569c
2 changed files with 133 additions and 3 deletions
--- a/bot.py
+++ b/bot.py
@@ -463,6 +463,8 @@ class Bot:
                        bot_key=bot_key,
                        publish_key_cb=lambda key, rid=room_id: asyncio.ensure_future(
                            self._publish_encryption_key(rid, key)),
                        memory=self.memory,
                        caller_user_id=event.sender,
                    )
                    # Check timeline for caller's key
--- a/voice.py
+++ b/voice.py
@@ -10,10 +10,15 @@ import os
 import zoneinfo
 import json
 import re
 import aiohttp
 import httpx
 from livekit import rtc, api as lkapi
-from livekit.agents import Agent, AgentSession, room_io
+from livekit.agents import Agent, AgentSession, function_tool, room_io
 from livekit.plugins import openai as lk_openai, elevenlabs, silero
 from openai import AsyncOpenAI
 logger = logging.getLogger("matrix-ai-voice")
@@ -23,6 +28,8 @@ LITELLM_KEY = os.environ.get("LITELLM_API_KEY", "not-needed")
 LK_API_KEY = os.environ.get("LIVEKIT_API_KEY", "")
 LK_API_SECRET = os.environ.get("LIVEKIT_API_SECRET", "")
 ELEVENLABS_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
 BRAVE_API_KEY = os.environ.get("BRAVE_API_KEY", "")
 MEMORY_SERVICE_URL = os.environ.get("MEMORY_SERVICE_URL", "http://memory-service:8090")
 DEFAULT_VOICE_ID = "JBFqnCBsd6RMkjVDRZzb"  # George - warm, British male, multilingual
 _VOICE_PROMPT_TEMPLATE = """Du bist ein hilfreicher Sprachassistent in einem Matrix-Anruf.
@@ -112,6 +119,86 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]:
    return keys
 async def _brave_search(query: str, count: int = 5) -> str:
    """Query the Brave web-search API and format the hits as a bullet list.

    Args:
        query: Search terms forwarded as the ``q`` request parameter.
        count: Number of results requested and the cap on lines returned.

    Returns:
        One ``- title: description (url)`` line per hit joined by newlines,
        or a short human-readable message when no API key is configured,
        nothing was found, or the request failed. Never raises.
    """
    if not BRAVE_API_KEY:
        return "Search unavailable (no API key configured)."
    request_headers = {"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY}
    request_params = {"q": query, "count": count, "text_decorations": False}
    try:
        async with httpx.AsyncClient(timeout=10.0) as http:
            response = await http.get(
                "https://api.search.brave.com/res/v1/web/search",
                headers=request_headers,
                params=request_params,
            )
            response.raise_for_status()
            payload = response.json()
        hits = payload.get("web", {}).get("results", [])
        if hits:
            return "\n".join(
                f"- {hit.get('title', '')}: {hit.get('description', '')} ({hit.get('url', '')})"
                for hit in hits[:count]
            )
        return "No results found."
    except Exception as err:
        # Best-effort tool: report the failure as text instead of raising.
        logger.warning("Brave search error: %s", err)
        return f"Search failed: {err}"
 async def _extract_voice_memories(user_text: str, agent_text: str,
                                   user_id: str, room_id: str) -> None:
    """Extract memorable facts from a voice exchange and store them.

    Best-effort background task: every failure is logged and swallowed so a
    memory problem cannot disturb the running call.

    Args:
        user_text: What the user said in this exchange.
        agent_text: The assistant's reply to that utterance.
        user_id: Identifier the facts are stored under in the memory service.
        room_id: Sent to the memory service as ``source_room``.
    """
    if not LITELLM_URL or not MEMORY_SERVICE_URL:
        return
    try:
        # Fetch existing facts so the model can skip duplicates.
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{MEMORY_SERVICE_URL}/memories/query",
                json={"user_id": user_id, "query": "all facts", "top_k": 20},
            )
            # Tolerate malformed entries: one record without "fact" must not
            # abort the whole extraction.
            existing = [
                m["fact"] for m in resp.json().get("results", [])
                if isinstance(m, dict) and m.get("fact")
            ] if resp.is_success else []
        existing_text = "\n".join(f"- {f}" for f in existing) if existing else "(none)"
        llm = AsyncOpenAI(base_url=LITELLM_URL, api_key=LITELLM_KEY)
        resp = await llm.chat.completions.create(
            model="claude-haiku",
            messages=[
                {"role": "system", "content": (
                    "Extract memorable facts about the user from this voice conversation snippet. "
                    "Return a JSON array of concise strings. Include: name, preferences, location, "
                    "occupation, interests, family, projects. Skip duplicate or temporary info. "
                    "Return [] if nothing new."
                )},
                {"role": "user", "content": (
                    f"Existing memories:\n{existing_text}\n\n"
                    f"User said: {user_text}\nAssistant replied: {agent_text}\n\n"
                    "New facts (JSON array):"
                )},
            ],
            max_tokens=200,
        )
        # The completion content is nullable; treat None/blank as "nothing new"
        # rather than letting .strip() or json.loads() raise.
        raw = (resp.choices[0].message.content or "").strip()
        if not raw:
            return
        # Strip an optional Markdown code fence around the JSON.
        if raw.startswith("```"):
            raw = re.sub(r"^```\w*\n?", "", raw)
            raw = re.sub(r"\n?```$", "", raw)
        # Keep only the outermost [...] span in case the model added prose.
        match = re.search(r"\[.*\]", raw, re.DOTALL)
        if match:
            raw = match.group(0)
        new_facts = json.loads(raw)
        if not isinstance(new_facts, list):
            return
        async with httpx.AsyncClient(timeout=10.0) as client:
            for fact in new_facts:
                if not isinstance(fact, str) or not fact.strip():
                    continue
                store_resp = await client.post(
                    f"{MEMORY_SERVICE_URL}/memories/store",
                    json={"user_id": user_id, "fact": fact.strip(), "source_room": room_id},
                )
                # Only claim success when the memory service accepted the fact.
                if store_resp.is_success:
                    logger.info("Memory stored for %s: %s", user_id, fact[:80])
                else:
                    logger.warning("Memory store failed for %s (HTTP %s)",
                                   user_id, store_resp.status_code)
    except Exception as exc:
        logger.warning("Voice memory extraction failed: %s", exc)
 def _build_e2ee_options() -> rtc.E2EEOptions:
    """Build E2EE options — let Rust FFI apply HKDF internally (KDF_HKDF=1).
@@ -133,7 +220,8 @@ def _build_e2ee_options() -> rtc.E2EEOptions:
 class VoiceSession:
    def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet",
-                 publish_key_cb=None, bot_key: bytes | None = None):
+                 publish_key_cb=None, bot_key: bytes | None = None,
                 memory=None, caller_user_id: str | None = None):
        self.nio_client = nio_client
        self.room_id = room_id
        self.device_id = device_id
@@ -149,6 +237,8 @@ class VoiceSession:
        self._caller_all_keys: dict = {}  # {index: bytes} — all caller keys by index
        self._bot_key: bytes = bot_key or os.urandom(16)
        self._publish_key_cb = publish_key_cb
        self._memory = memory          # MemoryClient instance from bot.py
        self._caller_user_id = caller_user_id  # Matrix user ID for memory lookup
    def on_encryption_key(self, sender, device_id, key, index):
        """Receive E2EE key from Element Call participant."""
@@ -451,6 +541,18 @@ class VoiceSession:
            if remote_identity:
                logger.info("Linking to remote participant: %s", remote_identity)
            # Load memories for this caller
            memory_section = ""
            if self._memory and self._caller_user_id:
                try:
                    mems = await self._memory.query(self._caller_user_id, "voice call", top_k=10)
                    if mems:
                        memory_section = "\n\nKontext aus früheren Gesprächen mit diesem Nutzer:\n" + \
                            "\n".join(f"- {m['fact']}" for m in mems)
                        logger.info("Loaded %d memories for %s", len(mems), self._caller_user_id)
                except Exception as exc:
                    logger.warning("Memory query failed: %s", exc)
            # Voice pipeline — George (British male, multilingual DE/EN)
            self._http_session = aiohttp.ClientSession()
            self._stt_session = aiohttp.ClientSession()  # separate session avoids WS/HTTP conflicts
@@ -468,9 +570,13 @@ class VoiceSession:
            def _on_user_state(ev):
                logger.info("VAD: user_state=%s", ev.new_state)
            _last_user_speech: list[str] = []
            @self.session.on("user_input_transcribed")
            def _on_user_speech(ev):
                """Log every transcript and buffer final ones for memory extraction.

                Buffered transcripts accumulate in ``_last_user_speech`` until
                a later handler consumes them.
                """
                logger.info("USER_SPEECH: %s", ev.transcript)
                # The event may also fire for interim STT results; buffering
                # those would duplicate partial utterances in the text handed
                # to memory extraction. Events that carry no is_final flag are
                # treated as final, which preserves the previous behaviour.
                if ev.transcript and getattr(ev, "is_final", True):
                    _last_user_speech.append(ev.transcript)
            @self.session.on("conversation_item_added")
            def _on_conversation_item(ev):
@@ -478,8 +584,30 @@ class VoiceSession:
                text = getattr(ev.item, "text_content", "") or ""
                if role == "assistant" and text:
                    logger.info("AGENT_SPEECH: %s", text)
                    if self._memory and self._caller_user_id and _last_user_speech:
                        user_text = " ".join(_last_user_speech)
                        _last_user_speech.clear()
                        asyncio.ensure_future(
                            _extract_voice_memories(user_text, text,
                                                    self._caller_user_id, self.room_id))
-            agent = Agent(instructions=_build_voice_prompt())
+            # Brave Search tool — lets the agent answer questions about current events
            @function_tool
            async def search_web(query: str) -> str:
                """Search the web for current information using Brave Search.
                Use this when asked about recent news, current events, prices,
                weather, or any information that may have changed recently.
                """
                # NOTE(review): the docstring above is presumably exposed to the
                # LLM as the tool description by @function_tool — confirm before
                # rewording it.
                logger.info("SEARCH: %s", query)
                # _brave_search reports missing keys, empty results and request
                # failures as plain text, so the result is always a string.
                result = await _brave_search(query)
                # Log only the first 200 characters of the result.
                logger.info("SEARCH_RESULT: %s", result[:200])
                return result
            agent = Agent(
                instructions=_build_voice_prompt() + memory_section,
                tools=[search_web],
            )
            io_opts = room_io.RoomOptions(
                participant_identity=remote_identity,
                close_on_disconnect=False,