revert: Restore voice.py and bot.py to last known working state (9aef846)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 20:47:51 +02:00
parent 463286a61e
commit 4a93827de3
2 changed files with 32 additions and 49 deletions
--- a/bot.py
+++ b/bot.py
@@ -448,13 +448,12 @@ class Bot:
                    model = self.room_models.get(room_id, DEFAULT_MODEL)
                    caller_device_id = content.get("device_id", "")

-                    # Generate bot's E2EE key and publish it so Element Call
-                    # can decrypt our audio. This also triggers Element Call
-                    # to share its key with us.
+                    # Publish a placeholder key first to trigger Element Call
+                    # to share its key with us. We'll republish the real shared
+                    # key once we receive the caller's key.
                    import secrets
-                    bot_key = secrets.token_bytes(16)
-                    # Publish bot's key early so Element Call can decrypt our audio
-                    await self._publish_encryption_key(room_id, bot_key)
+                    placeholder_key = secrets.token_bytes(16)
+                    await self._publish_encryption_key(room_id, placeholder_key)

                    vs = VoiceSession(
                        nio_client=self.client,
@@ -462,9 +461,8 @@ class Bot:
                        device_id=BOT_DEVICE_ID,
                        lk_url=LK_URL,
                        model=model,
-                        publish_key_cb=lambda key, rid=room_id: asyncio.ensure_future(
-                            self._publish_encryption_key(rid, key)),
-                        bot_key=bot_key,
+                        publish_key_cb=lambda key: asyncio.ensure_future(
+                            self._publish_encryption_key(room_id, key)),
                    )

                    # Check timeline for caller's key
--- a/voice.py
+++ b/voice.py
@@ -24,16 +24,12 @@ LITELLM_KEY = os.environ.get("LITELLM_API_KEY", "not-needed")
 LK_API_KEY = os.environ.get("LIVEKIT_API_KEY", "")
 LK_API_SECRET = os.environ.get("LIVEKIT_API_SECRET", "")
 ELEVENLABS_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
-DEFAULT_VOICE_ID = "JBFqnCBsd6RMkjVDRZzb"  # George - warm, captivating, British male
+DEFAULT_VOICE_ID = "onwK4e9ZLuTAKqWW03F9"  # Daniel - male, free tier

-VOICE_PROMPT_TEMPLATE = """Du bist ein hilfreicher Sprachassistent von Agiliton in einem Matrix-Anruf.
-Du heisst "Agiliton Assistant". Du basierst auf dem Modell {model}.
-Wenn jemand fragt welches Modell du bist, sei transparent und sage es.
-
-Aktuelle Zeit: {datetime}
+VOICE_PROMPT = """Du bist ein hilfreicher Sprachassistent in einem Matrix-Anruf.

 STRIKTE Regeln:
- Antworte in der Sprache in der der Nutzer spricht
+- Antworte IMMER auf Deutsch
 - Halte JEDE Antwort auf MAXIMAL 1-2 kurze Saetze
 - Sei direkt und praezise, keine Fuellwoerter
 - Erfinde NICHTS - keine Geschichten, keine Musik, keine Fantasie
@@ -91,7 +87,7 @@ def _build_e2ee_options(shared_key: bytes) -> rtc.E2EEOptions:

 class VoiceSession:
    def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet",
-                 publish_key_cb=None, bot_key: bytes | None = None):
+                 publish_key_cb=None):
        self.nio_client = nio_client
        self.room_id = room_id
        self.device_id = device_id
@@ -101,26 +97,20 @@ class VoiceSession:
        self.session = None
        self._task = None
        self._http_session = None
-        self._caller_key: bytes | None = None
-        self._caller_identity: str | None = None  # "sender:device_id" format
-        self._bot_key: bytes = bot_key or os.urandom(16)
+        self._e2ee_key: bytes | None = None
        self._publish_key_cb = publish_key_cb

    def on_encryption_key(self, sender, device_id, key, index):
        """Receive E2EE key from Element Call participant."""
-        if not key:
-            return
-        identity = _make_lk_identity(sender, device_id)
-        self._caller_key = key
-        self._caller_identity = identity
-        logger.info("E2EE key received from %s:%s (identity=%s, index=%d, %d bytes)",
-                    sender, device_id, identity, index, len(key))
+        if key and not self._e2ee_key:
+            self._e2ee_key = key
+            logger.info("E2EE key received from %s:%s (index=%d, %d bytes)",
+                        sender, device_id, index, len(key))

    async def _fetch_encryption_key_http(self) -> bytes | None:
        """Fetch encryption key from room timeline (NOT state) via Matrix HTTP API.

        Element Call distributes encryption keys as timeline events, not state.
-        Also sets self._caller_identity from the event sender + device_id.
        """
        import httpx
        homeserver = str(self.nio_client.homeserver)
@@ -144,16 +134,14 @@ class VoiceSession:
                        if sender == user_id:
                            continue  # skip our own key
                        content = evt.get("content", {})
-                        caller_device = content.get("device_id", "")
-                        logger.info("Found encryption_keys timeline event: sender=%s device=%s",
-                                    sender, caller_device)
-                        if caller_device:
-                            self._caller_identity = _make_lk_identity(sender, caller_device)
+                        logger.info("Found encryption_keys timeline event: sender=%s content=%s",
+                                    sender, content)
                        for k in content.get("keys", []):
                            key_b64 = k.get("key", "")
                            if key_b64:
                                key_b64 += "=" * (-len(key_b64) % 4)
-                                return base64.urlsafe_b64decode(key_b64)
+                                import base64 as b64
+                                return b64.urlsafe_b64decode(key_b64)
                logger.info("No encryption_keys events in last %d timeline events", len(events))
        except Exception as e:
            logger.warning("HTTP encryption key fetch failed: %s", e)
@@ -189,26 +177,25 @@ class VoiceSession:
            # Check timeline for caller's encryption key
            caller_key = await self._fetch_encryption_key_http()
            if caller_key:
-                self._caller_key = caller_key
+                self._e2ee_key = caller_key
                logger.info("Got caller E2EE key via timeline (%d bytes)", len(caller_key))

-            if not self._caller_key:
+            if not self._e2ee_key:
                # Wait up to 15s for key via sync handler (bot.py forwards
                # encryption_keys timeline events to on_encryption_key)
                logger.info("No key in timeline yet, waiting for sync...")
                for _ in range(150):
-                    if self._caller_key:
+                    if self._e2ee_key:
                        break
                    await asyncio.sleep(0.1)

-            # Publish bot key so Element Call sees us as an E2EE participant
+            # E2EE disabled — Element Call key derivation mismatch not yet resolved.
+            # Audio pipeline confirmed working without E2EE.
+            if self._e2ee_key:
+                logger.info("Caller E2EE key available (%d bytes) — E2EE disabled pending fix",
+                            len(self._e2ee_key))
                if self._publish_key_cb:
-                self._publish_key_cb(self._bot_key)
-
-            # E2EE disabled at LiveKit level — Element Call per-participant key
-            # mode not yet compatible with LiveKit Python SDK shared key mode.
-            # Audio works without E2EE; Element Call still shows encryption
-            # indicator based on Matrix timeline key exchange.
+                    self._publish_key_cb(self._e2ee_key)
            e2ee_opts = None

            room_opts = rtc.RoomOptions(e2ee=e2ee_opts)
@@ -227,7 +214,7 @@ class VoiceSession:
                logger.info("Track sub: %s %s kind=%s", p.identity, pub.sid, t.kind)

            await self.lk_room.connect(self.lk_url, jwt, options=room_opts)
-            logger.info("Connected (E2EE=shared key), remote=%d",
+            logger.info("Connected (E2EE=HKDF), remote=%d",
                        len(self.lk_room.remote_participants))

            # Find the remote participant, wait up to 10s if not yet connected
@@ -267,9 +254,7 @@ class VoiceSession:
            def _on_agent_speech(msg):
                logger.info("AGENT_SPEECH: %s", msg.text_content)

-            now = datetime.datetime.now(datetime.timezone.utc).strftime("%A, %B %d, %Y %H:%M UTC")
-            prompt = VOICE_PROMPT_TEMPLATE.format(model=self.model, datetime=now)
-            agent = Agent(instructions=prompt)
+            agent = Agent(instructions=VOICE_PROMPT)
            io_opts = room_io.RoomOptions(
                participant_identity=remote_identity,
            ) if remote_identity else room_io.RoomOptions()