revert: Restore voice.py and bot.py to last known working state (9aef846)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-21 20:47:51 +02:00
parent 463286a61e
commit 4a93827de3
2 changed files with 32 additions and 49 deletions

16
bot.py
View File

@@ -448,13 +448,12 @@ class Bot:
model = self.room_models.get(room_id, DEFAULT_MODEL) model = self.room_models.get(room_id, DEFAULT_MODEL)
caller_device_id = content.get("device_id", "") caller_device_id = content.get("device_id", "")
# Generate bot's E2EE key and publish it so Element Call # Publish a placeholder key first to trigger Element Call
# can decrypt our audio. This also triggers Element Call # to share its key with us. We'll republish the real shared
# to share its key with us. # key once we receive the caller's key.
import secrets import secrets
bot_key = secrets.token_bytes(16) placeholder_key = secrets.token_bytes(16)
# Publish bot's key early so Element Call can decrypt our audio await self._publish_encryption_key(room_id, placeholder_key)
await self._publish_encryption_key(room_id, bot_key)
vs = VoiceSession( vs = VoiceSession(
nio_client=self.client, nio_client=self.client,
@@ -462,9 +461,8 @@ class Bot:
device_id=BOT_DEVICE_ID, device_id=BOT_DEVICE_ID,
lk_url=LK_URL, lk_url=LK_URL,
model=model, model=model,
publish_key_cb=lambda key, rid=room_id: asyncio.ensure_future( publish_key_cb=lambda key: asyncio.ensure_future(
self._publish_encryption_key(rid, key)), self._publish_encryption_key(room_id, key)),
bot_key=bot_key,
) )
# Check timeline for caller's key # Check timeline for caller's key

65
voice.py
View File

@@ -24,16 +24,12 @@ LITELLM_KEY = os.environ.get("LITELLM_API_KEY", "not-needed")
LK_API_KEY = os.environ.get("LIVEKIT_API_KEY", "") LK_API_KEY = os.environ.get("LIVEKIT_API_KEY", "")
LK_API_SECRET = os.environ.get("LIVEKIT_API_SECRET", "") LK_API_SECRET = os.environ.get("LIVEKIT_API_SECRET", "")
ELEVENLABS_KEY = os.environ.get("ELEVENLABS_API_KEY", "") ELEVENLABS_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
DEFAULT_VOICE_ID = "JBFqnCBsd6RMkjVDRZzb" # George - warm, captivating, British male DEFAULT_VOICE_ID = "onwK4e9ZLuTAKqWW03F9" # Daniel - male, free tier
VOICE_PROMPT_TEMPLATE = """Du bist ein hilfreicher Sprachassistent von Agiliton in einem Matrix-Anruf. VOICE_PROMPT = """Du bist ein hilfreicher Sprachassistent in einem Matrix-Anruf.
Du heisst "Agiliton Assistant". Du basierst auf dem Modell {model}.
Wenn jemand fragt welches Modell du bist, sei transparent und sage es.
Aktuelle Zeit: {datetime}
STRIKTE Regeln: STRIKTE Regeln:
- Antworte in der Sprache in der der Nutzer spricht - Antworte IMMER auf Deutsch
- Halte JEDE Antwort auf MAXIMAL 1-2 kurze Saetze - Halte JEDE Antwort auf MAXIMAL 1-2 kurze Saetze
- Sei direkt und praezise, keine Fuellwoerter - Sei direkt und praezise, keine Fuellwoerter
- Erfinde NICHTS - keine Geschichten, keine Musik, keine Fantasie - Erfinde NICHTS - keine Geschichten, keine Musik, keine Fantasie
@@ -91,7 +87,7 @@ def _build_e2ee_options(shared_key: bytes) -> rtc.E2EEOptions:
class VoiceSession: class VoiceSession:
def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet", def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet",
publish_key_cb=None, bot_key: bytes | None = None): publish_key_cb=None):
self.nio_client = nio_client self.nio_client = nio_client
self.room_id = room_id self.room_id = room_id
self.device_id = device_id self.device_id = device_id
@@ -101,26 +97,20 @@ class VoiceSession:
self.session = None self.session = None
self._task = None self._task = None
self._http_session = None self._http_session = None
self._caller_key: bytes | None = None self._e2ee_key: bytes | None = None
self._caller_identity: str | None = None # "sender:device_id" format
self._bot_key: bytes = bot_key or os.urandom(16)
self._publish_key_cb = publish_key_cb self._publish_key_cb = publish_key_cb
def on_encryption_key(self, sender, device_id, key, index): def on_encryption_key(self, sender, device_id, key, index):
"""Receive E2EE key from Element Call participant.""" """Receive E2EE key from Element Call participant."""
if not key: if key and not self._e2ee_key:
return self._e2ee_key = key
identity = _make_lk_identity(sender, device_id) logger.info("E2EE key received from %s:%s (index=%d, %d bytes)",
self._caller_key = key sender, device_id, index, len(key))
self._caller_identity = identity
logger.info("E2EE key received from %s:%s (identity=%s, index=%d, %d bytes)",
sender, device_id, identity, index, len(key))
async def _fetch_encryption_key_http(self) -> bytes | None: async def _fetch_encryption_key_http(self) -> bytes | None:
"""Fetch encryption key from room timeline (NOT state) via Matrix HTTP API. """Fetch encryption key from room timeline (NOT state) via Matrix HTTP API.
Element Call distributes encryption keys as timeline events, not state. Element Call distributes encryption keys as timeline events, not state.
Also sets self._caller_identity from the event sender + device_id.
""" """
import httpx import httpx
homeserver = str(self.nio_client.homeserver) homeserver = str(self.nio_client.homeserver)
@@ -144,16 +134,14 @@ class VoiceSession:
if sender == user_id: if sender == user_id:
continue # skip our own key continue # skip our own key
content = evt.get("content", {}) content = evt.get("content", {})
caller_device = content.get("device_id", "") logger.info("Found encryption_keys timeline event: sender=%s content=%s",
logger.info("Found encryption_keys timeline event: sender=%s device=%s", sender, content)
sender, caller_device)
if caller_device:
self._caller_identity = _make_lk_identity(sender, caller_device)
for k in content.get("keys", []): for k in content.get("keys", []):
key_b64 = k.get("key", "") key_b64 = k.get("key", "")
if key_b64: if key_b64:
key_b64 += "=" * (-len(key_b64) % 4) key_b64 += "=" * (-len(key_b64) % 4)
return base64.urlsafe_b64decode(key_b64) import base64 as b64
return b64.urlsafe_b64decode(key_b64)
logger.info("No encryption_keys events in last %d timeline events", len(events)) logger.info("No encryption_keys events in last %d timeline events", len(events))
except Exception as e: except Exception as e:
logger.warning("HTTP encryption key fetch failed: %s", e) logger.warning("HTTP encryption key fetch failed: %s", e)
@@ -189,26 +177,25 @@ class VoiceSession:
# Check timeline for caller's encryption key # Check timeline for caller's encryption key
caller_key = await self._fetch_encryption_key_http() caller_key = await self._fetch_encryption_key_http()
if caller_key: if caller_key:
self._caller_key = caller_key self._e2ee_key = caller_key
logger.info("Got caller E2EE key via timeline (%d bytes)", len(caller_key)) logger.info("Got caller E2EE key via timeline (%d bytes)", len(caller_key))
if not self._caller_key: if not self._e2ee_key:
# Wait up to 15s for key via sync handler (bot.py forwards # Wait up to 15s for key via sync handler (bot.py forwards
# encryption_keys timeline events to on_encryption_key) # encryption_keys timeline events to on_encryption_key)
logger.info("No key in timeline yet, waiting for sync...") logger.info("No key in timeline yet, waiting for sync...")
for _ in range(150): for _ in range(150):
if self._caller_key: if self._e2ee_key:
break break
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
# Publish bot key so Element Call sees us as an E2EE participant # E2EE disabled — Element Call key derivation mismatch not yet resolved.
if self._publish_key_cb: # Audio pipeline confirmed working without E2EE.
self._publish_key_cb(self._bot_key) if self._e2ee_key:
logger.info("Caller E2EE key available (%d bytes) — E2EE disabled pending fix",
# E2EE disabled at LiveKit level — Element Call per-participant key len(self._e2ee_key))
# mode not yet compatible with LiveKit Python SDK shared key mode. if self._publish_key_cb:
# Audio works without E2EE; Element Call still shows encryption self._publish_key_cb(self._e2ee_key)
# indicator based on Matrix timeline key exchange.
e2ee_opts = None e2ee_opts = None
room_opts = rtc.RoomOptions(e2ee=e2ee_opts) room_opts = rtc.RoomOptions(e2ee=e2ee_opts)
@@ -227,7 +214,7 @@ class VoiceSession:
logger.info("Track sub: %s %s kind=%s", p.identity, pub.sid, t.kind) logger.info("Track sub: %s %s kind=%s", p.identity, pub.sid, t.kind)
await self.lk_room.connect(self.lk_url, jwt, options=room_opts) await self.lk_room.connect(self.lk_url, jwt, options=room_opts)
logger.info("Connected (E2EE=shared key), remote=%d", logger.info("Connected (E2EE=HKDF), remote=%d",
len(self.lk_room.remote_participants)) len(self.lk_room.remote_participants))
# Find the remote participant, wait up to 10s if not yet connected # Find the remote participant, wait up to 10s if not yet connected
@@ -267,9 +254,7 @@ class VoiceSession:
def _on_agent_speech(msg): def _on_agent_speech(msg):
logger.info("AGENT_SPEECH: %s", msg.text_content) logger.info("AGENT_SPEECH: %s", msg.text_content)
now = datetime.datetime.now(datetime.timezone.utc).strftime("%A, %B %d, %Y %H:%M UTC") agent = Agent(instructions=VOICE_PROMPT)
prompt = VOICE_PROMPT_TEMPLATE.format(model=self.model, datetime=now)
agent = Agent(instructions=prompt)
io_opts = room_io.RoomOptions( io_opts = room_io.RoomOptions(
participant_identity=remote_identity, participant_identity=remote_identity,
) if remote_identity else room_io.RoomOptions() ) if remote_identity else room_io.RoomOptions()