revert: Restore voice.py and bot.py to last known working state (9aef846)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-21 20:47:51 +02:00
parent 463286a61e
commit 4a93827de3
2 changed files with 32 additions and 49 deletions

16
bot.py
View File

@@ -448,13 +448,12 @@ class Bot:
model = self.room_models.get(room_id, DEFAULT_MODEL)
caller_device_id = content.get("device_id", "")
# Generate bot's E2EE key and publish it so Element Call
# can decrypt our audio. This also triggers Element Call
# to share its key with us.
# Publish a placeholder key first to trigger Element Call
# to share its key with us. We'll republish the real shared
# key once we receive the caller's key.
import secrets
bot_key = secrets.token_bytes(16)
# Publish bot's key early so Element Call can decrypt our audio
await self._publish_encryption_key(room_id, bot_key)
placeholder_key = secrets.token_bytes(16)
await self._publish_encryption_key(room_id, placeholder_key)
vs = VoiceSession(
nio_client=self.client,
@@ -462,9 +461,8 @@ class Bot:
device_id=BOT_DEVICE_ID,
lk_url=LK_URL,
model=model,
publish_key_cb=lambda key, rid=room_id: asyncio.ensure_future(
self._publish_encryption_key(rid, key)),
bot_key=bot_key,
publish_key_cb=lambda key: asyncio.ensure_future(
self._publish_encryption_key(room_id, key)),
)
# Check timeline for caller's key

View File

@@ -24,16 +24,12 @@ LITELLM_KEY = os.environ.get("LITELLM_API_KEY", "not-needed")
LK_API_KEY = os.environ.get("LIVEKIT_API_KEY", "")
LK_API_SECRET = os.environ.get("LIVEKIT_API_SECRET", "")
ELEVENLABS_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
DEFAULT_VOICE_ID = "JBFqnCBsd6RMkjVDRZzb" # George - warm, captivating, British male
DEFAULT_VOICE_ID = "onwK4e9ZLuTAKqWW03F9" # Daniel - male, free tier
VOICE_PROMPT_TEMPLATE = """Du bist ein hilfreicher Sprachassistent von Agiliton in einem Matrix-Anruf.
Du heisst "Agiliton Assistant". Du basierst auf dem Modell {model}.
Wenn jemand fragt welches Modell du bist, sei transparent und sage es.
Aktuelle Zeit: {datetime}
VOICE_PROMPT = """Du bist ein hilfreicher Sprachassistent in einem Matrix-Anruf.
STRIKTE Regeln:
- Antworte in der Sprache in der der Nutzer spricht
- Antworte IMMER auf Deutsch
- Halte JEDE Antwort auf MAXIMAL 1-2 kurze Saetze
- Sei direkt und praezise, keine Fuellwoerter
- Erfinde NICHTS - keine Geschichten, keine Musik, keine Fantasie
@@ -91,7 +87,7 @@ def _build_e2ee_options(shared_key: bytes) -> rtc.E2EEOptions:
class VoiceSession:
def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet",
publish_key_cb=None, bot_key: bytes | None = None):
publish_key_cb=None):
self.nio_client = nio_client
self.room_id = room_id
self.device_id = device_id
@@ -101,26 +97,20 @@ class VoiceSession:
self.session = None
self._task = None
self._http_session = None
self._caller_key: bytes | None = None
self._caller_identity: str | None = None # "sender:device_id" format
self._bot_key: bytes = bot_key or os.urandom(16)
self._e2ee_key: bytes | None = None
self._publish_key_cb = publish_key_cb
def on_encryption_key(self, sender, device_id, key, index):
"""Receive E2EE key from Element Call participant."""
if not key:
return
identity = _make_lk_identity(sender, device_id)
self._caller_key = key
self._caller_identity = identity
logger.info("E2EE key received from %s:%s (identity=%s, index=%d, %d bytes)",
sender, device_id, identity, index, len(key))
if key and not self._e2ee_key:
self._e2ee_key = key
logger.info("E2EE key received from %s:%s (index=%d, %d bytes)",
sender, device_id, index, len(key))
async def _fetch_encryption_key_http(self) -> bytes | None:
"""Fetch encryption key from room timeline (NOT state) via Matrix HTTP API.
Element Call distributes encryption keys as timeline events, not state.
Also sets self._caller_identity from the event sender + device_id.
"""
import httpx
homeserver = str(self.nio_client.homeserver)
@@ -144,16 +134,14 @@ class VoiceSession:
if sender == user_id:
continue # skip our own key
content = evt.get("content", {})
caller_device = content.get("device_id", "")
logger.info("Found encryption_keys timeline event: sender=%s device=%s",
sender, caller_device)
if caller_device:
self._caller_identity = _make_lk_identity(sender, caller_device)
logger.info("Found encryption_keys timeline event: sender=%s content=%s",
sender, content)
for k in content.get("keys", []):
key_b64 = k.get("key", "")
if key_b64:
key_b64 += "=" * (-len(key_b64) % 4)
return base64.urlsafe_b64decode(key_b64)
import base64 as b64
return b64.urlsafe_b64decode(key_b64)
logger.info("No encryption_keys events in last %d timeline events", len(events))
except Exception as e:
logger.warning("HTTP encryption key fetch failed: %s", e)
@@ -189,26 +177,25 @@ class VoiceSession:
# Check timeline for caller's encryption key
caller_key = await self._fetch_encryption_key_http()
if caller_key:
self._caller_key = caller_key
self._e2ee_key = caller_key
logger.info("Got caller E2EE key via timeline (%d bytes)", len(caller_key))
if not self._caller_key:
if not self._e2ee_key:
# Wait up to 15s for key via sync handler (bot.py forwards
# encryption_keys timeline events to on_encryption_key)
logger.info("No key in timeline yet, waiting for sync...")
for _ in range(150):
if self._caller_key:
if self._e2ee_key:
break
await asyncio.sleep(0.1)
# Publish bot key so Element Call sees us as an E2EE participant
# E2EE disabled — Element Call key derivation mismatch not yet resolved.
# Audio pipeline confirmed working without E2EE.
if self._e2ee_key:
logger.info("Caller E2EE key available (%d bytes) — E2EE disabled pending fix",
len(self._e2ee_key))
if self._publish_key_cb:
self._publish_key_cb(self._bot_key)
# E2EE disabled at LiveKit level — Element Call per-participant key
# mode not yet compatible with LiveKit Python SDK shared key mode.
# Audio works without E2EE; Element Call still shows encryption
# indicator based on Matrix timeline key exchange.
self._publish_key_cb(self._e2ee_key)
e2ee_opts = None
room_opts = rtc.RoomOptions(e2ee=e2ee_opts)
@@ -227,7 +214,7 @@ class VoiceSession:
logger.info("Track sub: %s %s kind=%s", p.identity, pub.sid, t.kind)
await self.lk_room.connect(self.lk_url, jwt, options=room_opts)
logger.info("Connected (E2EE=shared key), remote=%d",
logger.info("Connected (E2EE=HKDF), remote=%d",
len(self.lk_room.remote_participants))
# Find the remote participant, wait up to 10s if not yet connected
@@ -267,9 +254,7 @@ class VoiceSession:
def _on_agent_speech(msg):
logger.info("AGENT_SPEECH: %s", msg.text_content)
now = datetime.datetime.now(datetime.timezone.utc).strftime("%A, %B %d, %Y %H:%M UTC")
prompt = VOICE_PROMPT_TEMPLATE.format(model=self.model, datetime=now)
agent = Agent(instructions=prompt)
agent = Agent(instructions=VOICE_PROMPT)
io_opts = room_io.RoomOptions(
participant_identity=remote_identity,
) if remote_identity else room_io.RoomOptions()