fix(e2ee): pre-derive HKDF keys in Python instead of Rust FFI (MAT-144)

Switch from Rust-side HKDF (KDF_HKDF=1) to Python-side HKDF derivation
with raw key mode (KDF_RAW=0). This eliminates potential HKDF implementation
mismatches between Rust FFI and Element Call JS that caused video frame
decryption failures (audio worked, video showed 8x8 garbage frames).

Changes:
- Add _derive_and_set_key() helper that pre-derives HKDF then calls set_key()
- Set key_derivation_function=KDF_RAW (proto value 0 is nominally PBKDF2, but the Rust FFI applies no KDF for it, so keys are used as-is)
- Replace all direct set_key() calls with _derive_and_set_key()
- Add per-track diagnostic logging (audio vs video)
- Add frame size check in look_at_screen (detect E2EE failure early)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-03-09 17:05:59 +02:00
parent 19a973b9eb
commit d586ddfa6d

105
voice.py
View File

@@ -190,10 +190,10 @@ KDF_HKDF = 1 # Rust FFI applies HKDF internally (proto enum value 1)
def _hkdf_derive(ikm: bytes) -> bytes:
"""Pre-derive AES key via HKDF-SHA256 matching livekit-client-sdk-js deriveEncryptionKey().
"""Pre-derive AES-128 key via HKDF-SHA256, matching Element Call JS deriveEncryptionKey().
JS params: hash=SHA-256, salt=encode("LKFrameEncryptionKey"), info=ArrayBuffer(128), length=128bit
We set this pre-derived key via set_shared_key() which bypasses Rust FFI KDF entirely.
Result is passed to set_key() with KDF_RAW so Rust uses it as-is (no double-derivation).
"""
import hmac
salt = b"LKFrameEncryptionKey"
@@ -208,7 +208,7 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]:
EC JS ratchet: new_raw = HMAC(key=current_raw, data="LKFrameEncryptionKey")[:16]
Returns {index: raw_key} for all indices 0..count-1.
Set these via set_key(identity, raw, index) with KDF_HKDF=1 so Rust applies HKDF.
Each raw key is then HKDF-derived before passing to set_key() with KDF_RAW.
"""
import hmac as _hmac
keys = {}
@@ -219,6 +219,20 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]:
return keys
def _derive_and_set_key(kp, identity: str, raw_key: bytes, index: int) -> None:
    """Pre-derive the HKDF key in Python, then set it via the KeyProvider (MAT-144).

    This replaces direct ``set_key(identity, raw_key, index)`` calls: the raw
    key is first run through ``_hkdf_derive()`` and the derived bytes are what
    gets handed to ``kp.set_key()``.

    Args:
        kp: Key provider object exposing ``set_key(identity, key, index)``.
        identity: Participant identity the key belongs to.
        raw_key: Raw (un-derived) key bytes.
        index: Key index passed through to ``set_key()``.

    Security note: key material must never reach the logs. Only truncated
    SHA-256 fingerprints of the raw and derived keys are logged, which is
    enough to correlate keys across log lines without exposing them.
    """
    import hashlib

    derived = _hkdf_derive(raw_key)
    ok = kp.set_key(identity, derived, index)

    # Fingerprints instead of key bytes: the derived value is the actual AES
    # frame key, so logging it (even at debug level) would break E2EE for
    # anyone with log access.
    raw_fp = hashlib.sha256(raw_key).hexdigest()[:8]
    derived_fp = hashlib.sha256(derived).hexdigest()[:8]
    logger.debug("set_key[%d] %s: raw_fp=%s → derived_fp=%s (ok=%s)",
                 index, identity, raw_fp, derived_fp, ok)
async def _brave_search(query: str, count: int = 5) -> str:
"""Call Brave Search API and return formatted results."""
if not BRAVE_API_KEY:
@@ -429,21 +443,24 @@ async def _confluence_recent_pages(limit: int = 5) -> list[dict]:
return results
def _build_e2ee_options() -> rtc.E2EEOptions:
"""Build E2EE options — let Rust FFI apply HKDF internally (KDF_HKDF=1).
KDF_RAW = 0 # proto value 0 = PBKDF2 in proto, but Rust returns None (= no KDF = raw key)
Pass raw base keys from Matrix key exchange events directly to set_key().
The Rust FFI derives the AES frame key via HKDF(base_key, ratchetSalt, ...) internally.
Element Call uses: ratchetWindowSize=10, keyringSize=256, ratchetSalt="LKFrameEncryptionKey"
NOTE: proto value 0 = PBKDF2 (not raw/none) — must use KDF_HKDF=1.
def _build_e2ee_options() -> rtc.E2EEOptions:
"""Build E2EE options — Python pre-derives HKDF keys, Rust uses them raw (MAT-144).
We pre-derive AES keys via _hkdf_derive() in Python (matching EC's JS deriveEncryptionKey),
then pass derived keys to set_key() with KDF_RAW (proto 0 = no Rust-side derivation).
This eliminates any HKDF implementation mismatch between Rust FFI and Element Call JS.
Ratcheted keys are also pre-derived in Python via _ratchet_keys().
"""
key_opts = rtc.KeyProviderOptions(
shared_key=b"", # empty = per-participant mode
ratchet_window_size=10,
ratchet_window_size=0, # we handle ratcheting in Python
ratchet_salt=b"LKFrameEncryptionKey",
failure_tolerance=10,
key_ring_size=256,
key_derivation_function=KDF_HKDF, # Rust FFI applies HKDF; we pass raw base keys
key_derivation_function=KDF_RAW, # no Rust-side KDF; keys are pre-derived in Python
)
return rtc.E2EEOptions(
encryption_type=rtc.EncryptionType.GCM,
@@ -504,9 +521,8 @@ class VoiceSession:
if has_subscribed:
try:
kp = self.lk_room.e2ee_manager.key_provider
ok = kp.set_key(p.identity, key, index)
logger.info("Late key set_key[%d] for %s (ok=%s)",
index, p.identity, ok)
_derive_and_set_key(kp, p.identity, key, index)
logger.info("Late key[%d] set for %s", index, p.identity)
except Exception as e:
logger.warning("Late key set_key failed: %s", e)
break
@@ -666,11 +682,8 @@ class VoiceSession:
# NOTE: Do NOT create rtc.AudioStream here — it competes with AgentSession's
# internal audio pipeline for event loop time, causing intermittent VAD failures
# (user_state stuck on "away"). See MAT-40. Use e2ee_state_changed for flow confirmation.
# *** KEY FIX: set_key() with KDF_HKDF only applies HKDF when the frame cryptor
# for this participant already exists. The frame cryptor is created at track
# subscription time. Calling set_key() BEFORE track subscription (at connect)
# skips HKDF derivation → raw key stored → DEC_FAILED.
# Solution: set caller key HERE, after frame cryptor is initialized.
# MAT-144: Pre-derive HKDF in Python, pass derived key with KDF_RAW.
# This ensures exact HKDF match with Element Call JS for both audio AND video.
# Store video track for on-demand vision (look_at_screen tool)
# Screen share = source "screen_share" or "screenshare"; camera = "camera" or default
if int(t.kind) == 2: # video track (LiveKit: 1=audio, 2=video)
@@ -679,15 +692,16 @@ class VoiceSession:
logger.info("Video track stored from %s source=%s for on-demand vision", p.identity, track_source)
if int(t.kind) in (1, 2) and e2ee_opts is not None: # audio + video tracks
caller_id = p.identity
logger.info("E2EE_DIAG: track_subscribed for %s, have %d caller keys",
caller_id, len(self._caller_all_keys))
track_type = "video" if int(t.kind) == 2 else "audio"
logger.info("E2EE_DIAG: track_subscribed %s for %s, have %d caller keys",
track_type, caller_id, len(self._caller_all_keys))
try:
kp_local = self.lk_room.e2ee_manager.key_provider
if self._caller_all_keys:
for idx, base_k in sorted(self._caller_all_keys.items()):
ok = kp_local.set_key(caller_id, base_k, idx)
logger.info("on_ts: set_key[%d] for %s (ok=%s, %d bytes, raw=%s)",
idx, caller_id, ok, len(base_k), base_k.hex())
_derive_and_set_key(kp_local, caller_id, base_k, idx)
logger.info("on_ts: derived+set key[%d] for %s (%s track)",
idx, caller_id, track_type)
else:
logger.warning("on_ts: no caller keys yet — scheduling 0.5s retry")
async def _brief_key_retry(pid=caller_id):
@@ -696,8 +710,8 @@ class VoiceSession:
try:
kp_r = self.lk_room.e2ee_manager.key_provider
for idx, base_k in sorted(self._caller_all_keys.items()):
ok = kp_r.set_key(pid, base_k, idx)
logger.info("on_ts_retry: set_key[%d] for %s (ok=%s)", idx, pid, ok)
_derive_and_set_key(kp_r, pid, base_k, idx)
logger.info("on_ts_retry: derived+set key[%d] for %s", idx, pid)
except Exception as exc:
logger.warning("on_ts_retry: set_key failed: %s", exc)
else:
@@ -718,21 +732,21 @@ class VoiceSession:
try:
kp_e = self.lk_room.e2ee_manager.key_provider
for idx, base_k in sorted(self._caller_all_keys.items()):
ok = kp_e.set_key(p_id, base_k, idx)
logger.info("e2ee_state set_key[%d] for %s (ok=%s)", idx, p_id, ok)
_derive_and_set_key(kp_e, p_id, base_k, idx)
logger.info("e2ee_state: derived+set key[%d] for %s on %s",
idx, p_id, state_name)
except Exception as exc:
logger.warning("e2ee_state set_key failed: %s", exc)
await self.lk_room.connect(self.lk_url, jwt, options=room_opts)
logger.info("Connected (E2EE=HKDF), remote=%d",
logger.info("Connected (E2EE=Python-HKDF+RAW), remote=%d",
len(self.lk_room.remote_participants))
# Set bot's own key immediately after connect — local frame cryptor exists at connect time.
# CALLER keys are set in on_track_subscribed (NOT here) because the caller's frame cryptor
# is only created when their track arrives. Calling set_key() before that skips HKDF.
# Pre-derive via HKDF in Python since KDF_RAW is set (no Rust-side derivation).
kp = self.lk_room.e2ee_manager.key_provider
ok = kp.set_key(bot_identity, self._bot_key, 0)
logger.info("Set bot key for %s (ok=%s, %d bytes)", bot_identity, ok, len(self._bot_key))
_derive_and_set_key(kp, bot_identity, self._bot_key, 0)
logger.info("Set bot derived key for %s (%d raw bytes)", bot_identity, len(self._bot_key))
# Element Call rotates its key when bot joins. Wait up to 3s for the
# rotated key to arrive via nio sync before proceeding. If it arrives,
@@ -768,18 +782,14 @@ class VoiceSession:
if remote_identity:
break
# Set shared_key with pre-derived AES key for caller decryption.
# NOT using set_key() for caller — Rust HKDF may produce different result than EC's JS HKDF.
# set_shared_key() stores key raw (no KDF applied) — we pre-derive in Python.
# After key rotation wait: if track already subscribed, set rotated key.
# (Usually on_track_subscribed handles this, but if track arrived before rotation,
# the rotated key needs to be set here for the already-subscribed participant.)
# Post-rotation: set caller keys with Python-derived HKDF (MAT-144).
# If track already subscribed, on_track_subscribed already set keys.
# This catches the case where track arrived before key rotation completed.
if self._caller_all_keys and remote_identity:
try:
for idx, base_k in sorted(self._caller_all_keys.items()):
ok = kp.set_key(remote_identity, base_k, idx)
logger.info("Post-rotation set_key[%d] for %s (ok=%s)",
idx, remote_identity, ok)
_derive_and_set_key(kp, remote_identity, base_k, idx)
logger.info("Post-rotation derived+set key[%d] for %s", idx, remote_identity)
except Exception as e:
logger.warning("Post-rotation set_key failed: %s", e)
elif not self._caller_all_keys:
@@ -1169,6 +1179,11 @@ class VoiceSession:
buf = io.BytesIO()
img.convert("RGB").save(buf, format="JPEG", quality=85)
img_b64 = base64.b64encode(buf.getvalue()).decode()
if rgba.width <= 16 or rgba.height <= 16:
logger.warning("LOOK_AT_SCREEN: frame %dx%d — E2EE decryption likely failed (garbage frame)",
rgba.width, rgba.height)
return ("E2EE Video-Entschluesselung fehlgeschlagen — das Bild ist nur "
f"{rgba.width}x{rgba.height} Pixel. Bitte Bildschirmfreigabe neu starten.")
logger.info("LOOK_AT_SCREEN: captured %dx%d frame (%d KB JPEG)",
rgba.width, rgba.height, len(buf.getvalue()) // 1024)
@@ -1305,9 +1320,9 @@ class VoiceSession:
kp_w = self.lk_room.e2ee_manager.key_provider
for p in self.lk_room.remote_participants.values():
for idx, base_k in sorted(self._caller_all_keys.items()):
ok = kp_w.set_key(p.identity, base_k, idx)
logger.info("VAD_WATCHDOG: recovery set_key[%d] for %s (ok=%s)",
idx, p.identity, ok)
_derive_and_set_key(kp_w, p.identity, base_k, idx)
logger.info("VAD_WATCHDOG: recovery derived+set key[%d] for %s",
idx, p.identity)
except Exception as exc:
logger.warning("VAD_WATCHDOG: recovery set_key failed: %s", exc)
_vad_state_log["away_since"] = None # only warn once per stuck period