From d586ddfa6d02929e35033ca5e23d79b319721fdc Mon Sep 17 00:00:00 2001 From: Christian Gick Date: Mon, 9 Mar 2026 17:05:59 +0200 Subject: [PATCH] fix(e2ee): pre-derive HKDF keys in Python instead of Rust FFI (MAT-144) Switch from Rust-side HKDF (KDF_HKDF=1) to Python-side HKDF derivation with raw key mode (KDF_RAW=0). This eliminates potential HKDF implementation mismatches between Rust FFI and Element Call JS that caused video frame decryption failures (audio worked, video showed 8x8 garbage frames). Changes: - Add _derive_and_set_key() helper that pre-derives HKDF then calls set_key() - Set key_derivation_function=KDF_RAW (proto 0 = no Rust-side derivation) - Replace all direct set_key() calls with _derive_and_set_key() - Add per-track diagnostic logging (audio vs video) - Add frame size check in look_at_screen (detect E2EE failure early) Co-Authored-By: Claude Opus 4.6 --- voice.py | 105 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 45 deletions(-) diff --git a/voice.py b/voice.py index cb726a2..b35e003 100644 --- a/voice.py +++ b/voice.py @@ -190,10 +190,10 @@ KDF_HKDF = 1 # Rust FFI applies HKDF internally (proto enum value 1) def _hkdf_derive(ikm: bytes) -> bytes: - """Pre-derive AES key via HKDF-SHA256 matching livekit-client-sdk-js deriveEncryptionKey(). + """Pre-derive AES-128 key via HKDF-SHA256, matching Element Call JS deriveEncryptionKey(). JS params: hash=SHA-256, salt=encode("LKFrameEncryptionKey"), info=ArrayBuffer(128), length=128bit - We set this pre-derived key via set_shared_key() which bypasses Rust FFI KDF entirely. + Result is passed to set_key() with KDF_RAW so Rust uses it as-is (no double-derivation). """ import hmac salt = b"LKFrameEncryptionKey" @@ -208,7 +208,7 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]: EC JS ratchet: new_raw = HMAC(key=current_raw, data="LKFrameEncryptionKey")[:16] Returns {index: raw_key} for all indices 0..count-1. - Set these via set_key(identity, raw, index) with KDF_HKDF=1 so Rust applies HKDF. + Each raw key is then HKDF-derived before passing to set_key() with KDF_RAW. """ import hmac as _hmac keys = {} @@ -219,6 +219,20 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]: return keys +def _derive_and_set_key(kp, identity: str, raw_key: bytes, index: int) -> None: + """Pre-derive HKDF key in Python, then set via KeyProvider (MAT-144). + + This replaces direct set_key(identity, raw_key, index) calls. + Pre-derivation in Python ensures exact match with Element Call JS HKDF, + eliminating Rust FFI HKDF implementation differences that caused + video frame decryption failures (audio worked, video didn't). + """ + derived = _hkdf_derive(raw_key) + ok = kp.set_key(identity, derived, index) + logger.debug("set_key[%d] %s: raw=%s → derived=%s (ok=%s)", + index, identity, raw_key.hex()[:8], derived.hex(), ok) + + async def _brave_search(query: str, count: int = 5) -> str: """Call Brave Search API and return formatted results.""" if not BRAVE_API_KEY: @@ -429,21 +443,24 @@ async def _confluence_recent_pages(limit: int = 5) -> list[dict]: return results -def _build_e2ee_options() -> rtc.E2EEOptions: - """Build E2EE options — let Rust FFI apply HKDF internally (KDF_HKDF=1). +KDF_RAW = 0 # proto value 0 = PBKDF2 in proto, but Rust returns None (= no KDF = raw key) - Pass raw base keys from Matrix key exchange events directly to set_key(). - The Rust FFI derives the AES frame key via HKDF(base_key, ratchetSalt, ...) internally. - Element Call uses: ratchetWindowSize=10, keyringSize=256, ratchetSalt="LKFrameEncryptionKey" - NOTE: proto value 0 = PBKDF2 (not raw/none) — must use KDF_HKDF=1. + +def _build_e2ee_options() -> rtc.E2EEOptions: + """Build E2EE options — Python pre-derives HKDF keys, Rust uses them raw (MAT-144). + + We pre-derive AES keys via _hkdf_derive() in Python (matching EC's JS deriveEncryptionKey), + then pass derived keys to set_key() with KDF_RAW (proto 0 = no Rust-side derivation). + This eliminates any HKDF implementation mismatch between Rust FFI and Element Call JS. + Ratcheted keys are also pre-derived in Python via _ratchet_keys(). """ key_opts = rtc.KeyProviderOptions( shared_key=b"", # empty = per-participant mode - ratchet_window_size=10, + ratchet_window_size=0, # we handle ratcheting in Python ratchet_salt=b"LKFrameEncryptionKey", failure_tolerance=10, key_ring_size=256, - key_derivation_function=KDF_HKDF, # Rust FFI applies HKDF; we pass raw base keys + key_derivation_function=KDF_RAW, # no Rust-side KDF; keys are pre-derived in Python ) return rtc.E2EEOptions( encryption_type=rtc.EncryptionType.GCM, @@ -504,9 +521,8 @@ class VoiceSession: if has_subscribed: try: kp = self.lk_room.e2ee_manager.key_provider - ok = kp.set_key(p.identity, key, index) - logger.info("Late key set_key[%d] for %s (ok=%s)", - index, p.identity, ok) + _derive_and_set_key(kp, p.identity, key, index) + logger.info("Late key[%d] set for %s", index, p.identity) except Exception as e: logger.warning("Late key set_key failed: %s", e) break @@ -666,11 +682,8 @@ class VoiceSession: # NOTE: Do NOT create rtc.AudioStream here — it competes with AgentSession's # internal audio pipeline for event loop time, causing intermittent VAD failures # (user_state stuck on "away"). See MAT-40. Use e2ee_state_changed for flow confirmation. - # *** KEY FIX: set_key() with KDF_HKDF only applies HKDF when the frame cryptor - # for this participant already exists. The frame cryptor is created at track - # subscription time. Calling set_key() BEFORE track subscription (at connect) - # skips HKDF derivation → raw key stored → DEC_FAILED. - # Solution: set caller key HERE, after frame cryptor is initialized. + # MAT-144: Pre-derive HKDF in Python, pass derived key with KDF_RAW. + # This ensures exact HKDF match with Element Call JS for both audio AND video. # Store video track for on-demand vision (look_at_screen tool) # Screen share = source "screen_share" or "screenshare"; camera = "camera" or default if int(t.kind) == 2: # video track (LiveKit: 1=audio, 2=video) @@ -679,15 +692,16 @@ class VoiceSession: logger.info("Video track stored from %s source=%s for on-demand vision", p.identity, track_source) if int(t.kind) in (1, 2) and e2ee_opts is not None: # audio + video tracks caller_id = p.identity - logger.info("E2EE_DIAG: track_subscribed for %s, have %d caller keys", - caller_id, len(self._caller_all_keys)) + track_type = "video" if int(t.kind) == 2 else "audio" + logger.info("E2EE_DIAG: track_subscribed %s for %s, have %d caller keys", + track_type, caller_id, len(self._caller_all_keys)) try: kp_local = self.lk_room.e2ee_manager.key_provider if self._caller_all_keys: for idx, base_k in sorted(self._caller_all_keys.items()): - ok = kp_local.set_key(caller_id, base_k, idx) - logger.info("on_ts: set_key[%d] for %s (ok=%s, %d bytes, raw=%s)", - idx, caller_id, ok, len(base_k), base_k.hex()) + _derive_and_set_key(kp_local, caller_id, base_k, idx) + logger.info("on_ts: derived+set key[%d] for %s (%s track)", + idx, caller_id, track_type) else: logger.warning("on_ts: no caller keys yet — scheduling 0.5s retry") async def _brief_key_retry(pid=caller_id): @@ -696,8 +710,8 @@ class VoiceSession: try: kp_r = self.lk_room.e2ee_manager.key_provider for idx, base_k in sorted(self._caller_all_keys.items()): - ok = kp_r.set_key(pid, base_k, idx) - logger.info("on_ts_retry: set_key[%d] for %s (ok=%s)", idx, pid, ok) + _derive_and_set_key(kp_r, pid, base_k, idx) + logger.info("on_ts_retry: derived+set key[%d] for %s", idx, pid) except Exception as exc: logger.warning("on_ts_retry: set_key failed: %s", exc) else: @@ -718,21 +732,21 @@ class VoiceSession: try: kp_e = self.lk_room.e2ee_manager.key_provider for idx, base_k in sorted(self._caller_all_keys.items()): - ok = kp_e.set_key(p_id, base_k, idx) - logger.info("e2ee_state set_key[%d] for %s (ok=%s)", idx, p_id, ok) + _derive_and_set_key(kp_e, p_id, base_k, idx) + logger.info("e2ee_state: derived+set key[%d] for %s on %s", + idx, p_id, state_name) except Exception as exc: logger.warning("e2ee_state set_key failed: %s", exc) await self.lk_room.connect(self.lk_url, jwt, options=room_opts) - logger.info("Connected (E2EE=HKDF), remote=%d", + logger.info("Connected (E2EE=Python-HKDF+RAW), remote=%d", len(self.lk_room.remote_participants)) # Set bot's own key immediately after connect — local frame cryptor exists at connect time. - # CALLER keys are set in on_track_subscribed (NOT here) because the caller's frame cryptor - # is only created when their track arrives. Calling set_key() before that skips HKDF. + # Pre-derive via HKDF in Python since KDF_RAW is set (no Rust-side derivation). kp = self.lk_room.e2ee_manager.key_provider - ok = kp.set_key(bot_identity, self._bot_key, 0) - logger.info("Set bot key for %s (ok=%s, %d bytes)", bot_identity, ok, len(self._bot_key)) + _derive_and_set_key(kp, bot_identity, self._bot_key, 0) + logger.info("Set bot derived key for %s (%d raw bytes)", bot_identity, len(self._bot_key)) # Element Call rotates its key when bot joins. Wait up to 3s for the # rotated key to arrive via nio sync before proceeding. If it arrives, @@ -768,18 +782,14 @@ class VoiceSession: if remote_identity: break - # Set shared_key with pre-derived AES key for caller decryption. - # NOT using set_key() for caller — Rust HKDF may produce different result than EC's JS HKDF. - # set_shared_key() stores key raw (no KDF applied) — we pre-derive in Python. - # After key rotation wait: if track already subscribed, set rotated key. - # (Usually on_track_subscribed handles this, but if track arrived before rotation, - # the rotated key needs to be set here for the already-subscribed participant.) + # Post-rotation: set caller keys with Python-derived HKDF (MAT-144). + # If track already subscribed, on_track_subscribed already set keys. + # This catches the case where track arrived before key rotation completed. if self._caller_all_keys and remote_identity: try: for idx, base_k in sorted(self._caller_all_keys.items()): - ok = kp.set_key(remote_identity, base_k, idx) - logger.info("Post-rotation set_key[%d] for %s (ok=%s)", - idx, remote_identity, ok) + _derive_and_set_key(kp, remote_identity, base_k, idx) + logger.info("Post-rotation derived+set key[%d] for %s", idx, remote_identity) except Exception as e: logger.warning("Post-rotation set_key failed: %s", e) elif not self._caller_all_keys: @@ -1169,6 +1179,11 @@ class VoiceSession: buf = io.BytesIO() img.convert("RGB").save(buf, format="JPEG", quality=85) img_b64 = base64.b64encode(buf.getvalue()).decode() + if rgba.width <= 16 or rgba.height <= 16: + logger.warning("LOOK_AT_SCREEN: frame %dx%d — E2EE decryption likely failed (garbage frame)", + rgba.width, rgba.height) + return ("E2EE Video-Entschluesselung fehlgeschlagen — das Bild ist nur " + f"{rgba.width}x{rgba.height} Pixel. Bitte Bildschirmfreigabe neu starten.") logger.info("LOOK_AT_SCREEN: captured %dx%d frame (%d KB JPEG)", rgba.width, rgba.height, len(buf.getvalue()) // 1024) @@ -1305,9 +1320,9 @@ class VoiceSession: kp_w = self.lk_room.e2ee_manager.key_provider for p in self.lk_room.remote_participants.values(): for idx, base_k in sorted(self._caller_all_keys.items()): - ok = kp_w.set_key(p.identity, base_k, idx) - logger.info("VAD_WATCHDOG: recovery set_key[%d] for %s (ok=%s)", - idx, p.identity, ok) + _derive_and_set_key(kp_w, p.identity, base_k, idx) + logger.info("VAD_WATCHDOG: recovery derived+set key[%d] for %s", + idx, p.identity) except Exception as exc: logger.warning("VAD_WATCHDOG: recovery set_key failed: %s", exc) _vad_state_log["away_since"] = None # only warn once per stuck period