fix(e2ee): pre-derive HKDF keys in Python instead of Rust FFI (MAT-144)
Switch from Rust-side HKDF (KDF_HKDF=1) to Python-side HKDF derivation with raw key mode (KDF_RAW=0). This eliminates potential HKDF implementation mismatches between the Rust FFI and Element Call JS that caused video frame decryption failures (audio worked, but video showed 8x8 garbage frames).

Changes:
- Add _derive_and_set_key() helper that pre-derives the key via HKDF, then calls set_key()
- Set key_derivation_function=KDF_RAW (proto 0 = no Rust-side derivation)
- Replace all direct set_key() calls with _derive_and_set_key()
- Add per-track diagnostic logging (audio vs. video)
- Add a frame size check in look_at_screen (detects E2EE failure early)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
105
voice.py
105
voice.py
@@ -190,10 +190,10 @@ KDF_HKDF = 1 # Rust FFI applies HKDF internally (proto enum value 1)
|
||||
|
||||
|
||||
def _hkdf_derive(ikm: bytes) -> bytes:
|
||||
"""Pre-derive AES key via HKDF-SHA256 matching livekit-client-sdk-js deriveEncryptionKey().
|
||||
"""Pre-derive AES-128 key via HKDF-SHA256, matching Element Call JS deriveEncryptionKey().
|
||||
|
||||
JS params: hash=SHA-256, salt=encode("LKFrameEncryptionKey"), info=ArrayBuffer(128), length=128bit
|
||||
We set this pre-derived key via set_shared_key() which bypasses Rust FFI KDF entirely.
|
||||
Result is passed to set_key() with KDF_RAW so Rust uses it as-is (no double-derivation).
|
||||
"""
|
||||
import hmac
|
||||
salt = b"LKFrameEncryptionKey"
|
||||
@@ -208,7 +208,7 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]:
|
||||
|
||||
EC JS ratchet: new_raw = HMAC(key=current_raw, data="LKFrameEncryptionKey")[:16]
|
||||
Returns {index: raw_key} for all indices 0..count-1.
|
||||
Set these via set_key(identity, raw, index) with KDF_HKDF=1 so Rust applies HKDF.
|
||||
Each raw key is then HKDF-derived before passing to set_key() with KDF_RAW.
|
||||
"""
|
||||
import hmac as _hmac
|
||||
keys = {}
|
||||
@@ -219,6 +219,20 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]:
|
||||
return keys
|
||||
|
||||
|
||||
def _derive_and_set_key(kp, identity: str, raw_key: bytes, index: int) -> None:
    """Pre-derive HKDF key in Python, then set via KeyProvider (MAT-144).

    This replaces direct set_key(identity, raw_key, index) calls.
    Pre-derivation in Python ensures exact match with Element Call JS HKDF,
    eliminating Rust FFI HKDF implementation differences that caused
    video frame decryption failures (audio worked, video didn't).

    Args:
        kp: Key provider object exposing ``set_key(identity, key, index)``.
        identity: Participant identity the key belongs to.
        raw_key: Raw (not yet derived) key material; it is passed through
            ``_hkdf_derive()`` before being handed to the key provider.
        index: Key index to store the derived key under.
    """
    derived = _hkdf_derive(raw_key)
    ok = kp.set_key(identity, derived, index)
    # Security: only log short prefixes of key material. The derived value is
    # the key handed to the key provider, so writing it out in full would leak
    # the working encryption key into the logs. A prefix is enough to
    # correlate keys across log lines.
    logger.debug("set_key[%d] %s: raw=%s → derived=%s (ok=%s)",
                 index, identity, raw_key.hex()[:8], derived.hex()[:8], ok)
|
||||
|
||||
|
||||
async def _brave_search(query: str, count: int = 5) -> str:
|
||||
"""Call Brave Search API and return formatted results."""
|
||||
if not BRAVE_API_KEY:
|
||||
@@ -429,21 +443,24 @@ async def _confluence_recent_pages(limit: int = 5) -> list[dict]:
|
||||
return results
|
||||
|
||||
|
||||
def _build_e2ee_options() -> rtc.E2EEOptions:
|
||||
"""Build E2EE options — let Rust FFI apply HKDF internally (KDF_HKDF=1).
|
||||
KDF_RAW = 0 # proto enum value 0 is labelled PBKDF2, but the Rust side returns None for it (i.e. no KDF is applied and the key is used raw)
|
||||
|
||||
Pass raw base keys from Matrix key exchange events directly to set_key().
|
||||
The Rust FFI derives the AES frame key via HKDF(base_key, ratchetSalt, ...) internally.
|
||||
Element Call uses: ratchetWindowSize=10, keyringSize=256, ratchetSalt="LKFrameEncryptionKey"
|
||||
NOTE: proto value 0 = PBKDF2 (not raw/none) — must use KDF_HKDF=1.
|
||||
|
||||
def _build_e2ee_options() -> rtc.E2EEOptions:
|
||||
"""Build E2EE options — Python pre-derives HKDF keys, Rust uses them raw (MAT-144).
|
||||
|
||||
We pre-derive AES keys via _hkdf_derive() in Python (matching EC's JS deriveEncryptionKey),
|
||||
then pass derived keys to set_key() with KDF_RAW (proto 0 = no Rust-side derivation).
|
||||
This eliminates any HKDF implementation mismatch between Rust FFI and Element Call JS.
|
||||
Ratcheted keys are also pre-derived in Python via _ratchet_keys().
|
||||
"""
|
||||
key_opts = rtc.KeyProviderOptions(
|
||||
shared_key=b"", # empty = per-participant mode
|
||||
ratchet_window_size=10,
|
||||
ratchet_window_size=0, # we handle ratcheting in Python
|
||||
ratchet_salt=b"LKFrameEncryptionKey",
|
||||
failure_tolerance=10,
|
||||
key_ring_size=256,
|
||||
key_derivation_function=KDF_HKDF, # Rust FFI applies HKDF; we pass raw base keys
|
||||
key_derivation_function=KDF_RAW, # no Rust-side KDF; keys are pre-derived in Python
|
||||
)
|
||||
return rtc.E2EEOptions(
|
||||
encryption_type=rtc.EncryptionType.GCM,
|
||||
@@ -504,9 +521,8 @@ class VoiceSession:
|
||||
if has_subscribed:
|
||||
try:
|
||||
kp = self.lk_room.e2ee_manager.key_provider
|
||||
ok = kp.set_key(p.identity, key, index)
|
||||
logger.info("Late key set_key[%d] for %s (ok=%s)",
|
||||
index, p.identity, ok)
|
||||
_derive_and_set_key(kp, p.identity, key, index)
|
||||
logger.info("Late key[%d] set for %s", index, p.identity)
|
||||
except Exception as e:
|
||||
logger.warning("Late key set_key failed: %s", e)
|
||||
break
|
||||
@@ -666,11 +682,8 @@ class VoiceSession:
|
||||
# NOTE: Do NOT create rtc.AudioStream here — it competes with AgentSession's
|
||||
# internal audio pipeline for event loop time, causing intermittent VAD failures
|
||||
# (user_state stuck on "away"). See MAT-40. Use e2ee_state_changed for flow confirmation.
|
||||
# *** KEY FIX: set_key() with KDF_HKDF only applies HKDF when the frame cryptor
|
||||
# for this participant already exists. The frame cryptor is created at track
|
||||
# subscription time. Calling set_key() BEFORE track subscription (at connect)
|
||||
# skips HKDF derivation → raw key stored → DEC_FAILED.
|
||||
# Solution: set caller key HERE, after frame cryptor is initialized.
|
||||
# MAT-144: Pre-derive HKDF in Python, pass derived key with KDF_RAW.
|
||||
# This ensures exact HKDF match with Element Call JS for both audio AND video.
|
||||
# Store video track for on-demand vision (look_at_screen tool)
|
||||
# Screen share = source "screen_share" or "screenshare"; camera = "camera" or default
|
||||
if int(t.kind) == 2: # video track (LiveKit: 1=audio, 2=video)
|
||||
@@ -679,15 +692,16 @@ class VoiceSession:
|
||||
logger.info("Video track stored from %s source=%s for on-demand vision", p.identity, track_source)
|
||||
if int(t.kind) in (1, 2) and e2ee_opts is not None: # audio + video tracks
|
||||
caller_id = p.identity
|
||||
logger.info("E2EE_DIAG: track_subscribed for %s, have %d caller keys",
|
||||
caller_id, len(self._caller_all_keys))
|
||||
track_type = "video" if int(t.kind) == 2 else "audio"
|
||||
logger.info("E2EE_DIAG: track_subscribed %s for %s, have %d caller keys",
|
||||
track_type, caller_id, len(self._caller_all_keys))
|
||||
try:
|
||||
kp_local = self.lk_room.e2ee_manager.key_provider
|
||||
if self._caller_all_keys:
|
||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||
ok = kp_local.set_key(caller_id, base_k, idx)
|
||||
logger.info("on_ts: set_key[%d] for %s (ok=%s, %d bytes, raw=%s)",
|
||||
idx, caller_id, ok, len(base_k), base_k.hex())
|
||||
_derive_and_set_key(kp_local, caller_id, base_k, idx)
|
||||
logger.info("on_ts: derived+set key[%d] for %s (%s track)",
|
||||
idx, caller_id, track_type)
|
||||
else:
|
||||
logger.warning("on_ts: no caller keys yet — scheduling 0.5s retry")
|
||||
async def _brief_key_retry(pid=caller_id):
|
||||
@@ -696,8 +710,8 @@ class VoiceSession:
|
||||
try:
|
||||
kp_r = self.lk_room.e2ee_manager.key_provider
|
||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||
ok = kp_r.set_key(pid, base_k, idx)
|
||||
logger.info("on_ts_retry: set_key[%d] for %s (ok=%s)", idx, pid, ok)
|
||||
_derive_and_set_key(kp_r, pid, base_k, idx)
|
||||
logger.info("on_ts_retry: derived+set key[%d] for %s", idx, pid)
|
||||
except Exception as exc:
|
||||
logger.warning("on_ts_retry: set_key failed: %s", exc)
|
||||
else:
|
||||
@@ -718,21 +732,21 @@ class VoiceSession:
|
||||
try:
|
||||
kp_e = self.lk_room.e2ee_manager.key_provider
|
||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||
ok = kp_e.set_key(p_id, base_k, idx)
|
||||
logger.info("e2ee_state set_key[%d] for %s (ok=%s)", idx, p_id, ok)
|
||||
_derive_and_set_key(kp_e, p_id, base_k, idx)
|
||||
logger.info("e2ee_state: derived+set key[%d] for %s on %s",
|
||||
idx, p_id, state_name)
|
||||
except Exception as exc:
|
||||
logger.warning("e2ee_state set_key failed: %s", exc)
|
||||
|
||||
await self.lk_room.connect(self.lk_url, jwt, options=room_opts)
|
||||
logger.info("Connected (E2EE=HKDF), remote=%d",
|
||||
logger.info("Connected (E2EE=Python-HKDF+RAW), remote=%d",
|
||||
len(self.lk_room.remote_participants))
|
||||
|
||||
# Set bot's own key immediately after connect — local frame cryptor exists at connect time.
|
||||
# CALLER keys are set in on_track_subscribed (NOT here) because the caller's frame cryptor
|
||||
# is only created when their track arrives. Calling set_key() before that skips HKDF.
|
||||
# Pre-derive via HKDF in Python since KDF_RAW is set (no Rust-side derivation).
|
||||
kp = self.lk_room.e2ee_manager.key_provider
|
||||
ok = kp.set_key(bot_identity, self._bot_key, 0)
|
||||
logger.info("Set bot key for %s (ok=%s, %d bytes)", bot_identity, ok, len(self._bot_key))
|
||||
_derive_and_set_key(kp, bot_identity, self._bot_key, 0)
|
||||
logger.info("Set bot derived key for %s (%d raw bytes)", bot_identity, len(self._bot_key))
|
||||
|
||||
# Element Call rotates its key when bot joins. Wait up to 3s for the
|
||||
# rotated key to arrive via nio sync before proceeding. If it arrives,
|
||||
@@ -768,18 +782,14 @@ class VoiceSession:
|
||||
if remote_identity:
|
||||
break
|
||||
|
||||
# Set shared_key with pre-derived AES key for caller decryption.
|
||||
# NOT using set_key() for caller — Rust HKDF may produce different result than EC's JS HKDF.
|
||||
# set_shared_key() stores key raw (no KDF applied) — we pre-derive in Python.
|
||||
# After key rotation wait: if track already subscribed, set rotated key.
|
||||
# (Usually on_track_subscribed handles this, but if track arrived before rotation,
|
||||
# the rotated key needs to be set here for the already-subscribed participant.)
|
||||
# Post-rotation: set caller keys with Python-derived HKDF (MAT-144).
|
||||
# If track already subscribed, on_track_subscribed already set keys.
|
||||
# This catches the case where track arrived before key rotation completed.
|
||||
if self._caller_all_keys and remote_identity:
|
||||
try:
|
||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||
ok = kp.set_key(remote_identity, base_k, idx)
|
||||
logger.info("Post-rotation set_key[%d] for %s (ok=%s)",
|
||||
idx, remote_identity, ok)
|
||||
_derive_and_set_key(kp, remote_identity, base_k, idx)
|
||||
logger.info("Post-rotation derived+set key[%d] for %s", idx, remote_identity)
|
||||
except Exception as e:
|
||||
logger.warning("Post-rotation set_key failed: %s", e)
|
||||
elif not self._caller_all_keys:
|
||||
@@ -1169,6 +1179,11 @@ class VoiceSession:
|
||||
buf = io.BytesIO()
|
||||
img.convert("RGB").save(buf, format="JPEG", quality=85)
|
||||
img_b64 = base64.b64encode(buf.getvalue()).decode()
|
||||
if rgba.width <= 16 or rgba.height <= 16:
|
||||
logger.warning("LOOK_AT_SCREEN: frame %dx%d — E2EE decryption likely failed (garbage frame)",
|
||||
rgba.width, rgba.height)
|
||||
return ("E2EE Video-Entschluesselung fehlgeschlagen — das Bild ist nur "
|
||||
f"{rgba.width}x{rgba.height} Pixel. Bitte Bildschirmfreigabe neu starten.")
|
||||
logger.info("LOOK_AT_SCREEN: captured %dx%d frame (%d KB JPEG)",
|
||||
rgba.width, rgba.height, len(buf.getvalue()) // 1024)
|
||||
|
||||
@@ -1305,9 +1320,9 @@ class VoiceSession:
|
||||
kp_w = self.lk_room.e2ee_manager.key_provider
|
||||
for p in self.lk_room.remote_participants.values():
|
||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||
ok = kp_w.set_key(p.identity, base_k, idx)
|
||||
logger.info("VAD_WATCHDOG: recovery set_key[%d] for %s (ok=%s)",
|
||||
idx, p.identity, ok)
|
||||
_derive_and_set_key(kp_w, p.identity, base_k, idx)
|
||||
logger.info("VAD_WATCHDOG: recovery derived+set key[%d] for %s",
|
||||
idx, p.identity)
|
||||
except Exception as exc:
|
||||
logger.warning("VAD_WATCHDOG: recovery set_key failed: %s", exc)
|
||||
_vad_state_log["away_since"] = None # only warn once per stuck period
|
||||
|
||||
Reference in New Issue
Block a user