fix(e2ee): pre-derive HKDF keys in Python instead of Rust FFI (MAT-144)
Switch from Rust-side HKDF (KDF_HKDF=1) to Python-side HKDF derivation with raw key mode (KDF_RAW=0). This eliminates potential HKDF implementation mismatches between the Rust FFI and Element Call JS that caused video frame decryption failures (audio worked, video showed 8x8 garbage frames).

Changes:
- Add _derive_and_set_key() helper that pre-derives the key via HKDF, then calls set_key()
- Set key_derivation_function=KDF_RAW (proto 0 = no Rust-side derivation)
- Replace all direct set_key() calls with _derive_and_set_key()
- Add per-track diagnostic logging (audio vs. video)
- Add frame size check in look_at_screen (detect E2EE failure early)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
105
voice.py
105
voice.py
@@ -190,10 +190,10 @@ KDF_HKDF = 1 # Rust FFI applies HKDF internally (proto enum value 1)
|
|||||||
|
|
||||||
|
|
||||||
def _hkdf_derive(ikm: bytes) -> bytes:
|
def _hkdf_derive(ikm: bytes) -> bytes:
|
||||||
"""Pre-derive AES key via HKDF-SHA256 matching livekit-client-sdk-js deriveEncryptionKey().
|
"""Pre-derive AES-128 key via HKDF-SHA256, matching Element Call JS deriveEncryptionKey().
|
||||||
|
|
||||||
JS params: hash=SHA-256, salt=encode("LKFrameEncryptionKey"), info=ArrayBuffer(128), length=128bit
|
JS params: hash=SHA-256, salt=encode("LKFrameEncryptionKey"), info=ArrayBuffer(128), length=128bit
|
||||||
We set this pre-derived key via set_shared_key() which bypasses Rust FFI KDF entirely.
|
Result is passed to set_key() with KDF_RAW so Rust uses it as-is (no double-derivation).
|
||||||
"""
|
"""
|
||||||
import hmac
|
import hmac
|
||||||
salt = b"LKFrameEncryptionKey"
|
salt = b"LKFrameEncryptionKey"
|
||||||
@@ -208,7 +208,7 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]:
|
|||||||
|
|
||||||
EC JS ratchet: new_raw = HMAC(key=current_raw, data="LKFrameEncryptionKey")[:16]
|
EC JS ratchet: new_raw = HMAC(key=current_raw, data="LKFrameEncryptionKey")[:16]
|
||||||
Returns {index: raw_key} for all indices 0..count-1.
|
Returns {index: raw_key} for all indices 0..count-1.
|
||||||
Set these via set_key(identity, raw, index) with KDF_HKDF=1 so Rust applies HKDF.
|
Each raw key is then HKDF-derived before passing to set_key() with KDF_RAW.
|
||||||
"""
|
"""
|
||||||
import hmac as _hmac
|
import hmac as _hmac
|
||||||
keys = {}
|
keys = {}
|
||||||
@@ -219,6 +219,20 @@ def _ratchet_keys(base_raw: bytes, count: int = 6) -> dict[int, bytes]:
|
|||||||
return keys
|
return keys
|
||||||
|
|
||||||
|
|
||||||
|
def _derive_and_set_key(kp, identity: str, raw_key: bytes, index: int) -> None:
    """Pre-derive the HKDF key in Python, then set it via the KeyProvider (MAT-144).

    This replaces direct set_key(identity, raw_key, index) calls.
    Pre-derivation in Python ensures an exact match with Element Call JS HKDF,
    eliminating Rust FFI HKDF implementation differences that caused
    video frame decryption failures (audio worked, video didn't).

    Args:
        kp: Key provider object exposing ``set_key(identity, key, index)``.
        identity: Participant identity the key belongs to.
        raw_key: Raw (not yet derived) base key; it is run through
            ``_hkdf_derive()`` before being handed to ``kp``.
        index: Key index forwarded unchanged to ``set_key()``.

    Returns:
        None. The value returned by ``kp.set_key()`` is only logged.
        NOTE(review): whether ``set_key()`` returns a meaningful status is
        not visible from here — confirm before branching on it.
    """
    derived = _hkdf_derive(raw_key)
    ok = kp.set_key(identity, derived, index)
    # Log only short prefixes of both keys: the derived value is the frame key
    # used as-is for decryption, so writing it out in full would leak E2EE
    # key material into the debug logs. A prefix is enough to correlate keys.
    logger.debug("set_key[%d] %s: raw=%s → derived=%s (ok=%s)",
                 index, identity, raw_key.hex()[:8], derived.hex()[:8], ok)
|
||||||
|
|
||||||
|
|
||||||
async def _brave_search(query: str, count: int = 5) -> str:
|
async def _brave_search(query: str, count: int = 5) -> str:
|
||||||
"""Call Brave Search API and return formatted results."""
|
"""Call Brave Search API and return formatted results."""
|
||||||
if not BRAVE_API_KEY:
|
if not BRAVE_API_KEY:
|
||||||
@@ -429,21 +443,24 @@ async def _confluence_recent_pages(limit: int = 5) -> list[dict]:
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def _build_e2ee_options() -> rtc.E2EEOptions:
|
KDF_RAW = 0 # proto value 0 = PBKDF2 in proto, but Rust returns None (= no KDF = raw key)
|
||||||
"""Build E2EE options — let Rust FFI apply HKDF internally (KDF_HKDF=1).
|
|
||||||
|
|
||||||
Pass raw base keys from Matrix key exchange events directly to set_key().
|
|
||||||
The Rust FFI derives the AES frame key via HKDF(base_key, ratchetSalt, ...) internally.
|
def _build_e2ee_options() -> rtc.E2EEOptions:
|
||||||
Element Call uses: ratchetWindowSize=10, keyringSize=256, ratchetSalt="LKFrameEncryptionKey"
|
"""Build E2EE options — Python pre-derives HKDF keys, Rust uses them raw (MAT-144).
|
||||||
NOTE: proto value 0 = PBKDF2 (not raw/none) — must use KDF_HKDF=1.
|
|
||||||
|
We pre-derive AES keys via _hkdf_derive() in Python (matching EC's JS deriveEncryptionKey),
|
||||||
|
then pass derived keys to set_key() with KDF_RAW (proto 0 = no Rust-side derivation).
|
||||||
|
This eliminates any HKDF implementation mismatch between Rust FFI and Element Call JS.
|
||||||
|
Ratcheted keys are also pre-derived in Python via _ratchet_keys().
|
||||||
"""
|
"""
|
||||||
key_opts = rtc.KeyProviderOptions(
|
key_opts = rtc.KeyProviderOptions(
|
||||||
shared_key=b"", # empty = per-participant mode
|
shared_key=b"", # empty = per-participant mode
|
||||||
ratchet_window_size=10,
|
ratchet_window_size=0, # we handle ratcheting in Python
|
||||||
ratchet_salt=b"LKFrameEncryptionKey",
|
ratchet_salt=b"LKFrameEncryptionKey",
|
||||||
failure_tolerance=10,
|
failure_tolerance=10,
|
||||||
key_ring_size=256,
|
key_ring_size=256,
|
||||||
key_derivation_function=KDF_HKDF, # Rust FFI applies HKDF; we pass raw base keys
|
key_derivation_function=KDF_RAW, # no Rust-side KDF; keys are pre-derived in Python
|
||||||
)
|
)
|
||||||
return rtc.E2EEOptions(
|
return rtc.E2EEOptions(
|
||||||
encryption_type=rtc.EncryptionType.GCM,
|
encryption_type=rtc.EncryptionType.GCM,
|
||||||
@@ -504,9 +521,8 @@ class VoiceSession:
|
|||||||
if has_subscribed:
|
if has_subscribed:
|
||||||
try:
|
try:
|
||||||
kp = self.lk_room.e2ee_manager.key_provider
|
kp = self.lk_room.e2ee_manager.key_provider
|
||||||
ok = kp.set_key(p.identity, key, index)
|
_derive_and_set_key(kp, p.identity, key, index)
|
||||||
logger.info("Late key set_key[%d] for %s (ok=%s)",
|
logger.info("Late key[%d] set for %s", index, p.identity)
|
||||||
index, p.identity, ok)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Late key set_key failed: %s", e)
|
logger.warning("Late key set_key failed: %s", e)
|
||||||
break
|
break
|
||||||
@@ -666,11 +682,8 @@ class VoiceSession:
|
|||||||
# NOTE: Do NOT create rtc.AudioStream here — it competes with AgentSession's
|
# NOTE: Do NOT create rtc.AudioStream here — it competes with AgentSession's
|
||||||
# internal audio pipeline for event loop time, causing intermittent VAD failures
|
# internal audio pipeline for event loop time, causing intermittent VAD failures
|
||||||
# (user_state stuck on "away"). See MAT-40. Use e2ee_state_changed for flow confirmation.
|
# (user_state stuck on "away"). See MAT-40. Use e2ee_state_changed for flow confirmation.
|
||||||
# *** KEY FIX: set_key() with KDF_HKDF only applies HKDF when the frame cryptor
|
# MAT-144: Pre-derive HKDF in Python, pass derived key with KDF_RAW.
|
||||||
# for this participant already exists. The frame cryptor is created at track
|
# This ensures exact HKDF match with Element Call JS for both audio AND video.
|
||||||
# subscription time. Calling set_key() BEFORE track subscription (at connect)
|
|
||||||
# skips HKDF derivation → raw key stored → DEC_FAILED.
|
|
||||||
# Solution: set caller key HERE, after frame cryptor is initialized.
|
|
||||||
# Store video track for on-demand vision (look_at_screen tool)
|
# Store video track for on-demand vision (look_at_screen tool)
|
||||||
# Screen share = source "screen_share" or "screenshare"; camera = "camera" or default
|
# Screen share = source "screen_share" or "screenshare"; camera = "camera" or default
|
||||||
if int(t.kind) == 2: # video track (LiveKit: 1=audio, 2=video)
|
if int(t.kind) == 2: # video track (LiveKit: 1=audio, 2=video)
|
||||||
@@ -679,15 +692,16 @@ class VoiceSession:
|
|||||||
logger.info("Video track stored from %s source=%s for on-demand vision", p.identity, track_source)
|
logger.info("Video track stored from %s source=%s for on-demand vision", p.identity, track_source)
|
||||||
if int(t.kind) in (1, 2) and e2ee_opts is not None: # audio + video tracks
|
if int(t.kind) in (1, 2) and e2ee_opts is not None: # audio + video tracks
|
||||||
caller_id = p.identity
|
caller_id = p.identity
|
||||||
logger.info("E2EE_DIAG: track_subscribed for %s, have %d caller keys",
|
track_type = "video" if int(t.kind) == 2 else "audio"
|
||||||
caller_id, len(self._caller_all_keys))
|
logger.info("E2EE_DIAG: track_subscribed %s for %s, have %d caller keys",
|
||||||
|
track_type, caller_id, len(self._caller_all_keys))
|
||||||
try:
|
try:
|
||||||
kp_local = self.lk_room.e2ee_manager.key_provider
|
kp_local = self.lk_room.e2ee_manager.key_provider
|
||||||
if self._caller_all_keys:
|
if self._caller_all_keys:
|
||||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||||
ok = kp_local.set_key(caller_id, base_k, idx)
|
_derive_and_set_key(kp_local, caller_id, base_k, idx)
|
||||||
logger.info("on_ts: set_key[%d] for %s (ok=%s, %d bytes, raw=%s)",
|
logger.info("on_ts: derived+set key[%d] for %s (%s track)",
|
||||||
idx, caller_id, ok, len(base_k), base_k.hex())
|
idx, caller_id, track_type)
|
||||||
else:
|
else:
|
||||||
logger.warning("on_ts: no caller keys yet — scheduling 0.5s retry")
|
logger.warning("on_ts: no caller keys yet — scheduling 0.5s retry")
|
||||||
async def _brief_key_retry(pid=caller_id):
|
async def _brief_key_retry(pid=caller_id):
|
||||||
@@ -696,8 +710,8 @@ class VoiceSession:
|
|||||||
try:
|
try:
|
||||||
kp_r = self.lk_room.e2ee_manager.key_provider
|
kp_r = self.lk_room.e2ee_manager.key_provider
|
||||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||||
ok = kp_r.set_key(pid, base_k, idx)
|
_derive_and_set_key(kp_r, pid, base_k, idx)
|
||||||
logger.info("on_ts_retry: set_key[%d] for %s (ok=%s)", idx, pid, ok)
|
logger.info("on_ts_retry: derived+set key[%d] for %s", idx, pid)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.warning("on_ts_retry: set_key failed: %s", exc)
|
logger.warning("on_ts_retry: set_key failed: %s", exc)
|
||||||
else:
|
else:
|
||||||
@@ -718,21 +732,21 @@ class VoiceSession:
|
|||||||
try:
|
try:
|
||||||
kp_e = self.lk_room.e2ee_manager.key_provider
|
kp_e = self.lk_room.e2ee_manager.key_provider
|
||||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||||
ok = kp_e.set_key(p_id, base_k, idx)
|
_derive_and_set_key(kp_e, p_id, base_k, idx)
|
||||||
logger.info("e2ee_state set_key[%d] for %s (ok=%s)", idx, p_id, ok)
|
logger.info("e2ee_state: derived+set key[%d] for %s on %s",
|
||||||
|
idx, p_id, state_name)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.warning("e2ee_state set_key failed: %s", exc)
|
logger.warning("e2ee_state set_key failed: %s", exc)
|
||||||
|
|
||||||
await self.lk_room.connect(self.lk_url, jwt, options=room_opts)
|
await self.lk_room.connect(self.lk_url, jwt, options=room_opts)
|
||||||
logger.info("Connected (E2EE=HKDF), remote=%d",
|
logger.info("Connected (E2EE=Python-HKDF+RAW), remote=%d",
|
||||||
len(self.lk_room.remote_participants))
|
len(self.lk_room.remote_participants))
|
||||||
|
|
||||||
# Set bot's own key immediately after connect — local frame cryptor exists at connect time.
|
# Set bot's own key immediately after connect — local frame cryptor exists at connect time.
|
||||||
# CALLER keys are set in on_track_subscribed (NOT here) because the caller's frame cryptor
|
# Pre-derive via HKDF in Python since KDF_RAW is set (no Rust-side derivation).
|
||||||
# is only created when their track arrives. Calling set_key() before that skips HKDF.
|
|
||||||
kp = self.lk_room.e2ee_manager.key_provider
|
kp = self.lk_room.e2ee_manager.key_provider
|
||||||
ok = kp.set_key(bot_identity, self._bot_key, 0)
|
_derive_and_set_key(kp, bot_identity, self._bot_key, 0)
|
||||||
logger.info("Set bot key for %s (ok=%s, %d bytes)", bot_identity, ok, len(self._bot_key))
|
logger.info("Set bot derived key for %s (%d raw bytes)", bot_identity, len(self._bot_key))
|
||||||
|
|
||||||
# Element Call rotates its key when bot joins. Wait up to 3s for the
|
# Element Call rotates its key when bot joins. Wait up to 3s for the
|
||||||
# rotated key to arrive via nio sync before proceeding. If it arrives,
|
# rotated key to arrive via nio sync before proceeding. If it arrives,
|
||||||
@@ -768,18 +782,14 @@ class VoiceSession:
|
|||||||
if remote_identity:
|
if remote_identity:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Set shared_key with pre-derived AES key for caller decryption.
|
# Post-rotation: set caller keys with Python-derived HKDF (MAT-144).
|
||||||
# NOT using set_key() for caller — Rust HKDF may produce different result than EC's JS HKDF.
|
# If track already subscribed, on_track_subscribed already set keys.
|
||||||
# set_shared_key() stores key raw (no KDF applied) — we pre-derive in Python.
|
# This catches the case where track arrived before key rotation completed.
|
||||||
# After key rotation wait: if track already subscribed, set rotated key.
|
|
||||||
# (Usually on_track_subscribed handles this, but if track arrived before rotation,
|
|
||||||
# the rotated key needs to be set here for the already-subscribed participant.)
|
|
||||||
if self._caller_all_keys and remote_identity:
|
if self._caller_all_keys and remote_identity:
|
||||||
try:
|
try:
|
||||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||||
ok = kp.set_key(remote_identity, base_k, idx)
|
_derive_and_set_key(kp, remote_identity, base_k, idx)
|
||||||
logger.info("Post-rotation set_key[%d] for %s (ok=%s)",
|
logger.info("Post-rotation derived+set key[%d] for %s", idx, remote_identity)
|
||||||
idx, remote_identity, ok)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Post-rotation set_key failed: %s", e)
|
logger.warning("Post-rotation set_key failed: %s", e)
|
||||||
elif not self._caller_all_keys:
|
elif not self._caller_all_keys:
|
||||||
@@ -1169,6 +1179,11 @@ class VoiceSession:
|
|||||||
buf = io.BytesIO()
|
buf = io.BytesIO()
|
||||||
img.convert("RGB").save(buf, format="JPEG", quality=85)
|
img.convert("RGB").save(buf, format="JPEG", quality=85)
|
||||||
img_b64 = base64.b64encode(buf.getvalue()).decode()
|
img_b64 = base64.b64encode(buf.getvalue()).decode()
|
||||||
|
if rgba.width <= 16 or rgba.height <= 16:
|
||||||
|
logger.warning("LOOK_AT_SCREEN: frame %dx%d — E2EE decryption likely failed (garbage frame)",
|
||||||
|
rgba.width, rgba.height)
|
||||||
|
return ("E2EE Video-Entschluesselung fehlgeschlagen — das Bild ist nur "
|
||||||
|
f"{rgba.width}x{rgba.height} Pixel. Bitte Bildschirmfreigabe neu starten.")
|
||||||
logger.info("LOOK_AT_SCREEN: captured %dx%d frame (%d KB JPEG)",
|
logger.info("LOOK_AT_SCREEN: captured %dx%d frame (%d KB JPEG)",
|
||||||
rgba.width, rgba.height, len(buf.getvalue()) // 1024)
|
rgba.width, rgba.height, len(buf.getvalue()) // 1024)
|
||||||
|
|
||||||
@@ -1305,9 +1320,9 @@ class VoiceSession:
|
|||||||
kp_w = self.lk_room.e2ee_manager.key_provider
|
kp_w = self.lk_room.e2ee_manager.key_provider
|
||||||
for p in self.lk_room.remote_participants.values():
|
for p in self.lk_room.remote_participants.values():
|
||||||
for idx, base_k in sorted(self._caller_all_keys.items()):
|
for idx, base_k in sorted(self._caller_all_keys.items()):
|
||||||
ok = kp_w.set_key(p.identity, base_k, idx)
|
_derive_and_set_key(kp_w, p.identity, base_k, idx)
|
||||||
logger.info("VAD_WATCHDOG: recovery set_key[%d] for %s (ok=%s)",
|
logger.info("VAD_WATCHDOG: recovery derived+set key[%d] for %s",
|
||||||
idx, p.identity, ok)
|
idx, p.identity)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.warning("VAD_WATCHDOG: recovery set_key failed: %s", exc)
|
logger.warning("VAD_WATCHDOG: recovery set_key failed: %s", exc)
|
||||||
_vad_state_log["away_since"] = None # only warn once per stuck period
|
_vad_state_log["away_since"] = None # only warn once per stuck period
|
||||||
|
|||||||
Reference in New Issue
Block a user