fix: link AgentSession to remote participant + debug speech events

- Pass participant_identity via RoomOptions so AgentSession knows
  which audio track to consume (it was silently ignoring user audio)
- Add user_speech_committed / agent_speech_committed event handlers that log USER_SPEECH / AGENT_SPEECH lines for debugging
- Replace the open-ended greeting instruction with an exact phrase to prevent hallucination
- Use httpx for room state scan (nio API was unreliable)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-21 15:03:24 +02:00
parent 85df4b295f
commit 75970fc06b

View File

@@ -10,7 +10,7 @@ import os
import aiohttp import aiohttp
from livekit import rtc, api as lkapi from livekit import rtc, api as lkapi
from livekit.agents import Agent, AgentSession from livekit.agents import Agent, AgentSession, room_io
from livekit.plugins import openai as lk_openai, elevenlabs, silero from livekit.plugins import openai as lk_openai, elevenlabs, silero
logger = logging.getLogger("matrix-ai-voice") logger = logging.getLogger("matrix-ai-voice")
@@ -159,6 +159,13 @@ class VoiceSession:
"HKDF" if self._e2ee_key else "off", "HKDF" if self._e2ee_key else "off",
len(self.lk_room.remote_participants)) len(self.lk_room.remote_participants))
# Find the remote participant to link to
remote_identity = None
for p in self.lk_room.remote_participants.values():
remote_identity = p.identity
logger.info("Linking to remote participant: %s", remote_identity)
break
# Voice pipeline — German male voice (Daniel) # Voice pipeline — German male voice (Daniel)
self._http_session = aiohttp.ClientSession() self._http_session = aiohttp.ClientSession()
voice_id = os.environ.get("ELEVENLABS_VOICE_ID", DEFAULT_VOICE_ID) voice_id = os.environ.get("ELEVENLABS_VOICE_ID", DEFAULT_VOICE_ID)
@@ -169,14 +176,31 @@ class VoiceSession:
api_key=ELEVENLABS_KEY, http_session=self._http_session), api_key=ELEVENLABS_KEY, http_session=self._http_session),
vad=_get_vad(), vad=_get_vad(),
) )
# Debug: log speech events
@self.session.on("user_speech_committed")
def _on_user_speech(msg):
logger.info("USER_SPEECH: %s", msg.text_content)
@self.session.on("agent_speech_committed")
def _on_agent_speech(msg):
logger.info("AGENT_SPEECH: %s", msg.text_content)
agent = Agent(instructions=VOICE_PROMPT) agent = Agent(instructions=VOICE_PROMPT)
await self.session.start(agent=agent, room=self.lk_room) room_opts = room_io.RoomOptions(
logger.info("Voice pipeline started (voice=%s)", voice_id) participant_identity=remote_identity,
) if remote_identity else None
await self.session.start(
agent=agent,
room=self.lk_room,
room_options=room_opts,
)
logger.info("Voice pipeline started (voice=%s, linked_to=%s)", voice_id, remote_identity)
try: try:
await asyncio.wait_for( await asyncio.wait_for(
self.session.generate_reply( self.session.generate_reply(
instructions="Begruesse den Nutzer kurz auf Deutsch."), instructions="Sage nur: Hallo, wie kann ich helfen?"),
timeout=30.0) timeout=30.0)
logger.info("Greeting sent") logger.info("Greeting sent")
except asyncio.TimeoutError: except asyncio.TimeoutError: