# matrix-ai-agent/agent.py

import os
import logging

from livekit.agents import Agent, AgentSession, AgentServer, JobContext, JobProcess, cli
from livekit.plugins import openai as lk_openai, elevenlabs, silero

# Root logging is configured at DEBUG; this module logs under its own name.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("matrix-ai-agent")

# Settings for the LLM endpoint. The API key falls back to a placeholder,
# while a missing base URL raises KeyError as soon as the module is imported.
LITELLM_KEY = os.getenv("LITELLM_API_KEY", "not-needed")
LITELLM_URL = os.environ["LITELLM_BASE_URL"]

# Instructions handed to the Agent when a session starts.
SYSTEM_PROMPT = (
    "You are a helpful voice assistant in a Matrix call.\n"
    "Rules:\n"
    "- Keep answers SHORT — 1-3 sentences max\n"
    "- Be direct, no filler words\n"
    "- If the user wants more detail, they will ask\n"
    "- Speak naturally as in a conversation"
)

# Server object that the setup hook and the session entrypoint attach to.
server = AgentServer()


def prewarm(proc: JobProcess):
    """Load the Silero VAD model and keep it in the process userdata.

    Args:
        proc: Job process whose ``userdata`` mapping receives the loaded
            model under the ``"vad"`` key.
    """
    vad_model = silero.VAD.load()
    proc.userdata["vad"] = vad_model


# Register the hook above as the server's setup function.
server.setup_fnc = prewarm


@server.rtc_session(agent_name=os.environ.get("AGENT_NAME", "matrix-ai"))
async def entrypoint(ctx: JobContext):
    """Run one voice-assistant session for the room attached to a job.

    Connects to the room, builds an ``AgentSession`` (ElevenLabs STT/TTS, an
    OpenAI-compatible LLM client pointed at ``LITELLM_URL``, and the VAD
    stored in the process userdata), starts it with ``SYSTEM_PROMPT`` as the
    agent instructions, and asks for a short greeting.

    Environment variables read here:
        AGENT_NAME: name passed to ``rtc_session`` (default ``"matrix-ai"``).
        LITELLM_MODEL: model name sent to the LLM endpoint
            (default ``"claude-sonnet"``).
        ELEVENLABS_VOICE_ID: voice used for TTS
            (default ``"21m00Tcm4TlvDq8ikWAM"``).

    Args:
        ctx: Job context providing the room, the job metadata and the
            process userdata (must contain a ``"vad"`` entry).
    """
    logger.info("Job received for room %s", ctx.job.room.name)

    # Standard framework connection (handles audio pipeline properly)
    await ctx.connect()
    logger.info("Connected to room, local identity: %s", ctx.room.local_participant.identity)
    logger.info("Remote participants: %s", list(ctx.room.remote_participants.keys()))

    model = os.environ.get("LITELLM_MODEL", "claude-sonnet")
    voice_id = os.environ.get("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")

    session = AgentSession(
        stt=elevenlabs.STT(),
        llm=lk_openai.LLM(
            base_url=LITELLM_URL,
            api_key=LITELLM_KEY,
            model=model,
        ),
        tts=elevenlabs.TTS(
            voice_id=voice_id,
            model="eleven_turbo_v2_5",
        ),
        vad=ctx.proc.userdata["vad"],
    )

    # Debug: log pipeline events.
    # AgentSession reports committed user and agent turns through the
    # "conversation_item_added" event. The former "user_speech_committed" /
    # "agent_speech_committed" names come from the pre-1.0 pipeline API and
    # are not emitted by AgentSession, so handlers bound to them never ran.
    @session.on("conversation_item_added")
    def on_conversation_item(event):
        item = event.item
        # Items without a role (non-message items) are ignored.
        role = getattr(item, "role", None)
        if role == "user":
            logger.info("USER_SPEECH_COMMITTED: %s", item.text_content)
        elif role == "assistant":
            logger.info("AGENT_SPEECH_COMMITTED: %s", item.text_content)

    agent = Agent(instructions=SYSTEM_PROMPT)
    await session.start(
        agent=agent,
        room=ctx.room,
    )
    logger.info("Session started, generating greeting...")
    await session.generate_reply(instructions="Greet the user briefly.")
    logger.info("Greeting generated.")


# When executed as a script, hand the configured server to the LiveKit
# agents CLI runner.
if __name__ == "__main__":
    cli.run_app(server)