feat: activity video track (pulsing orb) for voice sessions

- ActivityVideoPublisher renders animated orb on 160x120 canvas
- Integrated into both agent.py and voice.py
- Updates confluence-collab submodule
This commit is contained in:
Christian Gick
2026-03-06 15:58:51 +00:00
parent 947699c988
commit efb976a27c
4 changed files with 199 additions and 1 deletions

161
activity_video.py Normal file
View File

@@ -0,0 +1,161 @@
"""Activity video track — pulsing orb (lightweight).
Small 160x120 canvas, only renders pixels near the orb.
LiveKit/browser upscales. Minimal CPU on both server and client.
"""
import asyncio
import math
import random
import logging
import time
import struct
from livekit.rtc import VideoSource, VideoFrame, VideoBufferType
logger = logging.getLogger("activity-video")
WIDTH = 160
HEIGHT = 120
FPS = 15
BPP = 4
CX, CY = WIDTH // 2, HEIGHT // 2
BG = (12, 12, 28)
STATE_COLORS = {
"listening": (40, 120, 255),
"thinking": (100, 60, 255),
"speaking": (30, 200, 255),
"initializing": (40, 60, 120),
}
_BG_PIXEL = struct.pack('BBBB', *BG, 255)
_BG_FRAME = _BG_PIXEL * (WIDTH * HEIGHT)
# Pre-compute distance from center — only within max possible glow radius
MAX_ORB = 45 # max orb radius at full energy
MAX_GLOW = int(MAX_ORB * 2.5) + 5
# Store sparse: list of (pixel_index, distance) for pixels within MAX_GLOW of center
_PIXELS = []
for _y in range(max(0, CY - MAX_GLOW), min(HEIGHT, CY + MAX_GLOW + 1)):
dy = _y - CY
for _x in range(max(0, CX - MAX_GLOW), min(WIDTH, CX + MAX_GLOW + 1)):
dx = _x - CX
d = math.sqrt(dx * dx + dy * dy)
if d <= MAX_GLOW:
_PIXELS.append((_y * WIDTH + _x, d))
class ActivityVideoPublisher:
    """Publishes a small animated "pulsing orb" video track.

    The orb's color and motion track the agent state supplied through
    set_state(); run() renders RGBA frames into self.source at ~FPS
    until stop() is called.
    """

    def __init__(self):
        # LiveKit source that rendered frames are captured into.
        self.source = VideoSource(WIDTH, HEIGHT)
        self._state = "initializing"
        self._stopped = False
        self._pulse = 0.0          # oscillation term driving orb size
        self._energy = 0.0         # smoothed activity level (eased toward target)
        self._target_energy = 0.0
        self._color = list(STATE_COLORS["initializing"])
        self._target_color = list(STATE_COLORS["initializing"])
        self._ring_phase = 0.0     # drives expanding rings while speaking

    def set_state(self, state: str):
        """Switch animation to *state*; unknown states fall back to the 'initializing' color."""
        if state != self._state:
            logger.info("Activity video state: %s -> %s", self._state, state)
        self._state = state
        self._target_color = list(STATE_COLORS.get(state, STATE_COLORS["initializing"]))

    def stop(self):
        """Ask the run() loop to exit after the current frame."""
        self._stopped = True

    def _update(self, now: float):
        """Advance animation parameters to elapsed time *now* (seconds)."""
        # Ease each RGB channel toward the state's target color.
        for ch in range(3):
            self._color[ch] += (self._target_color[ch] - self._color[ch]) * 0.08
        current = self._state
        if current == "speaking":
            # Jittered energy/pulse gives a lively, irregular look.
            self._target_energy = 0.9 + random.uniform(-0.1, 0.1)
            self._pulse = 0.5 * math.sin(now * 6.0) + 0.5 + random.uniform(-0.15, 0.15)
        elif current == "thinking":
            self._target_energy = 0.6
            self._pulse = 0.5 * math.sin(now * 3.0) + 0.5
        elif current == "listening":
            self._target_energy = 0.3
            self._pulse = 0.5 * math.sin(now * 1.5) + 0.5
        else:
            # initializing / unknown: dim, static orb.
            self._target_energy = 0.15
            self._pulse = 0.3
        self._energy += (self._target_energy - self._energy) * 0.12
        self._ring_phase = now

    def _render_frame(self) -> bytearray:
        """Render one RGBA frame of the orb over the background template."""
        frame = bytearray(_BG_FRAME)
        col_r, col_g, col_b = self._color
        energy = self._energy
        bg_r, bg_g, bg_b = BG
        # Orb grows with energy and breathes with the pulse.
        orb_radius = (15 + 8 * energy) + 4 * self._pulse * energy
        glow_radius = orb_radius * 2.5
        inv_orb = 1.0 / max(orb_radius, 1)
        glow_span = glow_radius - orb_radius
        inv_glow = 1.0 / max(glow_span, 1)
        # While speaking, two rings expand outward through the glow band.
        rings = ()
        if self._state == "speaking":
            rings = (
                orb_radius + ((self._ring_phase * 30) % glow_span),
                orb_radius + ((self._ring_phase * 30 + glow_span * 0.5) % glow_span),
            )
        for idx, dist in _PIXELS:
            if dist > glow_radius:
                continue
            if dist <= orb_radius:
                # Solid orb: slight radial falloff plus a white-hot core.
                frac = dist * inv_orb
                brightness = 1.0 - 0.3 * frac * frac
                white = max(0.0, 1.0 - frac * 2.5) * 0.6 * energy
                px_r = min(255, int(col_r * brightness + 255 * white))
                px_g = min(255, int(col_g * brightness + 255 * white))
                px_b = min(255, int(col_b * brightness + 255 * white))
            else:
                # Glow band: cubic falloff, optionally brightened by rings.
                frac = (dist - orb_radius) * inv_glow
                falloff = 1.0 - frac
                glow = falloff * falloff * falloff * energy * 0.5
                for ring_r in rings:
                    ring_d = abs(dist - ring_r)
                    if ring_d < 4:
                        glow += (1.0 - ring_d * 0.25) * 0.3 * falloff
                px_r = min(255, int(bg_r + col_r * glow))
                px_g = min(255, int(bg_g + col_g * glow))
                px_b = min(255, int(bg_b + col_b * glow))
            base = idx * BPP
            frame[base] = px_r
            frame[base + 1] = px_g
            frame[base + 2] = px_b
        return frame

    async def run(self):
        """Render/publish loop: pushes ~FPS frames per second until stop()."""
        logger.info("Activity video loop started (%dx%d @ %d FPS, orb mode, %d active pixels)",
                    WIDTH, HEIGHT, FPS, len(_PIXELS))
        frame_interval = 1.0 / FPS
        started = time.monotonic()
        rgba = VideoBufferType.Value('RGBA')
        while not self._stopped:
            elapsed = time.monotonic() - started
            self._update(elapsed)
            pixels = self._render_frame()
            self.source.capture_frame(VideoFrame(WIDTH, HEIGHT, rgba, pixels))
            # Subtract render time so the cadence stays close to FPS.
            spent = time.monotonic() - started - elapsed
            await asyncio.sleep(max(0.001, frame_interval - spent))
        logger.info("Activity video loop stopped")

View File

@@ -1,3 +1,4 @@
import asyncio
import os import os
import json import json
import base64 import base64
@@ -10,6 +11,7 @@ from livekit.plugins import openai as lk_openai, elevenlabs, silero
import livekit.rtc as rtc import livekit.rtc as rtc
from e2ee_patch import KDF_HKDF from e2ee_patch import KDF_HKDF
from activity_video import ActivityVideoPublisher
logger = logging.getLogger("matrix-ai-agent") logger = logging.getLogger("matrix-ai-agent")
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
@@ -103,6 +105,13 @@ async def entrypoint(ctx: JobContext):
logger.info("Connected to room, local identity: %s", ctx.room.local_participant.identity) logger.info("Connected to room, local identity: %s", ctx.room.local_participant.identity)
logger.info("Remote participants: %s", list(ctx.room.remote_participants.keys())) logger.info("Remote participants: %s", list(ctx.room.remote_participants.keys()))
# Publish activity video track (pulsing orb)
activity_video = ActivityVideoPublisher()
video_track = rtc.LocalVideoTrack.create_video_track("activity", activity_video.source)
await ctx.room.local_participant.publish_track(video_track)
activity_task = asyncio.create_task(activity_video.run())
logger.info("Activity video track published")
model = os.environ.get("LITELLM_MODEL", "claude-sonnet") model = os.environ.get("LITELLM_MODEL", "claude-sonnet")
voice_id = os.environ.get("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM") voice_id = os.environ.get("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
@@ -120,6 +129,10 @@ async def entrypoint(ctx: JobContext):
vad=ctx.proc.userdata["vad"], vad=ctx.proc.userdata["vad"],
) )
@session.on("agent_state_changed")
def on_state_changed(ev):
activity_video.set_state(ev.new_state)
@session.on("user_speech_committed") @session.on("user_speech_committed")
def on_speech(msg): def on_speech(msg):
logger.info("USER_SPEECH_COMMITTED: %s", msg.text_content) logger.info("USER_SPEECH_COMMITTED: %s", msg.text_content)

View File

@@ -18,6 +18,7 @@ import aiohttp
import httpx import httpx
from livekit import rtc, api as lkapi from livekit import rtc, api as lkapi
from livekit.agents import Agent, AgentSession, StopResponse, function_tool, room_io, llm from livekit.agents import Agent, AgentSession, StopResponse, function_tool, room_io, llm
from activity_video import ActivityVideoPublisher
from livekit.plugins import openai as lk_openai, elevenlabs, silero from livekit.plugins import openai as lk_openai, elevenlabs, silero
from openai import AsyncOpenAI from openai import AsyncOpenAI
@@ -463,6 +464,8 @@ class VoiceSession:
self.lk_room = None self.lk_room = None
self.session = None self.session = None
self._task = None self._task = None
self._activity_video = None
self._activity_task = None
self._http_session = None self._http_session = None
self._caller_key: bytes | None = None self._caller_key: bytes | None = None
self._caller_identity: str | None = None self._caller_identity: str | None = None
@@ -575,6 +578,10 @@ class VoiceSession:
await obj.close() await obj.close()
except Exception: except Exception:
pass pass
if self._activity_video:
self._activity_video.stop()
if self._activity_task and not self._activity_task.done():
self._activity_task.cancel()
if self._task and not self._task.done(): if self._task and not self._task.done():
self._task.cancel() self._task.cancel()
try: try:
@@ -779,6 +786,17 @@ class VoiceSession:
if remote_identity: if remote_identity:
logger.info("Linking to remote participant: %s", remote_identity) logger.info("Linking to remote participant: %s", remote_identity)
# Publish activity video track (pulsing orb)
try:
self._activity_video = ActivityVideoPublisher()
video_track = rtc.LocalVideoTrack.create_video_track("activity", self._activity_video.source)
pub_opts = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_CAMERA)
await self.lk_room.local_participant.publish_track(video_track, pub_opts)
self._activity_task = asyncio.create_task(self._activity_video.run())
logger.info("Activity video track published")
except Exception as e:
logger.warning("Failed to publish activity video: %s", e)
# Load memories and user preferences for this caller # Load memories and user preferences for this caller
memory_section = "" memory_section = ""
user_timezone = None user_timezone = None
@@ -1217,6 +1235,12 @@ class VoiceSession:
) )
logger.info("Voice pipeline started (voice=%s, linked_to=%s)", voice_id, remote_identity) logger.info("Voice pipeline started (voice=%s, linked_to=%s)", voice_id, remote_identity)
# Wire agent state to activity video animation
if self._activity_video:
@self.session.on("agent_state_changed")
def _on_state_changed(ev):
self._activity_video.set_state(ev.new_state)
try: try:
await asyncio.wait_for( await asyncio.wait_for(
self.session.generate_reply( self.session.generate_reply(