From efb976a27c1748f4f35b4c686b855116a7cf6355 Mon Sep 17 00:00:00 2001
From: Christian Gick
Date: Fri, 6 Mar 2026 15:58:51 +0000
Subject: [PATCH] feat: activity video track (pulsing orb) for voice sessions

- ActivityVideoPublisher renders animated orb on 160x120 canvas
- Integrated into both agent.py and voice.py
- Updates confluence-collab submodule
---
 activity_video.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++
 agent.py          |  13 ++++
 confluence-collab |   2 +-
 voice.py          |  24 +++++++
 4 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 activity_video.py

diff --git a/activity_video.py b/activity_video.py
new file mode 100644
index 0000000..54c70ee
--- /dev/null
+++ b/activity_video.py
@@ -0,0 +1,161 @@
+"""Activity video track — pulsing orb (lightweight).
+
+Small 160x120 canvas, only renders pixels near the orb.
+LiveKit/browser upscales. Minimal CPU on both server and client.
+"""
+
+import asyncio
+import math
+import random
+import logging
+import time
+import struct
+
+from livekit.rtc import VideoSource, VideoFrame, VideoBufferType
+
+logger = logging.getLogger("activity-video")
+
+WIDTH = 160
+HEIGHT = 120
+FPS = 15
+BPP = 4
+CX, CY = WIDTH // 2, HEIGHT // 2
+
+BG = (12, 12, 28)
+
+STATE_COLORS = {
+    "listening": (40, 120, 255),
+    "thinking": (100, 60, 255),
+    "speaking": (30, 200, 255),
+    "initializing": (40, 60, 120),
+}
+
+_BG_PIXEL = struct.pack('BBBB', *BG, 255)
+_BG_FRAME = _BG_PIXEL * (WIDTH * HEIGHT)
+
+# Pre-compute distance from center — only within max possible glow radius
+MAX_ORB = 45  # max orb radius at full energy
+MAX_GLOW = int(MAX_ORB * 2.5) + 5
+# Store sparse: list of (pixel_index, distance) for pixels within MAX_GLOW of center
+_PIXELS = []
+for _y in range(max(0, CY - MAX_GLOW), min(HEIGHT, CY + MAX_GLOW + 1)):
+    dy = _y - CY
+    for _x in range(max(0, CX - MAX_GLOW), min(WIDTH, CX + MAX_GLOW + 1)):
+        dx = _x - CX
+        d = math.sqrt(dx * dx + dy * dy)
+        if d <= MAX_GLOW:
+            _PIXELS.append((_y * WIDTH + _x, d))
+
+
+class ActivityVideoPublisher:
+    def __init__(self):
+        self.source = VideoSource(WIDTH, HEIGHT)
+        self._state = "initializing"
+        self._stopped = False
+        self._pulse = 0.0
+        self._energy = 0.0
+        self._target_energy = 0.0
+        self._color = list(STATE_COLORS["initializing"])
+        self._target_color = list(STATE_COLORS["initializing"])
+        self._ring_phase = 0.0
+
+    def set_state(self, state: str):
+        if self._state != state:
+            logger.info("Activity video state: %s -> %s", self._state, state)
+            self._state = state
+            self._target_color = list(STATE_COLORS.get(state, STATE_COLORS["initializing"]))
+
+    def stop(self):
+        self._stopped = True
+
+    def _update(self, t: float):
+        state = self._state
+        for i in range(3):
+            self._color[i] += (self._target_color[i] - self._color[i]) * 0.08
+
+        if state == "listening":
+            self._target_energy = 0.3
+            self._pulse = 0.5 * math.sin(t * 1.5) + 0.5
+        elif state == "thinking":
+            self._target_energy = 0.6
+            self._pulse = 0.5 * math.sin(t * 3.0) + 0.5
+        elif state == "speaking":
+            self._target_energy = 0.9 + random.uniform(-0.1, 0.1)
+            self._pulse = 0.5 * math.sin(t * 6.0) + 0.5 + random.uniform(-0.15, 0.15)
+        else:
+            self._target_energy = 0.15
+            self._pulse = 0.3
+
+        self._energy += (self._target_energy - self._energy) * 0.12
+        self._ring_phase = t
+
+    def _render_frame(self) -> bytearray:
+        buf = bytearray(_BG_FRAME)
+
+        r, g, b = self._color
+        energy = self._energy
+        pulse = self._pulse
+        bg_r, bg_g, bg_b = BG
+
+        base_radius = 15 + 8 * energy
+        orb_radius = base_radius + 4 * pulse * energy
+        glow_radius = orb_radius * 2.5
+        inv_orb = 1.0 / max(orb_radius, 1)
+        glow_span = glow_radius - orb_radius
+        inv_glow = 1.0 / max(glow_span, 1)
+
+        ring_active = self._state == "speaking"
+        if ring_active:
+            ring1_r = orb_radius + ((self._ring_phase * 30) % glow_span)
+            ring2_r = orb_radius + ((self._ring_phase * 30 + glow_span * 0.5) % glow_span)
+
+        for idx, dist in _PIXELS:
+            if dist > glow_radius:
+                continue
+
+            if dist <= orb_radius:
+                f = dist * inv_orb
+                brightness = 1.0 - 0.3 * f * f
+                white = max(0.0, 1.0 - f * 2.5) * 0.6 * energy
+                pr = min(255, int(r * brightness + 255 * white))
+                pg = min(255, int(g * brightness + 255 * white))
+                pb = min(255, int(b * brightness + 255 * white))
+            else:
+                f = (dist - orb_radius) * inv_glow
+                t3 = 1.0 - f
+                glow = t3 * t3 * t3 * energy * 0.5
+
+                if ring_active:
+                    for rr in (ring1_r, ring2_r):
+                        rd = abs(dist - rr)
+                        if rd < 4:
+                            glow += (1.0 - rd * 0.25) * 0.3 * (1.0 - f)
+
+                pr = min(255, int(bg_r + r * glow))
+                pg = min(255, int(bg_g + g * glow))
+                pb = min(255, int(bg_b + b * glow))
+
+            off = idx * BPP
+            buf[off] = pr
+            buf[off + 1] = pg
+            buf[off + 2] = pb
+
+        return buf
+
+    async def run(self):
+        logger.info("Activity video loop started (%dx%d @ %d FPS, orb mode, %d active pixels)",
+                    WIDTH, HEIGHT, FPS, len(_PIXELS))
+        interval = 1.0 / FPS
+        t0 = time.monotonic()
+        rgba_type = VideoBufferType.Value('RGBA')
+
+        while not self._stopped:
+            t = time.monotonic() - t0
+            self._update(t)
+            buf = self._render_frame()
+            frame = VideoFrame(WIDTH, HEIGHT, rgba_type, buf)
+            self.source.capture_frame(frame)
+            render_time = time.monotonic() - t0 - t
+            await asyncio.sleep(max(0.001, interval - render_time))
+
+        logger.info("Activity video loop stopped")
diff --git a/agent.py b/agent.py
index 8bee49c..d7fc91a 100644
--- a/agent.py
+++ b/agent.py
@@ -1,3 +1,4 @@
+import asyncio
 import os
 import json
 import base64
@@ -10,6 +11,7 @@ from livekit.plugins import openai as lk_openai, elevenlabs, silero
 import livekit.rtc as rtc
 
 from e2ee_patch import KDF_HKDF
+from activity_video import ActivityVideoPublisher
 
 logger = logging.getLogger("matrix-ai-agent")
 logging.basicConfig(level=logging.DEBUG)
@@ -103,6 +105,13 @@ async def entrypoint(ctx: JobContext):
     logger.info("Connected to room, local identity: %s", ctx.room.local_participant.identity)
     logger.info("Remote participants: %s", list(ctx.room.remote_participants.keys()))
 
+    # Publish activity video track (pulsing orb)
+    activity_video = ActivityVideoPublisher()
+    video_track = rtc.LocalVideoTrack.create_video_track("activity", activity_video.source)
+    await ctx.room.local_participant.publish_track(video_track)
+    activity_task = asyncio.create_task(activity_video.run())
+    logger.info("Activity video track published")
+
     model = os.environ.get("LITELLM_MODEL", "claude-sonnet")
     voice_id = os.environ.get("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 
@@ -120,6 +129,10 @@
         vad=ctx.proc.userdata["vad"],
     )
 
+    @session.on("agent_state_changed")
+    def on_state_changed(ev):
+        activity_video.set_state(ev.new_state)
+
     @session.on("user_speech_committed")
     def on_speech(msg):
         logger.info("USER_SPEECH_COMMITTED: %s", msg.text_content)
diff --git a/confluence-collab b/confluence-collab
index 7e85918..a189fa3 160000
--- a/confluence-collab
+++ b/confluence-collab
@@ -1 +1 @@
-Subproject commit 7e85918233b0d7268a254876784d9d981abd3c52
+Subproject commit a189fa326b68e20ec2cdf80c7a2157f6f4b0645f
diff --git a/voice.py b/voice.py
index e38a9f9..e248c7c 100644
--- a/voice.py
+++ b/voice.py
@@ -18,6 +18,7 @@ import aiohttp
 import httpx
 from livekit import rtc, api as lkapi
 from livekit.agents import Agent, AgentSession, StopResponse, function_tool, room_io, llm
+from activity_video import ActivityVideoPublisher
 from livekit.plugins import openai as lk_openai, elevenlabs, silero
 from openai import AsyncOpenAI
 
@@ -463,6 +464,8 @@
         self.lk_room = None
         self.session = None
         self._task = None
+        self._activity_video = None
+        self._activity_task = None
         self._http_session = None
         self._caller_key: bytes | None = None
         self._caller_identity: str | None = None
@@ -575,6 +578,10 @@
                 await obj.close()
             except Exception:
                 pass
+        if self._activity_video:
+            self._activity_video.stop()
+        if self._activity_task and not self._activity_task.done():
+            self._activity_task.cancel()
         if self._task and not self._task.done():
             self._task.cancel()
         try:
@@ -779,6 +786,17 @@ class VoiceSession:
         if remote_identity:
             logger.info("Linking to remote participant: %s", remote_identity)
 
+        # Publish activity video track (pulsing orb)
+        try:
+            self._activity_video = ActivityVideoPublisher()
+            video_track = rtc.LocalVideoTrack.create_video_track("activity", self._activity_video.source)
+            pub_opts = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_CAMERA)
+            await self.lk_room.local_participant.publish_track(video_track, pub_opts)
+            self._activity_task = asyncio.create_task(self._activity_video.run())
+            logger.info("Activity video track published")
+        except Exception as e:
+            logger.warning("Failed to publish activity video: %s", e)
+
         # Load memories and user preferences for this caller
         memory_section = ""
         user_timezone = None
@@ -1217,6 +1235,12 @@
             )
             logger.info("Voice pipeline started (voice=%s, linked_to=%s)", voice_id, remote_identity)
 
+            # Wire agent state to activity video animation
+            if self._activity_video:
+                @self.session.on("agent_state_changed")
+                def _on_state_changed(ev):
+                    self._activity_video.set_state(ev.new_state)
+
             try:
                 await asyncio.wait_for(
                     self.session.generate_reply(