From efb976a27c1748f4f35b4c686b855116a7cf6355 Mon Sep 17 00:00:00 2001
From: Christian Gick
Date: Fri, 6 Mar 2026 15:58:51 +0000
Subject: [PATCH] feat: activity video track (pulsing orb) for voice sessions

- ActivityVideoPublisher renders animated orb on 160x120 canvas
- Integrated into both agent.py and voice.py
- Updates confluence-collab submodule
---
 activity_video.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++
 agent.py          |  13 ++++
 confluence-collab |   2 +-
 voice.py          |  24 +++++++
 4 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 activity_video.py

diff --git a/activity_video.py b/activity_video.py
new file mode 100644
index 0000000..54c70ee
--- /dev/null
+++ b/activity_video.py
@@ -0,0 +1,161 @@
+"""Activity video track — pulsing orb (lightweight).
+
+Small 160x120 canvas, only renders pixels near the orb.
+LiveKit/browser upscales. Minimal CPU on both server and client.
+"""
+
+import asyncio
+import math
+import random
+import logging
+import time
+import struct
+
+from livekit.rtc import VideoSource, VideoFrame, VideoBufferType
+
+logger = logging.getLogger("activity-video")
+
+WIDTH = 160
+HEIGHT = 120
+FPS = 15
+BPP = 4
+CX, CY = WIDTH // 2, HEIGHT // 2
+
+BG = (12, 12, 28)
+
+STATE_COLORS = {
+    "listening": (40, 120, 255),
+    "thinking": (100, 60, 255),
+    "speaking": (30, 200, 255),
+    "initializing": (40, 60, 120),
+}
+
+_BG_PIXEL = struct.pack('BBBB', *BG, 255)
+_BG_FRAME = _BG_PIXEL * (WIDTH * HEIGHT)
+
+# Pre-compute distance from center — only within max possible glow radius
+MAX_ORB = 45  # max orb radius at full energy
+MAX_GLOW = int(MAX_ORB * 2.5) + 5
+# Store sparse: list of (pixel_index, distance) for pixels within MAX_GLOW of center
+_PIXELS = []
+for _y in range(max(0, CY - MAX_GLOW), min(HEIGHT, CY + MAX_GLOW + 1)):
+    dy = _y - CY
+    for _x in range(max(0, CX - MAX_GLOW), min(WIDTH, CX + MAX_GLOW + 1)):
+        dx = _x - CX
+        d = math.sqrt(dx * dx + dy * dy)
+        if d <= MAX_GLOW:
+            _PIXELS.append((_y * WIDTH + _x, d))
+
+
+class ActivityVideoPublisher:
+    def __init__(self):
+        self.source = VideoSource(WIDTH, HEIGHT)
+        self._state = "initializing"
+        self._stopped = False
+        self._pulse = 0.0
+        self._energy = 0.0
+        self._target_energy = 0.0
+        self._color = list(STATE_COLORS["initializing"])
+        self._target_color = list(STATE_COLORS["initializing"])
+        self._ring_phase = 0.0
+
+    def set_state(self, state: str):
+        if self._state != state:
+            logger.info("Activity video state: %s -> %s", self._state, state)
+            self._state = state
+            self._target_color = list(STATE_COLORS.get(state, STATE_COLORS["initializing"]))
+
+    def stop(self):
+        self._stopped = True
+
+    def _update(self, t: float):
+        state = self._state
+        for i in range(3):
+            self._color[i] += (self._target_color[i] - self._color[i]) * 0.08
+
+        if state == "listening":
+            self._target_energy = 0.3
+            self._pulse = 0.5 * math.sin(t * 1.5) + 0.5
+        elif state == "thinking":
+            self._target_energy = 0.6
+            self._pulse = 0.5 * math.sin(t * 3.0) + 0.5
+        elif state == "speaking":
+            self._target_energy = 0.9 + random.uniform(-0.1, 0.1)
+            self._pulse = 0.5 * math.sin(t * 6.0) + 0.5 + random.uniform(-0.15, 0.15)
+        else:
+            self._target_energy = 0.15
+            self._pulse = 0.3
+
+        self._energy += (self._target_energy - self._energy) * 0.12
+        self._ring_phase = t
+
+    def _render_frame(self) -> bytearray:
+        buf = bytearray(_BG_FRAME)
+
+        r, g, b = self._color
+        energy = self._energy
+        pulse = self._pulse
+        bg_r, bg_g, bg_b = BG
+
+        base_radius = 15 + 8 * energy
+        orb_radius = base_radius + 4 * pulse * energy
+        glow_radius = orb_radius * 2.5
+        inv_orb = 1.0 / max(orb_radius, 1)
+        glow_span = glow_radius - orb_radius
+        inv_glow = 1.0 / max(glow_span, 1)
+
+        ring_active = self._state == "speaking"
+        if ring_active:
+            ring1_r = orb_radius + ((self._ring_phase * 30) % glow_span)
+            ring2_r = orb_radius + ((self._ring_phase * 30 + glow_span * 0.5) % glow_span)
+
+        for idx, dist in _PIXELS:
+            if dist > glow_radius:
+                continue
+
+            if dist <= orb_radius:
+                f = dist * inv_orb
+                brightness = 1.0 - 0.3 * f * f
+                white = max(0.0, 1.0 - f * 2.5) * 0.6 * energy
+                pr = min(255, int(r * brightness + 255 * white))
+                pg = min(255, int(g * brightness + 255 * white))
+                pb = min(255, int(b * brightness + 255 * white))
+            else:
+                f = (dist - orb_radius) * inv_glow
+                t3 = 1.0 - f
+                glow = t3 * t3 * t3 * energy * 0.5
+
+                if ring_active:
+                    for rr in (ring1_r, ring2_r):
+                        rd = abs(dist - rr)
+                        if rd < 4:
+                            glow += (1.0 - rd * 0.25) * 0.3 * (1.0 - f)
+
+                pr = min(255, int(bg_r + r * glow))
+                pg = min(255, int(bg_g + g * glow))
+                pb = min(255, int(bg_b + b * glow))
+
+            off = idx * BPP
+            buf[off] = pr
+            buf[off + 1] = pg
+            buf[off + 2] = pb
+
+        return buf
+
+    async def run(self):
+        logger.info("Activity video loop started (%dx%d @ %d FPS, orb mode, %d active pixels)",
+                    WIDTH, HEIGHT, FPS, len(_PIXELS))
+        interval = 1.0 / FPS
+        t0 = time.monotonic()
+        rgba_type = VideoBufferType.Value('RGBA')
+
+        while not self._stopped:
+            t = time.monotonic() - t0
+            self._update(t)
+            buf = self._render_frame()
+            frame = VideoFrame(WIDTH, HEIGHT, rgba_type, buf)
+            self.source.capture_frame(frame)
+            render_time = time.monotonic() - t0 - t
+            await asyncio.sleep(max(0.001, interval - render_time))
+
+        logger.info("Activity video loop stopped")
diff --git a/agent.py b/agent.py
index 8bee49c..d7fc91a 100644
--- a/agent.py
+++ b/agent.py
@@ -1,3 +1,4 @@
+import asyncio
 import os
 import json
 import base64
@@ -10,6 +11,7 @@ from livekit.plugins import openai as lk_openai, elevenlabs, silero
 import livekit.rtc as rtc
 
 from e2ee_patch import KDF_HKDF
+from activity_video import ActivityVideoPublisher
 
 logger = logging.getLogger("matrix-ai-agent")
 logging.basicConfig(level=logging.DEBUG)
@@ -103,6 +105,13 @@ async def entrypoint(ctx: JobContext):
     logger.info("Connected to room, local identity: %s", ctx.room.local_participant.identity)
     logger.info("Remote participants: %s", list(ctx.room.remote_participants.keys()))
 
+    # Publish activity video track (pulsing orb)
+    activity_video = ActivityVideoPublisher()
+    video_track = rtc.LocalVideoTrack.create_video_track("activity", activity_video.source)
+    await ctx.room.local_participant.publish_track(video_track)
+    activity_task = asyncio.create_task(activity_video.run())
+    logger.info("Activity video track published")
+
     model = os.environ.get("LITELLM_MODEL", "claude-sonnet")
     voice_id = os.environ.get("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
 
@@ -120,6 +129,10 @@
         vad=ctx.proc.userdata["vad"],
     )
 
+    @session.on("agent_state_changed")
+    def on_state_changed(ev):
+        activity_video.set_state(ev.new_state)
+
     @session.on("user_speech_committed")
     def on_speech(msg):
         logger.info("USER_SPEECH_COMMITTED: %s", msg.text_content)
diff --git a/confluence-collab b/confluence-collab
index 7e85918..a189fa3 160000
--- a/confluence-collab
+++ b/confluence-collab
@@ -1 +1 @@
-Subproject commit 7e85918233b0d7268a254876784d9d981abd3c52
+Subproject commit a189fa326b68e20ec2cdf80c7a2157f6f4b0645f
diff --git a/voice.py b/voice.py
index e38a9f9..e248c7c 100644
--- a/voice.py
+++ b/voice.py
@@ -18,6 +18,7 @@ import aiohttp
 import httpx
 from livekit import rtc, api as lkapi
 from livekit.agents import Agent, AgentSession, StopResponse, function_tool, room_io, llm
+from activity_video import ActivityVideoPublisher
 from livekit.plugins import openai as lk_openai, elevenlabs, silero
 from openai import AsyncOpenAI
 
@@ -463,6 +464,8 @@
         self.lk_room = None
         self.session = None
         self._task = None
+        self._activity_video = None
+        self._activity_task = None
         self._http_session = None
         self._caller_key: bytes | None = None
         self._caller_identity: str | None = None
@@ -575,6 +578,10 @@
                 await obj.close()
             except Exception:
                 pass
+        if self._activity_video:
+            self._activity_video.stop()
+        if self._activity_task and not self._activity_task.done():
+            self._activity_task.cancel()
         if self._task and not self._task.done():
             self._task.cancel()
         try:
@@ -779,6 +786,17 @@ class VoiceSession:
         if remote_identity:
             logger.info("Linking to remote participant: %s", remote_identity)
 
+        # Publish activity video track (pulsing orb)
+        try:
+            self._activity_video = ActivityVideoPublisher()
+            video_track = rtc.LocalVideoTrack.create_video_track("activity", self._activity_video.source)
+            pub_opts = rtc.TrackPublishOptions(source=rtc.TrackSource.SOURCE_CAMERA)
+            await self.lk_room.local_participant.publish_track(video_track, pub_opts)
+            self._activity_task = asyncio.create_task(self._activity_video.run())
+            logger.info("Activity video track published")
+        except Exception as e:
+            logger.warning("Failed to publish activity video: %s", e)
+
         # Load memories and user preferences for this caller
         memory_section = ""
         user_timezone = None
@@ -1217,6 +1235,12 @@
             )
             logger.info("Voice pipeline started (voice=%s, linked_to=%s)", voice_id, remote_identity)
 
+            # Wire agent state to activity video animation
+            if self._activity_video:
+                @self.session.on("agent_state_changed")
+                def _on_state_changed(ev):
+                    self._activity_video.set_state(ev.new_state)
+
             try:
                 await asyncio.wait_for(
                     self.session.generate_reply(