feat(CF-1812): Use confluence-collab for section-based page editing

Replace the inline regex section parser in voice.py with the confluence_collab library (BeautifulSoup 4 parsing, retry on HTTP 409 conflicts). The bot now loads the page's section outline into the LLM context when Confluence links are detected.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 11:37:37 +02:00
parent 3e60e822be
commit 9e146da3b0
6 changed files with 52 additions and 64 deletions
--- a/4
+++ b/4
@@ -52,6 +52,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt

+# Install confluence-collab for section-based editing (CF-1812)
+COPY confluence-collab/ /tmp/confluence-collab/
+RUN pip install --no-cache-dir /tmp/confluence-collab/ && rm -rf /tmp/confluence-collab/
+
 # Overwrite installed FFI binary with patched version (HKDF + key_ring_size support)
 COPY --from=rust-build /build/livekit-rust-sdks/target/release/liblivekit_ffi.so /patched/
 ENV LIVEKIT_LIB_PATH=/patched/liblivekit_ffi.so
--- a/Dockerfile.bot
+++ b/Dockerfile.bot
@@ -3,4 +3,9 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg libolm-dev && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+
+# Install confluence-collab for section-based editing (CF-1812)
+COPY confluence-collab/ /tmp/confluence-collab/
+RUN pip install --no-cache-dir /tmp/confluence-collab/ && rm -rf /tmp/confluence-collab/
+
 COPY . .
--- a/bot.py
+++ b/bot.py
@@ -857,20 +857,42 @@ class Bot:
            except Exception as exc:
                logger.warning("Confluence short link resolution failed: %s", exc)
        if confluence_page_id:
-            # Fetch actual page content so the text bot can work with it
+            # Fetch page with section structure for targeted editing
            conf_text = f"confluence_page_id:{confluence_page_id}"
            conf_title = f"Confluence page {confluence_page_id}"
            if CONFLUENCE_URL and CONFLUENCE_USER and CONFLUENCE_TOKEN:
                try:
-                    from voice import _confluence_read_page
-                    title, plain, _ver = await _confluence_read_page(confluence_page_id)
-                    conf_title = title
-                    conf_text = f"confluence_page_id:{confluence_page_id}\n\nTitle: {title}\n\n{plain}"
-                    logger.info("Fetched Confluence page %s: %s (%d chars)",
-                                confluence_page_id, title, len(plain))
+                    from confluence_collab.client import Auth, get_page
+                    from confluence_collab.parser import parse_sections
+                    auth = Auth(base_url=CONFLUENCE_URL, username=CONFLUENCE_USER, api_token=CONFLUENCE_TOKEN)
+                    page = await get_page(confluence_page_id, auth)
+                    conf_title = page.title
+                    sections = parse_sections(page.body_html)
+                    section_outline = "\n".join(
+                        f"{'  ' * (s.level - 1)}h{s.level}: {s.heading}" for s in sections
+                    )
+                    # Strip HTML for plain text context
+                    plain = re.sub(r"<[^>]+>", " ", page.body_html)
+                    plain = re.sub(r"\s+", " ", plain).strip()
+                    conf_text = (
+                        f"confluence_page_id:{confluence_page_id}\n\n"
+                        f"Title: {page.title}\n\n"
+                        f"Sections:\n{section_outline}\n\n"
+                        f"{plain}"
+                    )
+                    logger.info("Fetched Confluence page %s: %s (%d chars, %d sections)",
+                                confluence_page_id, page.title, len(plain), len(sections))
                except Exception as exc:
                    logger.warning("Confluence page fetch failed for %s: %s",
                                   confluence_page_id, exc)
+                    # Fallback to voice.py reader
+                    try:
+                        from voice import _confluence_read_page
+                        title, plain, _ver = await _confluence_read_page(confluence_page_id)
+                        conf_title = title
+                        conf_text = f"confluence_page_id:{confluence_page_id}\n\nTitle: {title}\n\n{plain}"
+                    except Exception:
+                        pass
            docs = self._room_document_context.setdefault(room.room_id, [])
            docs.append({
                "type": "confluence",
--- a/1
+++ b/1
@@ -0,0 +1 @@
+/Users/christian.gick/Development/Infrastructure/mcp-servers/confluence-collab
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,5 @@ openai>=2.0,<3.0
 pymupdf>=1.24,<2.0
 python-docx>=1.0,<2.0
 Pillow>=10.0,<12.0
+beautifulsoup4>=4.12
+lxml>=5.0
--- a/voice.py
+++ b/voice.py
@@ -279,67 +279,21 @@ async def _confluence_read_page(page_id: str) -> tuple[str, str, int]:
 async def _confluence_update_section(page_id: str, section_heading: str, new_html: str) -> str:
    """Update a section of a Confluence page by heading.

-    Finds the section by heading, replaces content up to next same-level heading,
-    PUTs with incremented version.
+    Uses confluence_collab library for BS4 parsing and 409 conflict retry.
    """
    if not CONFLUENCE_URL or not CONFLUENCE_USER or not CONFLUENCE_TOKEN:
        return "Confluence credentials not configured."
-    # Read current page
-    url = f"{CONFLUENCE_URL}/rest/api/content/{page_id}"
-    params = {"expand": "body.storage,version,title"}
-    async with httpx.AsyncClient(timeout=15.0) as client:
-        resp = await client.get(url, params=params, auth=(CONFLUENCE_USER, CONFLUENCE_TOKEN))
-        resp.raise_for_status()
-        data = resp.json()
+    from confluence_collab.client import Auth
+    from confluence_collab.editor import section_update

-    title = data["title"]
-    version = data["version"]["number"]
-    body_html = data["body"]["storage"]["value"]
-
-    # Find section by heading (h1-h6) and replace content up to next same-level heading
-    heading_pattern = re.compile(
-        r'(<h([1-6])[^>]*>.*?' + re.escape(section_heading) + r'.*?</h\2>)',
-        re.IGNORECASE | re.DOTALL,
-    )
-    match = heading_pattern.search(body_html)
-    if not match:
-        return f"Section '{section_heading}' not found on page."
-
-    heading_tag = match.group(0)
-    heading_level = match.group(2)
-    section_start = match.end()
-
-    # Find next heading of same or higher level
-    next_heading = re.compile(
-        rf'<h[1-{heading_level}][^>]*>',
-        re.IGNORECASE,
-    )
-    next_match = next_heading.search(body_html, section_start)
-    section_end = next_match.start() if next_match else len(body_html)
-
-    # Replace section content
-    new_body = body_html[:section_start] + new_html + body_html[section_end:]
-
-    # PUT updated page
-    put_data = {
-        "version": {"number": version + 1},
-        "title": title,
-        "type": "page",
-        "body": {
-            "storage": {
-                "value": new_body,
-                "representation": "storage",
-            }
-        },
-    }
-    async with httpx.AsyncClient(timeout=15.0) as client:
-        resp = await client.put(
-            url,
-            json=put_data,
-            auth=(CONFLUENCE_USER, CONFLUENCE_TOKEN),
-        )
-        resp.raise_for_status()
-    return f"Section '{section_heading}' updated successfully."
+    auth = Auth(base_url=CONFLUENCE_URL, username=CONFLUENCE_USER, api_token=CONFLUENCE_TOKEN)
+    result = await section_update(page_id, section_heading, new_html, auth)
+    if result.ok:
+        msg = f"Section '{section_heading}' updated successfully."
+        if result.retries > 0:
+            msg += f" ({result.retries} conflict retries)"
+        return msg
+    return result.message


 def _build_e2ee_options() -> rtc.E2EEOptions:
				`@@ -0,0 +1 @@`
				`/Users/christian.gick/Development/Infrastructure/mcp-servers/confluence-collab`