From 7087fbf733a557d9e7bf76367679a88afacd346a Mon Sep 17 00:00:00 2001
From: Christian Gick
Date: Sat, 18 Apr 2026 05:25:12 +0000
Subject: [PATCH] fix(bot): prevent dangling preamble + force final summary on
 tool-loop exhaustion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two compounding streaming bugs caused the bot to render only a
'Gute Frage — lass mich' preamble when claude-haiku spent all
MAX_TOOL_ITERATIONS=5 on tool calls without producing final text.

1. Preamble leak: the stream posted the first content delta as soon as
it crossed MIN_CHARS_BEFORE_POST=20, before tool_calls deltas had
arrived. Added a 1.2s TOOL_GRACE_SECONDS buffer so the suppression path
catches the upcoming tool_calls before we go visible.

2. No final synthesis: when the loop exhausted iterations while still
requesting tools, reply was empty and the orphaned preamble stayed on
screen. Added a forced tools=None final call to make the model
summarize accumulated tool results before send/edit.
---
 bot.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/bot.py b/bot.py
index 8e5c8c9..5fa7e38 100644
--- a/bot.py
+++ b/bot.py
@@ -3283,6 +3283,30 @@ class Bot:
         if iteration > 0:
             sentry_sdk.set_tag("used_tools", "true")
 
+        # If the loop exhausted MAX_TOOL_ITERATIONS while the model was still
+        # requesting tools, `reply` is empty and tool results sit unsummarized
+        # in `messages`. Force one final text-only turn so the user sees a
+        # synthesis instead of the dangling preamble we already streamed.
+        if not reply and tool_calls:
+            logger.info(
+                "[stream] hit MAX_TOOL_ITERATIONS=%d still requesting tools; forcing final summary",
+                MAX_TOOL_ITERATIONS,
+            )
+            try:
+                final_resp = await self.llm.chat.completions.create(
+                    model=model,
+                    messages=messages + [{
+                        "role": "user",
+                        "content": "Bitte fasse jetzt deine Recherche zusammen — keine weiteren Tool-Aufrufe.",
+                    }],
+                    max_tokens=2048,
+                    tools=None,
+                )
+                reply = (final_resp.choices[0].message.content or "").strip()
+            except Exception:
+                logger.warning("[stream] forced final-summary call failed", exc_info=True)
+                reply = "_(Recherche lief in Tool-Schleife — bitte gezielter nachfragen.)_"
+
         # Send / finalize reply. If we streamed, do a final edit only if
         # the complete text differs from what was last sent (avoids the
         # "(bearbeitet)" / "(edited)" indicator for unchanged messages).
@@ -3754,8 +3778,10 @@ class Bot:
         event_id = prior_event_id
         last_edit = 0.0
         last_sent_text: str = ""  # track what was last sent to Matrix to avoid redundant edits
+        first_content_time: float = 0.0  # monotonic time of first content delta
         EDIT_THROTTLE = 0.6  # seconds — keep Matrix edit traffic reasonable
         MIN_CHARS_BEFORE_POST = 20  # avoid posting a single character first
+        TOOL_GRACE_SECONDS = 1.2  # buffer initial content this long; tool_calls deltas usually arrive within ~500ms
 
         try:
             stream = await self.llm.chat.completions.create(
@@ -3816,7 +3842,13 @@ class Bot:
                     # Suppress visible streaming once we know this turn will end in tool calls
                     if not tool_calls_acc:
                         now = time.monotonic()
-                        if now - last_edit >= EDIT_THROTTLE:
+                        if first_content_time == 0.0:
+                            first_content_time = now
+                        # Grace period: hold first post long enough for tool_calls deltas
+                        # to start arriving, so we never leak a "Gute Frage — lass mich…"
+                        # preamble that the model intends to follow with tool calls.
+                        grace_passed = (event_id is not None) or (now - first_content_time >= TOOL_GRACE_SECONDS)
+                        if grace_passed and now - last_edit >= EDIT_THROTTLE:
                             text_so_far = "".join(content_parts)
                             if len(text_so_far) >= MIN_CHARS_BEFORE_POST:
                                 if event_id is None: