From d6dae1da8e1d6d7ac1f202b72083739d3145023a Mon Sep 17 00:00:00 2001
From: Christian Gick
Date: Sun, 8 Mar 2026 17:11:24 +0200
Subject: [PATCH] feat: Haiku-default model routing with Sonnet escalation + Sentry observability

Route ~90% of simple chat to claude-haiku (4x cheaper), escalate to
claude-sonnet for code blocks, long messages, technical keywords,
multimodal, and explicit requests. Sentry tags track model_used,
escalation_reason, and token usage breadcrumbs.

Co-Authored-By: Claude Opus 4.6
---
 bot.py             | 78 +++++++++++++++++++++++++++++++++++++++++++---
 docker-compose.yml |  2 ++
 2 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/bot.py b/bot.py
index 4049fef..9268ec4 100644
--- a/bot.py
+++ b/bot.py
@@ -69,6 +69,8 @@ CREDS_FILE = os.path.join(STORE_PATH, "credentials.json")
 LITELLM_URL = os.environ.get("LITELLM_BASE_URL", "")
 LITELLM_KEY = os.environ.get("LITELLM_API_KEY", "not-needed")
 DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "claude-sonnet")
+BASE_MODEL = os.environ.get("BASE_MODEL", "claude-haiku")
+ESCALATION_MODEL = os.environ.get("ESCALATION_MODEL", "claude-sonnet")
 MEMORY_SERVICE_URL = os.environ.get("MEMORY_SERVICE_URL", "http://memory-service:8090")
 MEMORY_SERVICE_TOKEN = os.environ.get("MEMORY_SERVICE_TOKEN", "")
 CONFLUENCE_URL = os.environ.get("CONFLUENCE_BASE_URL", "")
@@ -2078,9 +2080,60 @@ class Bot:
         else:
             return f"Unknown tool: {tool_name}"
 
+    # -- Escalation patterns for model routing --
+    _ESCALATION_KEYWORDS = re.compile(
+        r"\b(debug|architecture|algorithm|regex|sql|refactor|optimize|migration"
+        r"|explain\s+in\s+detail|explain\s+how|step.by.step)\b",
+        re.IGNORECASE,
+    )
+    _EXPLICIT_ESCALATION = re.compile(
+        r"\b(think\s+harder|detailed|comprehensive|deep\s+dive|ausf[üu]hrlich|genau\s+erkl[äa]r)\b",
+        re.IGNORECASE,
+    )
+
+    def _check_escalation(self, user_message: str, image_data: tuple | None) -> str | None:
+        """Return escalation reason string, or None if Haiku suffices."""
+        if image_data:
+            return "multimodal"
+        if len(user_message) > 500:
+            return "long_message"
+        if "```" in user_message:
+            return "code_block"
+        m = self._ESCALATION_KEYWORDS.search(user_message)
+        if m:
+            return f"technical_keyword:{m.group(0).lower()}"
+        if user_message.count("?") >= 3:
+            return "multi_question"
+        if self._EXPLICIT_ESCALATION.search(user_message):
+            return "explicit_request"
+        return None
+
+    def _select_model(self, room_id: str, user_message: str, image_data: tuple | None) -> tuple[str, str]:
+        """Pick model: room override > escalation heuristics > BASE_MODEL."""
+        if room_id in self.room_models:
+            return self.room_models[room_id], "room_override"
+
+        reason = self._check_escalation(user_message, image_data)
+        if reason:
+            sentry_sdk.set_tag("escalated", "true")
+            sentry_sdk.set_tag("escalation_reason", reason)
+            sentry_sdk.add_breadcrumb(category="model", message=f"Escalated to {ESCALATION_MODEL}: {reason}")
+            return ESCALATION_MODEL, reason
+
+        sentry_sdk.set_tag("escalated", "false")
+        return BASE_MODEL, "default"
+
     async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None) -> str | None:
         """Send AI response and return the reply text (or None on failure)."""
-        model = self.room_models.get(room.room_id, DEFAULT_MODEL)
+        model, escalation_reason = self._select_model(room.room_id, user_message, image_data)
+        sentry_sdk.set_tag("model_used", model)
+        sentry_sdk.set_context("ai_request", {
+            "message_length": len(user_message),
+            "has_images": bool(image_data),
+            "escalation_reason": escalation_reason,
+            "room_id": room.room_id[:30],
+        })
+        logger.info("Model selected: %s (reason: %s) for room %s", model, escalation_reason, room.room_id[:30])
 
         # Fetch conversation history FIRST (needed for query rewriting)
         history = []
@@ -2098,7 +2151,7 @@ class Bot:
             logger.debug("Could not fetch room history, proceeding without context")
 
         # Rewrite query using conversation context for better RAG search
-        search_query = await self._rewrite_query(user_message, history, model)
+        search_query = await self._rewrite_query(user_message, history)
 
         # Document context via MatrixHost API
         doc_results = await self.rag.search(search_query, matrix_user_id=sender) if sender else []
@@ -2180,6 +2233,17 @@ class Bot:
                 choice = resp.choices[0]
                 reply = choice.message.content or ""
 
+                sentry_sdk.add_breadcrumb(
+                    category="llm",
+                    message=f"LLM response: {model}",
+                    data={
+                        "tokens_in": getattr(resp.usage, "prompt_tokens", 0) if resp.usage else 0,
+                        "tokens_out": getattr(resp.usage, "completion_tokens", 0) if resp.usage else 0,
+                        "tool_calls": len(choice.message.tool_calls or []),
+                        "iteration": iteration,
+                    },
+                )
+
                 if not choice.message.tool_calls:
                     # No tool calls — final text response
                     break
@@ -2209,6 +2273,10 @@ class Bot:
                 })
                 logger.info("Tool %s executed (iter %d) for %s", tc.function.name, iteration, sender)
 
+        # Tag whether tools were used during the loop
+        if iteration > 0:
+            sentry_sdk.set_tag("used_tools", "true")
+
         # Send final reply
         if reply:
             await self._send_text(room.room_id, reply)
@@ -2246,7 +2314,7 @@ class Bot:
             await self._send_text(room.room_id, "Sorry, I couldn't generate a response.")
             return None
 
-    async def _rewrite_query(self, user_message: str, history: list[dict], model: str) -> str:
+    async def _rewrite_query(self, user_message: str, history: list[dict]) -> str:
         """Rewrite user message into a standalone search query using conversation context."""
         if not history or not self.llm:
             return user_message
@@ -2261,7 +2329,7 @@ class Bot:
 
         try:
             resp = await self.llm.chat.completions.create(
-                model=model,
+                model=BASE_MODEL,
                 messages=[
                     {"role": "system", "content": (
                         "You are a search query rewriter. Given conversation history and a new user message, "
@@ -2289,7 +2357,7 @@ class Bot:
 
         try:
            resp = await self.llm.chat.completions.create(
-                model=self.room_models.get(room.room_id, DEFAULT_MODEL),
+                model=BASE_MODEL,
                 messages=[
                     {"role": "user", "content": user_message},
                     {"role": "assistant", "content": ai_reply[:300]},
diff --git a/docker-compose.yml b/docker-compose.yml
index 218f97c..d1ac2ce 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,6 +19,8 @@ services:
       - LITELLM_BASE_URL
       - LITELLM_API_KEY
       - DEFAULT_MODEL
+      - BASE_MODEL=${BASE_MODEL:-claude-haiku}
+      - ESCALATION_MODEL=${ESCALATION_MODEL:-claude-sonnet}
       - MEMORY_SERVICE_URL=http://memory-service:8090
       - MEMORY_SERVICE_TOKEN
       - PORTAL_URL