feat: Haiku-default model routing with Sonnet escalation + Sentry observability
Route ~90% of simple chat to claude-haiku (4x cheaper), escalating to claude-sonnet for code blocks, long messages, technical keywords, multimodal input, and explicit user requests. Sentry tags track model_used and escalation_reason; token usage is recorded as breadcrumbs. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
78
bot.py
78
bot.py
@@ -69,6 +69,8 @@ CREDS_FILE = os.path.join(STORE_PATH, "credentials.json")
|
|||||||
LITELLM_URL = os.environ.get("LITELLM_BASE_URL", "")
|
LITELLM_URL = os.environ.get("LITELLM_BASE_URL", "")
|
||||||
LITELLM_KEY = os.environ.get("LITELLM_API_KEY", "not-needed")
|
LITELLM_KEY = os.environ.get("LITELLM_API_KEY", "not-needed")
|
||||||
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "claude-sonnet")
|
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "claude-sonnet")
|
||||||
|
BASE_MODEL = os.environ.get("BASE_MODEL", "claude-haiku")
|
||||||
|
ESCALATION_MODEL = os.environ.get("ESCALATION_MODEL", "claude-sonnet")
|
||||||
MEMORY_SERVICE_URL = os.environ.get("MEMORY_SERVICE_URL", "http://memory-service:8090")
|
MEMORY_SERVICE_URL = os.environ.get("MEMORY_SERVICE_URL", "http://memory-service:8090")
|
||||||
MEMORY_SERVICE_TOKEN = os.environ.get("MEMORY_SERVICE_TOKEN", "")
|
MEMORY_SERVICE_TOKEN = os.environ.get("MEMORY_SERVICE_TOKEN", "")
|
||||||
CONFLUENCE_URL = os.environ.get("CONFLUENCE_BASE_URL", "")
|
CONFLUENCE_URL = os.environ.get("CONFLUENCE_BASE_URL", "")
|
||||||
@@ -2078,9 +2080,60 @@ class Bot:
|
|||||||
else:
|
else:
|
||||||
return f"Unknown tool: {tool_name}"
|
return f"Unknown tool: {tool_name}"
|
||||||
|
|
||||||
|
# -- Escalation patterns for model routing --
# Technical terms whose presence suggests the cheap base model may not
# suffice; matched case-insensitively anywhere in the user message.
_ESCALATION_KEYWORDS = re.compile(
    r"\b(debug|architecture|algorithm|regex|sql|refactor|optimize|migration"
    # `step.by.step` uses `.` so "step-by-step" and "step by step" both match.
    r"|explain\s+in\s+detail|explain\s+how|step.by.step)\b",
    re.IGNORECASE,
)
# Phrases (English and German) where the user explicitly asks for a deeper,
# more thorough answer and therefore the stronger model.
_EXPLICIT_ESCALATION = re.compile(
    r"\b(think\s+harder|detailed|comprehensive|deep\s+dive|ausf[üu]hrlich|genau\s+erkl[äa]r)\b",
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
def _check_escalation(self, user_message: str, image_data: tuple | None) -> str | None:
|
||||||
|
"""Return escalation reason string, or None if Haiku suffices."""
|
||||||
|
if image_data:
|
||||||
|
return "multimodal"
|
||||||
|
if len(user_message) > 500:
|
||||||
|
return "long_message"
|
||||||
|
if "```" in user_message:
|
||||||
|
return "code_block"
|
||||||
|
m = self._ESCALATION_KEYWORDS.search(user_message)
|
||||||
|
if m:
|
||||||
|
return f"technical_keyword:{m.group(0).lower()}"
|
||||||
|
if user_message.count("?") >= 3:
|
||||||
|
return "multi_question"
|
||||||
|
if self._EXPLICIT_ESCALATION.search(user_message):
|
||||||
|
return "explicit_request"
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _select_model(self, room_id: str, user_message: str, image_data: tuple | None) -> tuple[str, str]:
    """Pick model: room override > escalation heuristics > BASE_MODEL."""
    # A per-room override always wins over the heuristics.
    try:
        return self.room_models[room_id], "room_override"
    except KeyError:
        pass

    reason = self._check_escalation(user_message, image_data)
    if reason:
        # Record the escalation in Sentry so cost/quality trade-offs
        # can be audited per request.
        sentry_sdk.set_tag("escalated", "true")
        sentry_sdk.set_tag("escalation_reason", reason)
        sentry_sdk.add_breadcrumb(category="model", message=f"Escalated to {ESCALATION_MODEL}: {reason}")
        return ESCALATION_MODEL, reason

    sentry_sdk.set_tag("escalated", "false")
    return BASE_MODEL, "default"
|
|
||||||
async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None) -> str | None:
|
async def _respond_with_ai(self, room, user_message: str, sender: str = None, image_data: tuple = None) -> str | None:
|
||||||
"""Send AI response and return the reply text (or None on failure)."""
|
"""Send AI response and return the reply text (or None on failure)."""
|
||||||
model = self.room_models.get(room.room_id, DEFAULT_MODEL)
|
model, escalation_reason = self._select_model(room.room_id, user_message, image_data)
|
||||||
|
sentry_sdk.set_tag("model_used", model)
|
||||||
|
sentry_sdk.set_context("ai_request", {
|
||||||
|
"message_length": len(user_message),
|
||||||
|
"has_images": bool(image_data),
|
||||||
|
"escalation_reason": escalation_reason,
|
||||||
|
"room_id": room.room_id[:30],
|
||||||
|
})
|
||||||
|
logger.info("Model selected: %s (reason: %s) for room %s", model, escalation_reason, room.room_id[:30])
|
||||||
|
|
||||||
# Fetch conversation history FIRST (needed for query rewriting)
|
# Fetch conversation history FIRST (needed for query rewriting)
|
||||||
history = []
|
history = []
|
||||||
@@ -2098,7 +2151,7 @@ class Bot:
|
|||||||
logger.debug("Could not fetch room history, proceeding without context")
|
logger.debug("Could not fetch room history, proceeding without context")
|
||||||
|
|
||||||
# Rewrite query using conversation context for better RAG search
|
# Rewrite query using conversation context for better RAG search
|
||||||
search_query = await self._rewrite_query(user_message, history, model)
|
search_query = await self._rewrite_query(user_message, history)
|
||||||
|
|
||||||
# Document context via MatrixHost API
|
# Document context via MatrixHost API
|
||||||
doc_results = await self.rag.search(search_query, matrix_user_id=sender) if sender else []
|
doc_results = await self.rag.search(search_query, matrix_user_id=sender) if sender else []
|
||||||
@@ -2180,6 +2233,17 @@ class Bot:
|
|||||||
choice = resp.choices[0]
|
choice = resp.choices[0]
|
||||||
reply = choice.message.content or ""
|
reply = choice.message.content or ""
|
||||||
|
|
||||||
|
sentry_sdk.add_breadcrumb(
|
||||||
|
category="llm",
|
||||||
|
message=f"LLM response: {model}",
|
||||||
|
data={
|
||||||
|
"tokens_in": getattr(resp.usage, "prompt_tokens", 0) if resp.usage else 0,
|
||||||
|
"tokens_out": getattr(resp.usage, "completion_tokens", 0) if resp.usage else 0,
|
||||||
|
"tool_calls": len(choice.message.tool_calls or []),
|
||||||
|
"iteration": iteration,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
if not choice.message.tool_calls:
|
if not choice.message.tool_calls:
|
||||||
# No tool calls — final text response
|
# No tool calls — final text response
|
||||||
break
|
break
|
||||||
@@ -2209,6 +2273,10 @@ class Bot:
|
|||||||
})
|
})
|
||||||
logger.info("Tool %s executed (iter %d) for %s", tc.function.name, iteration, sender)
|
logger.info("Tool %s executed (iter %d) for %s", tc.function.name, iteration, sender)
|
||||||
|
|
||||||
|
# Tag whether tools were used during the loop
|
||||||
|
if iteration > 0:
|
||||||
|
sentry_sdk.set_tag("used_tools", "true")
|
||||||
|
|
||||||
# Send final reply
|
# Send final reply
|
||||||
if reply:
|
if reply:
|
||||||
await self._send_text(room.room_id, reply)
|
await self._send_text(room.room_id, reply)
|
||||||
@@ -2246,7 +2314,7 @@ class Bot:
|
|||||||
await self._send_text(room.room_id, "Sorry, I couldn't generate a response.")
|
await self._send_text(room.room_id, "Sorry, I couldn't generate a response.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def _rewrite_query(self, user_message: str, history: list[dict], model: str) -> str:
|
async def _rewrite_query(self, user_message: str, history: list[dict]) -> str:
|
||||||
"""Rewrite user message into a standalone search query using conversation context."""
|
"""Rewrite user message into a standalone search query using conversation context."""
|
||||||
if not history or not self.llm:
|
if not history or not self.llm:
|
||||||
return user_message
|
return user_message
|
||||||
@@ -2261,7 +2329,7 @@ class Bot:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
resp = await self.llm.chat.completions.create(
|
resp = await self.llm.chat.completions.create(
|
||||||
model=model,
|
model=BASE_MODEL,
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "system", "content": (
|
{"role": "system", "content": (
|
||||||
"You are a search query rewriter. Given conversation history and a new user message, "
|
"You are a search query rewriter. Given conversation history and a new user message, "
|
||||||
@@ -2289,7 +2357,7 @@ class Bot:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
resp = await self.llm.chat.completions.create(
|
resp = await self.llm.chat.completions.create(
|
||||||
model=self.room_models.get(room.room_id, DEFAULT_MODEL),
|
model=BASE_MODEL,
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "user", "content": user_message},
|
{"role": "user", "content": user_message},
|
||||||
{"role": "assistant", "content": ai_reply[:300]},
|
{"role": "assistant", "content": ai_reply[:300]},
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ services:
|
|||||||
- LITELLM_BASE_URL
|
- LITELLM_BASE_URL
|
||||||
- LITELLM_API_KEY
|
- LITELLM_API_KEY
|
||||||
- DEFAULT_MODEL
|
- DEFAULT_MODEL
|
||||||
|
- BASE_MODEL=${BASE_MODEL:-claude-haiku}
|
||||||
|
- ESCALATION_MODEL=${ESCALATION_MODEL:-claude-sonnet}
|
||||||
- MEMORY_SERVICE_URL=http://memory-service:8090
|
- MEMORY_SERVICE_URL=http://memory-service:8090
|
||||||
- MEMORY_SERVICE_TOKEN
|
- MEMORY_SERVICE_TOKEN
|
||||||
- PORTAL_URL
|
- PORTAL_URL
|
||||||
|
|||||||
Reference in New Issue
Block a user