feat(CF-2411): Pipeline hardening — Sentry, retry, concurrent limits, audit log

- Sentry transactions wrapping pipeline execution with tags
- Retry with exponential backoff for transient failures (connect, timeout, 5xx)
- Concurrent execution limit (3/user) enforced in scheduler
- Audit log events fired at each pipeline lifecycle point
- Resume support: skip already-completed steps on restart

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-03-24 08:29:24 +02:00
parent b3e6ae65de
commit 0ae59c8ebe
3 changed files with 286 additions and 110 deletions

View File

@@ -60,3 +60,43 @@ class PipelineStateManager:
except Exception:
logger.debug("Failed to fetch pending approvals", exc_info=True)
return []
async def count_active_executions(self, user_id: str) -> int:
    """Return the portal's active-execution count for ``user_id``.

    Issues a GET to the portal's ``/api/pipelines/executions/active``
    endpoint with ``userId`` as a query parameter and returns the
    ``count`` field of the JSON reply (0 when the field is absent).

    Any exception (connection error, timeout, non-2xx status, malformed
    body) is logged at warning level with its traceback and reported
    as 0.
    """
    # NOTE(review): reporting 0 on failure means a caller that enforces a
    # limit with this value would let the execution through — confirm
    # that failing open is intended.
    try:
        endpoint = f"{self.portal_url}/api/pipelines/executions/active"
        auth_headers = {"x-api-key": self.api_key}
        query = {"userId": user_id}
        async with httpx.AsyncClient(timeout=10.0) as http:
            response = await http.get(endpoint, headers=auth_headers, params=query)
            response.raise_for_status()
            body = response.json()
        return body.get("count", 0)
    except Exception:
        logger.warning(
            "Failed to count active executions for user %s",
            user_id,
            exc_info=True,
        )
        return 0
async def log_event(
    self, execution_id: str, action: str, *,
    step_name: str | None = None,
    status: str | None = None,
    message: str | None = None,
) -> None:
    """Log an audit event for a pipeline execution (best-effort).

    POSTs a JSON payload to the execution's ``audit-log`` endpoint on the
    portal. The payload always carries ``action``; ``stepName``,
    ``status`` and ``message`` are included only when the corresponding
    argument is truthy (``None`` and ``""`` are both omitted).

    Any ``Exception`` — including a non-2xx reply from the portal — is
    logged at debug level with its traceback and swallowed, so this
    method does not propagate ``Exception`` subclasses to the caller.
    """
    try:
        payload: dict[str, str] = {"action": action}
        if step_name:
            payload["stepName"] = step_name
        if status:
            payload["status"] = status
        if message:
            payload["message"] = message
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{self.portal_url}/api/pipelines/executions/{execution_id}/audit-log",
                headers={"x-api-key": self.api_key},
                json=payload,
            )
            # A 4xx/5xx reply means the event was not recorded; raise so the
            # handler below leaves a trace instead of dropping it silently.
            resp.raise_for_status()
    except Exception:
        # exc_info keeps the failure cause in the log, matching the other
        # handlers in this class.
        logger.debug(
            "Failed to log audit event %s for %s",
            action,
            execution_id,
            exc_info=True,
        )