feat(CF-2411): Pipeline hardening — Sentry, retry, concurrent limits, audit log

- Sentry transactions wrapping pipeline execution with tags
- Retry with exponential backoff for transient failures (connect, timeout, 5xx)
- Concurrent execution limit (3/user) enforced in scheduler
- Audit log events fired at each pipeline lifecycle point
- Resume support: skip already-completed steps on restart

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-03-24 08:29:24 +02:00
parent b3e6ae65de
commit 0ae59c8ebe
3 changed files with 286 additions and 110 deletions

View File

@@ -13,6 +13,7 @@ logger = logging.getLogger(__name__)
SYNC_INTERVAL = 300 # 5 minutes — full job reconciliation SYNC_INTERVAL = 300 # 5 minutes — full job reconciliation
PENDING_CHECK_INTERVAL = 15 # 15 seconds — fast check for manual triggers PENDING_CHECK_INTERVAL = 15 # 15 seconds — fast check for manual triggers
MAX_CONCURRENT_PER_USER = 3 # CF-2411: prevent runaway pipelines
class CronScheduler: class CronScheduler:
@@ -145,6 +146,16 @@ class CronScheduler:
pipelines = await self._pipeline_state.fetch_active_pipelines() pipelines = await self._pipeline_state.fetch_active_pipelines()
for pipeline in pipelines: for pipeline in pipelines:
if pipeline.get("lastStatus") == "pending": if pipeline.get("lastStatus") == "pending":
# CF-2411: concurrent limit check
user_id = pipeline.get("userId", "")
if user_id:
active = await self._pipeline_state.count_active_executions(user_id)
if active >= MAX_CONCURRENT_PER_USER:
logger.warning(
"Pipeline %s skipped: user %s has %d active executions (limit %d)",
pipeline["name"], user_id, active, MAX_CONCURRENT_PER_USER,
)
continue
logger.info("Pending pipeline trigger: %s", pipeline["name"]) logger.info("Pending pipeline trigger: %s", pipeline["name"])
asyncio.create_task(self.pipeline_engine.run(pipeline)) asyncio.create_task(self.pipeline_engine.run(pipeline))
@@ -155,6 +166,16 @@ class CronScheduler:
sleep_secs = self._seconds_until_next_run(pipeline) sleep_secs = self._seconds_until_next_run(pipeline)
if sleep_secs > 0: if sleep_secs > 0:
await asyncio.sleep(sleep_secs) await asyncio.sleep(sleep_secs)
# CF-2411: concurrent limit check
user_id = pipeline.get("userId", "")
if user_id:
active = await self._pipeline_state.count_active_executions(user_id)
if active >= MAX_CONCURRENT_PER_USER:
logger.warning(
"Pipeline %s cron skipped: user %s at limit (%d/%d)",
pipeline["name"], user_id, active, MAX_CONCURRENT_PER_USER,
)
continue
await self.pipeline_engine.run(pipeline) await self.pipeline_engine.run(pipeline)
except asyncio.CancelledError: except asyncio.CancelledError:
pass pass

View File

@@ -6,11 +6,26 @@ import re
import time import time
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
import httpx
import sentry_sdk
from .state import PipelineStateManager from .state import PipelineStateManager
from .steps import execute_step from .steps import execute_step
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Transient errors eligible for retry (see _execute_step_with_retry):
# network-level failures that are plausibly resolved by backing off and
# retrying the same request.
# NOTE(review): ConnectionError is a subclass of OSError, so listing both is
# redundant — and bare OSError is very broad (it also matches non-transient
# errors such as FileNotFoundError / PermissionError). Confirm the intended
# scope before narrowing.
TRANSIENT_EXCEPTIONS = (
    httpx.ConnectError,
    httpx.ConnectTimeout,
    httpx.ReadTimeout,
    ConnectionError,
    OSError,
)

# Sentinel for failed step (distinct from None output): a step may
# legitimately produce None, so failure is signalled by identity
# (`output is _STEP_FAILED`) rather than by a None/falsy check.
_STEP_FAILED = object()
class PipelineEngine: class PipelineEngine:
"""Executes pipeline steps sequentially, managing state and output chaining.""" """Executes pipeline steps sequentially, managing state and output chaining."""
@@ -74,80 +89,96 @@ class PipelineEngine:
pipeline_name = pipeline["name"] pipeline_name = pipeline["name"]
target_room = pipeline["targetRoom"] target_room = pipeline["targetRoom"]
steps = pipeline.get("steps", []) steps = pipeline.get("steps", [])
user_id = pipeline.get("userId", "")
max_retries = pipeline.get("maxRetries", 0)
if not steps: if not steps:
logger.warning("Pipeline %s has no steps", pipeline_name) logger.warning("Pipeline %s has no steps", pipeline_name)
return return
# Create execution record with sentry_sdk.start_transaction(op="pipeline.execute", name=pipeline_name) as txn:
execution = await self.state.create_execution(pipeline_id, trigger_data) txn.set_tag("pipeline_id", pipeline_id)
execution_id = execution["id"] txn.set_tag("user_id", user_id)
txn.set_tag("trigger_type", pipeline.get("triggerType", "unknown"))
txn.set_tag("step_count", len(steps))
context: dict[str, dict] = {} # Create execution record
if trigger_data: execution = await self.state.create_execution(pipeline_id, trigger_data)
context["trigger"] = trigger_data execution_id = execution["id"]
txn.set_tag("execution_id", execution_id)
step_results: list[dict] = [] # Audit: execution started
asyncio.create_task(self.state.log_event(
execution_id, "execution_started", message=f"Trigger: {pipeline.get('triggerType', 'manual')}"
))
try: context: dict[str, dict] = {}
for i, step in enumerate(steps): if trigger_data:
step_name = step.get("name", f"step_{i}") context["trigger"] = trigger_data
step_type = step.get("type", "")
# Evaluate condition step_results: list[dict] = []
condition = step.get("if")
if condition and not self.evaluate_condition(condition, context):
logger.info("Pipeline %s: skipping step %s (condition not met)", pipeline_name, step_name)
result = {"name": step_name, "output": "skipped", "status": "skipped", "timestamp": time.time()}
step_results.append(result)
context[step_name] = {"output": "skipped", "status": "skipped"}
continue
# Update execution state try:
await self.state.update_execution( for i, step in enumerate(steps):
execution_id, step_name = step.get("name", f"step_{i}")
currentStep=i, step_type = step.get("type", "")
stepResults=step_results,
state="running",
)
logger.info("Pipeline %s: executing step %s (%s)", pipeline_name, step_name, step_type) # Resume support: skip already-completed steps
if i < len(step_results) and step_results[i].get("status") in ("success", "skipped"):
# Render templates in step config context[step_name] = {
rendered_step = {} "output": step_results[i].get("output", ""),
for key, value in step.items(): "status": step_results[i]["status"],
if isinstance(value, str):
rendered_step[key] = self.render_template(value, context)
elif isinstance(value, dict):
rendered_step[key] = {
k: self.render_template(v, context) if isinstance(v, str) else v
for k, v in value.items()
} }
else: continue
rendered_step[key] = value
# Execute step # Evaluate condition
try: condition = step.get("if")
timeout_s = step.get("timeout_s", 60) if condition and not self.evaluate_condition(condition, context):
logger.info("Pipeline %s: skipping step %s (condition not met)", pipeline_name, step_name)
result = {"name": step_name, "output": "skipped", "status": "skipped", "timestamp": time.time()}
step_results.append(result)
context[step_name] = {"output": "skipped", "status": "skipped"}
continue
if step_type == "approval": # Update execution state
output = await self._execute_approval_step( await self.state.update_execution(
rendered_step, target_room, execution_id, timeout_s execution_id,
) currentStep=i,
else: stepResults=step_results,
output = await asyncio.wait_for( state="running",
execute_step( )
step_type=step_type,
step_config=rendered_step, logger.info("Pipeline %s: executing step %s (%s)", pipeline_name, step_name, step_type)
context=context,
send_text=self.send_text, # Render templates in step config
target_room=target_room, rendered_step = {}
llm=self.llm, for key, value in step.items():
default_model=self.default_model, if isinstance(value, str):
escalation_model=self.escalation_model, rendered_step[key] = self.render_template(value, context)
), elif isinstance(value, dict):
timeout=timeout_s, rendered_step[key] = {
) k: self.render_template(v, context) if isinstance(v, str) else v
for k, v in value.items()
}
else:
rendered_step[key] = value
# Execute step with retry logic
output = await self._execute_step_with_retry(
step_type=step_type,
step_name=step_name,
rendered_step=rendered_step,
context=context,
target_room=target_room,
execution_id=execution_id,
timeout_s=step.get("timeout_s", 60),
max_retries=max_retries,
step_results=step_results,
pipeline_name=pipeline_name,
)
if output is _STEP_FAILED:
return # step handler already updated state
result = { result = {
"name": step_name, "name": step_name,
@@ -162,60 +193,133 @@ class PipelineEngine:
if step_type == "approval": if step_type == "approval":
context[step_name]["response"] = output context[step_name]["response"] = output
except asyncio.TimeoutError: # Audit: step completed
error_msg = f"Step {step_name} timed out after {step.get('timeout_s', 60)}s" asyncio.create_task(self.state.log_event(
logger.error("Pipeline %s: %s", pipeline_name, error_msg) execution_id, "step_completed", step_name=step_name, status="success"
step_results.append({ ))
"name": step_name,
"output": None, # All steps completed
"status": "timeout", await self.state.update_execution(
"error": error_msg, execution_id,
"timestamp": time.time(), state="complete",
}) stepResults=step_results,
await self.state.update_execution( )
execution_id, logger.info("Pipeline %s completed successfully", pipeline_name)
state="failed",
stepResults=step_results, # Audit: execution completed
error=error_msg, asyncio.create_task(self.state.log_event(
execution_id, "execution_completed",
message=f"{len(steps)} steps completed",
))
except Exception as exc:
logger.error("Pipeline %s failed unexpectedly: %s", pipeline_name, exc, exc_info=True)
sentry_sdk.capture_exception(exc)
await self.state.update_execution(
execution_id,
state="failed",
stepResults=step_results,
error=str(exc),
)
asyncio.create_task(self.state.log_event(
execution_id, "execution_failed", message=str(exc)
))
async def _execute_step_with_retry(
self, *, step_type, step_name, rendered_step, context, target_room,
execution_id, timeout_s, max_retries, step_results, pipeline_name,
):
"""Execute a step with retry on transient failures. Returns output or _STEP_FAILED."""
last_exc = None
for attempt in range(max_retries + 1):
try:
if step_type == "approval":
output = await self._execute_approval_step(
rendered_step, target_room, execution_id, timeout_s
) )
await self.send_text(target_room, f"**{pipeline_name}**: {error_msg}") else:
return output = await asyncio.wait_for(
execute_step(
except Exception as exc: step_type=step_type,
error_msg = f"Step {step_name} failed: {exc}" step_config=rendered_step,
logger.error("Pipeline %s: %s", pipeline_name, error_msg, exc_info=True) context=context,
step_results.append({ send_text=self.send_text,
"name": step_name, target_room=target_room,
"output": None, llm=self.llm,
"status": "error", default_model=self.default_model,
"error": str(exc), escalation_model=self.escalation_model,
"timestamp": time.time(), ),
}) timeout=timeout_s,
await self.state.update_execution(
execution_id,
state="failed",
stepResults=step_results,
error=error_msg,
) )
await self.send_text(target_room, f"**{pipeline_name}**: {error_msg}") return output
return
# All steps completed except TRANSIENT_EXCEPTIONS as exc:
await self.state.update_execution( last_exc = exc
execution_id, if attempt < max_retries:
state="complete", backoff = 2 ** attempt # 1s, 2s, 4s
stepResults=step_results, logger.warning(
) "Pipeline %s: step %s transient failure (attempt %d/%d), retrying in %ds: %s",
logger.info("Pipeline %s completed successfully", pipeline_name) pipeline_name, step_name, attempt + 1, max_retries + 1, backoff, exc,
)
asyncio.create_task(self.state.log_event(
execution_id, "retry_attempted", step_name=step_name,
message=f"Attempt {attempt + 1}/{max_retries + 1}: {exc}",
))
await asyncio.sleep(backoff)
continue
# Exhausted retries — fall through to failure
except Exception as exc: except asyncio.TimeoutError:
logger.error("Pipeline %s failed unexpectedly: %s", pipeline_name, exc, exc_info=True) error_msg = f"Step {step_name} timed out after {timeout_s}s"
await self.state.update_execution( logger.error("Pipeline %s: %s", pipeline_name, error_msg)
execution_id, sentry_sdk.capture_message(error_msg, level="error")
state="failed", step_results.append({
stepResults=step_results, "name": step_name, "output": None, "status": "timeout",
error=str(exc), "error": error_msg, "timestamp": time.time(),
) })
await self.state.update_execution(
execution_id, state="failed", stepResults=step_results, error=error_msg,
)
await self.send_text(target_room, f"**{pipeline_name}**: {error_msg}")
asyncio.create_task(self.state.log_event(
execution_id, "step_failed", step_name=step_name, status="timeout", message=error_msg,
))
return _STEP_FAILED
except Exception as exc:
error_msg = f"Step {step_name} failed: {exc}"
logger.error("Pipeline %s: %s", pipeline_name, error_msg, exc_info=True)
sentry_sdk.capture_exception(exc)
step_results.append({
"name": step_name, "output": None, "status": "error",
"error": str(exc), "timestamp": time.time(),
})
await self.state.update_execution(
execution_id, state="failed", stepResults=step_results, error=error_msg,
)
await self.send_text(target_room, f"**{pipeline_name}**: {error_msg}")
asyncio.create_task(self.state.log_event(
execution_id, "step_failed", step_name=step_name, status="error", message=str(exc),
))
return _STEP_FAILED
# Retries exhausted on transient failure
error_msg = f"Step {step_name} failed after {max_retries + 1} attempts: {last_exc}"
logger.error("Pipeline %s: %s", pipeline_name, error_msg)
sentry_sdk.capture_exception(last_exc)
step_results.append({
"name": step_name, "output": None, "status": "error",
"error": error_msg, "timestamp": time.time(), "retries": max_retries,
})
await self.state.update_execution(
execution_id, state="failed", stepResults=step_results, error=error_msg,
)
await self.send_text(target_room, f"**{pipeline_name}**: {error_msg}")
asyncio.create_task(self.state.log_event(
execution_id, "step_failed", step_name=step_name, status="error", message=error_msg,
))
return _STEP_FAILED
@staticmethod @staticmethod
def _md_to_html(text: str) -> str: def _md_to_html(text: str) -> str:
@@ -275,10 +379,21 @@ class PipelineEngine:
try: try:
result = await asyncio.wait_for(future, timeout=timeout_s) result = await asyncio.wait_for(future, timeout=timeout_s)
asyncio.create_task(self.state.log_event(
execution_id, "approval_resolved", message=f"Response: {result}",
))
return result # "approve" or "decline" return result # "approve" or "decline"
except asyncio.TimeoutError: except asyncio.TimeoutError:
sentry_sdk.capture_message(
f"Approval timeout: execution={execution_id}",
level="warning",
)
await self.send_text(target_room, "Approval timed out. Pipeline aborted.") await self.send_text(target_room, "Approval timed out. Pipeline aborted.")
await self.state.update_execution(execution_id, state="aborted", error="Approval timed out") await self.state.update_execution(execution_id, state="aborted", error="Approval timed out")
asyncio.create_task(self.state.log_event(
execution_id, "step_failed", step_name="approval", status="timeout",
message="Approval timed out",
))
raise raise
finally: finally:
self._approval_futures.pop(execution_id, None) self._approval_futures.pop(execution_id, None)

View File

@@ -60,3 +60,43 @@ class PipelineStateManager:
except Exception: except Exception:
logger.debug("Failed to fetch pending approvals", exc_info=True) logger.debug("Failed to fetch pending approvals", exc_info=True)
return [] return []
async def count_active_executions(self, user_id: str) -> int:
    """Count running/waiting_approval executions for a user.

    Queries the portal API for the user's active execution count. On any
    failure the error is logged (with traceback) and 0 is returned, so the
    per-user concurrency limit fails open instead of blocking pipelines
    whenever the portal is unreachable.
    """
    endpoint = f"{self.portal_url}/api/pipelines/executions/active"
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                endpoint,
                headers={"x-api-key": self.api_key},
                params={"userId": user_id},
            )
            response.raise_for_status()
            body = response.json()
        return body.get("count", 0)
    except Exception:
        logger.warning(
            "Failed to count active executions for user %s",
            user_id,
            exc_info=True,
        )
        return 0
async def log_event(
    self, execution_id: str, action: str, *,
    step_name: str | None = None,
    status: str | None = None,
    message: str | None = None,
) -> None:
    """Log an audit event for a pipeline execution (fire-and-forget).

    POSTs to the portal's audit-log endpoint. Failures are swallowed by
    design — audit logging must never break pipeline execution — but are
    recorded at DEBUG level *with* the traceback so they can be diagnosed.

    Args:
        execution_id: Execution the event belongs to.
        action: Event name, e.g. "execution_started", "step_failed".
        step_name: Optional step the event refers to.
        status: Optional outcome, e.g. "success", "timeout", "error".
        message: Optional free-form detail string.
    """
    try:
        payload: dict[str, str] = {"action": action}
        if step_name:
            payload["stepName"] = step_name
        if status:
            payload["status"] = status
        if message:
            payload["message"] = message
        async with httpx.AsyncClient(timeout=10.0) as client:
            await client.post(
                f"{self.portal_url}/api/pipelines/executions/{execution_id}/audit-log",
                headers={"x-api-key": self.api_key},
                json=payload,
            )
    except Exception:
        # Fix: was missing exc_info, leaving swallowed errors untraceable —
        # now consistent with the other handlers in this class.
        logger.debug(
            "Failed to log audit event %s for %s",
            action,
            execution_id,
            exc_info=True,
        )