feat(CF-2411): Pipeline hardening — Sentry, retry, concurrent limits, audit log

- Sentry transactions wrapping pipeline execution with tags
- Retry with exponential backoff for transient failures (connect, timeout, 5xx)
- Concurrent execution limit (3/user) enforced in scheduler
- Audit log events fired at each pipeline lifecycle point
- Resume support: skip already-completed steps on restart

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-03-24 08:29:24 +02:00
parent b3e6ae65de
commit 0ae59c8ebe
3 changed files with 286 additions and 110 deletions

View File

@@ -13,6 +13,7 @@ logger = logging.getLogger(__name__)
# Scheduler timing and limit constants. Interval values are in seconds.
SYNC_INTERVAL = 300 # 5 minutes — full job reconciliation
PENDING_CHECK_INTERVAL = 15 # 15 seconds — fast check for manual triggers
# CF-2411: per-user cap on concurrently active pipeline executions. The
# scheduler compares count_active_executions(user_id) against this value before
# starting a pending trigger or a cron run, and skips the run (with a warning
# log) when the count is already at or above it. Pipelines whose "userId" is
# missing or empty are not subject to the check.
# NOTE(review): the count is read before the run is started; confirm that
# count_active_executions() already reflects runs scheduled moments earlier in
# the same pass, otherwise several pending pipelines of one user could pass
# the check together.
MAX_CONCURRENT_PER_USER = 3 # CF-2411: prevent runaway pipelines
class CronScheduler:
@@ -145,6 +146,16 @@ class CronScheduler:
pipelines = await self._pipeline_state.fetch_active_pipelines()
for pipeline in pipelines:
if pipeline.get("lastStatus") == "pending":
# CF-2411: concurrent limit check
user_id = pipeline.get("userId", "")
if user_id:
active = await self._pipeline_state.count_active_executions(user_id)
if active >= MAX_CONCURRENT_PER_USER:
logger.warning(
"Pipeline %s skipped: user %s has %d active executions (limit %d)",
pipeline["name"], user_id, active, MAX_CONCURRENT_PER_USER,
)
continue
logger.info("Pending pipeline trigger: %s", pipeline["name"])
asyncio.create_task(self.pipeline_engine.run(pipeline))
@@ -155,6 +166,16 @@ class CronScheduler:
sleep_secs = self._seconds_until_next_run(pipeline)
if sleep_secs > 0:
await asyncio.sleep(sleep_secs)
# CF-2411: concurrent limit check
user_id = pipeline.get("userId", "")
if user_id:
active = await self._pipeline_state.count_active_executions(user_id)
if active >= MAX_CONCURRENT_PER_USER:
logger.warning(
"Pipeline %s cron skipped: user %s at limit (%d/%d)",
pipeline["name"], user_id, active, MAX_CONCURRENT_PER_USER,
)
continue
await self.pipeline_engine.run(pipeline)
except asyncio.CancelledError:
pass