Spaces:

melikakheirieh
/

nl2sql-copilot

Sleeping

App Files Files Community

Melika Kheirieh committed on Nov 12

Commit

f89e294

1 Parent(s): f55959d

feat(metrics): initialize all counters with zero and extend Prometheus rules for full Grafana coverage

Browse files

Files changed (4) hide show

docker-compose.prom.yml +1 -0
nl2sql/metrics.py +41 -1
nl2sql/verifier.py +9 -0
prometheus/rules.yml +83 -64

docker-compose.prom.yml CHANGED Viewed

@@ -4,6 +4,7 @@ services:
     container_name: nl2sql-prom
     command:
       - --config.file=/etc/prometheus/prometheus.yml
     volumes:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
       - ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro

     container_name: nl2sql-prom
     command:
       - --config.file=/etc/prometheus/prometheus.yml
+      - --web.enable-lifecycle
     volumes:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
       - ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro

nl2sql/metrics.py CHANGED Viewed

@@ -59,7 +59,6 @@ repair_attempts_total = Counter(
     registry=REGISTRY,
 )
 # -----------------------------------------------------------------------------
 #  Pipeline-level metrics
 # -----------------------------------------------------------------------------
@@ -69,3 +68,44 @@ pipeline_runs_total = Counter(
     ["status"],  # ok | error | ambiguous
     registry=REGISTRY,
 )

     registry=REGISTRY,
 )
 # -----------------------------------------------------------------------------
 #  Pipeline-level metrics
 # -----------------------------------------------------------------------------
     ["status"],  # ok | error | ambiguous
     registry=REGISTRY,
 )
+# -----------------------------------------------------------------------------
+#  Cache metrics (optional)
+# -----------------------------------------------------------------------------
+cache_events_total = Counter(
+    "cache_events_total",
+    "Cache hit/miss events in the pipeline",
+    ["hit"],  # "true" | "false"
+    registry=REGISTRY,
+)
+# -----------------------------------------------------------------------------
+#  Prime all counters with zero to ensure Grafana panels always have data
+# -----------------------------------------------------------------------------
+for reason in (
+    "forbidden_keyword",
+    "multiple_statements",
+    "non_readonly",
+    "explain_not_allowed",
+    "parse_error",
+    "semantic_check_error",
+    "adapter_failure",
+    "unsafe-sql",
+    "malformed-sql",
+    "unknown",
+):
+    safety_blocks_total.labels(reason=reason).inc(0)
+    verifier_failures_total.labels(reason=reason).inc(0)
+for ok in ("true", "false"):
+    safety_checks_total.labels(ok=ok).inc(0)
+    verifier_checks_total.labels(ok=ok).inc(0)
+for outcome in ("attempt", "success", "failed"):
+    repair_attempts_total.labels(outcome=outcome).inc(0)
+for status in ("ok", "error", "ambiguous"):
+    pipeline_runs_total.labels(status=status).inc(0)
+for hit in ("true", "false"):
+    cache_events_total.labels(hit=hit).inc(0)

nl2sql/verifier.py CHANGED Viewed

@@ -5,6 +5,10 @@ import time
 from typing import Any, Dict
 from nl2sql.types import StageResult, StageTrace
 class Verifier:
@@ -92,6 +96,7 @@ class Verifier:
             # --- pass ---
             dt = int(round((time.perf_counter() - t0) * 1000.0))
             notes.update({"verified": True, "reason": reason})
             trace = StageTrace(
                 stage="verifier",
                 duration_ms=dt,
@@ -123,6 +128,10 @@ class Verifier:
         notes.update({"verified": False, "reason": reason})
         if exc_type:
             notes["exception_type"] = exc_type
         trace = StageTrace(
             stage="verifier",
             duration_ms=dt,

 from typing import Any, Dict
 from nl2sql.types import StageResult, StageTrace
+from nl2sql.metrics import (
+    verifier_checks_total,
+    verifier_failures_total,
+)
 class Verifier:
             # --- pass ---
             dt = int(round((time.perf_counter() - t0) * 1000.0))
             notes.update({"verified": True, "reason": reason})
+            verifier_checks_total.labels(ok="true").inc()
             trace = StageTrace(
                 stage="verifier",
                 duration_ms=dt,
         notes.update({"verified": False, "reason": reason})
         if exc_type:
             notes["exception_type"] = exc_type
+        verifier_checks_total.labels(ok="false").inc()
+        verifier_failures_total.labels(reason=reason).inc()
         trace = StageTrace(
             stage="verifier",
             duration_ms=dt,

prometheus/rules.yml CHANGED Viewed

@@ -1,72 +1,91 @@
 groups:
 # 1) Recording rules (all derived metric calculations)
-- name: nl2sql_derived
-  interval: 15s
-  rules:
-    # p95 latency per stage (ms) — remove *1000 if histogram buckets are already in milliseconds
-    - record: nl2sql:stage_p95_ms
-      expr: |
-        histogram_quantile(
-          0.95,
-          sum by (le, stage) (rate(stage_duration_ms_bucket[5m]))
-        ) * 1000
-    # pipeline success ratio (0..1) — safe division to avoid divide-by-zero
-    - record: nl2sql:pipeline_success_ratio
-      expr: |
-        (
-          sum(rate(pipeline_runs_total{status="ok"}[5m]))
-        )
-        /
-        clamp_min(sum(rate(pipeline_runs_total[5m])), 1)
-    # repair success rate (0..1)
-    - record: nl2sql:repair_success_rate
-      expr: |
-          (sum(increase(repair_attempts_total{outcome="success"}[30m]))) /
-          clamp_min(sum(increase(repair_attempts_total[30m])), 1)
-    # cache hit ratio (0..1)
-    - record: nl2sql:cache_hit_ratio
-      expr: |
-        (
-          sum(rate(cache_hits_total[5m]))
-        )
-        /
-        clamp_min(
-          sum(rate(cache_hits_total[5m])) + sum(rate(cache_misses_total[5m])),
-          1
-        )
-# 2) Alerts (must come after recording rules)
-- name: nl2sql_alerts
-  rules:
-    # Success ratio < 90% for 10 minutes
-    - alert: PipelineLowSuccessRatio
-      expr: nl2sql:pipeline_success_ratio < 0.9
-      for: 10m
-      labels:
-        severity: warning
-      annotations:
-        summary: "Pipeline success ratio dropped"
-        description: "Success ratio < 90% over the past 10 minutes"
-    # Generator p95 latency > 1.5s for 5 minutes
-    - alert: GeneratorLatencyHigh
-      expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: "Generator p95 latency high"
-        description: "Generator p95 > 1.5s for 5 minutes"
-    # Safety blocks spike — per minute (not per second)
-    - alert: SafetyBlocksSpike
-      expr: rate(safety_blocks_total[5m]) * 60 > 0.5
-      for: 5m
-      labels:
-        severity: info
-      annotations:
-        summary: "Unusual Safety block rate"
-        description: "Safety blocks > 0.5 per minute (5m window)"

 groups:
 # 1) Recording rules (all derived metric calculations)
+  - name: nl2sql_derived
+    interval: 15s
+    rules:
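+      # p95 latency per stage (ms) — NOTE: remove the `* 1000` below if the stage_duration_ms histogram buckets are already in milliseconds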
+      - record: nl2sql:stage_p95_ms
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le, stage) (rate(stage_duration_ms_bucket[5m]))
+          ) * 1000
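+      # pipeline success ratio (0..1) — denominator is clamped to avoid divide-by-zero; NOTE: rate() is per-second, so clamp_min(..., 1) understates the ratio whenever traffic is below 1 run/s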
+      - record: nl2sql:pipeline_success_ratio
+        expr: |
+          (
+            sum(rate(pipeline_runs_total{status="ok"}[5m]))
+          )
+          /
+          clamp_min(sum(rate(pipeline_runs_total[5m])), 1)
+      # repair success rate
+      - record: nl2sql:repair_success_rate
+        expr: |
+          (
+            sum(rate(repair_attempts_total{outcome="success"}[5m]))
+          )
+          /
+          clamp_min(sum(rate(repair_attempts_total{outcome="attempt"}[5m])), 1)
+      # cache hit ratio
+      - record: nl2sql:cache_hit_ratio
+        expr: |
+          (
+            sum(rate(cache_events_total{hit="true"}[5m]))
+          )
+          /
+          clamp_min(sum(rate(cache_events_total[5m])), 1)
+      # verifier events per minute (split by ok)
+      - record: nl2sql:verifier_events_per_min
+        expr: |
+          sum by (ok) (rate(verifier_checks_total[1m]))
+      # safety blocks per minute
+      - record: nl2sql:safety_blocks_per_min
+        expr: |
+          sum(rate(safety_blocks_total[1m]))
+      # combined safety + verifier failures per minute
+      - record: nl2sql:safety_verifier_events_per_min
+        expr: |
+          (
+            sum(rate(safety_blocks_total[1m]))
+            +
+            sum(rate(verifier_failures_total[1m]))
+          )
+  # 2) Alerts (must come after recording rules)
+  - name: nl2sql_alerts
+    rules:
+      # Success ratio < 90% for 10 minutes
+      - alert: PipelineLowSuccessRatio
+        expr: nl2sql:pipeline_success_ratio < 0.9
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Pipeline success ratio dropped"
+          description: "Success ratio < 90% over the past 10 minutes"
+      # Generator p95 latency > 1.5s for 5 minutes
+      - alert: GeneratorLatencyHigh
+        expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Generator p95 latency high"
+          description: "Generator p95 > 1.5s for 5 minutes"
+      # Safety blocks spike — per minute (not per second)
+      - alert: SafetyBlocksSpike
+        expr: rate(safety_blocks_total[5m]) * 60 > 0.5
+        for: 5m
+        labels:
+          severity: info
+        annotations:
+          summary: "Unusual Safety block rate"
+          description: "Safety blocks > 0.5 per minute (5m window)"