Melika Kheirieh committed on
Commit
f89e294
·
1 Parent(s): f55959d

feat(metrics): initialize all counters with zero and extend Prometheus rules for full Grafana coverage

Browse files
docker-compose.prom.yml CHANGED
@@ -4,6 +4,7 @@ services:
4
  container_name: nl2sql-prom
5
  command:
6
  - --config.file=/etc/prometheus/prometheus.yml
 
7
  volumes:
8
  - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
9
  - ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro
 
4
  container_name: nl2sql-prom
5
  command:
6
  - --config.file=/etc/prometheus/prometheus.yml
7
+ - --web.enable-lifecycle
8
  volumes:
9
  - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
10
  - ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro
nl2sql/metrics.py CHANGED
@@ -59,7 +59,6 @@ repair_attempts_total = Counter(
59
  registry=REGISTRY,
60
  )
61
 
62
-
63
  # -----------------------------------------------------------------------------
64
  # Pipeline-level metrics
65
  # -----------------------------------------------------------------------------
@@ -69,3 +68,44 @@ pipeline_runs_total = Counter(
69
  ["status"], # ok | error | ambiguous
70
  registry=REGISTRY,
71
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  registry=REGISTRY,
60
  )
61
 
 
62
  # -----------------------------------------------------------------------------
63
  # Pipeline-level metrics
64
  # -----------------------------------------------------------------------------
 
68
  ["status"], # ok | error | ambiguous
69
  registry=REGISTRY,
70
  )
71
+
72
+ # -----------------------------------------------------------------------------
73
+ # Cache metrics (optional)
74
+ # -----------------------------------------------------------------------------
75
+ cache_events_total = Counter(
76
+ "cache_events_total",
77
+ "Cache hit/miss events in the pipeline",
78
+ ["hit"], # "true" | "false"
79
+ registry=REGISTRY,
80
+ )
81
+
82
+ # -----------------------------------------------------------------------------
83
+ # Prime all counters with zero to ensure Grafana panels always have data
84
+ # -----------------------------------------------------------------------------
85
+ for reason in (
86
+ "forbidden_keyword",
87
+ "multiple_statements",
88
+ "non_readonly",
89
+ "explain_not_allowed",
90
+ "parse_error",
91
+ "semantic_check_error",
92
+ "adapter_failure",
93
+ "unsafe-sql",
94
+ "malformed-sql",
95
+ "unknown",
96
+ ):
97
+ safety_blocks_total.labels(reason=reason).inc(0)
98
+ verifier_failures_total.labels(reason=reason).inc(0)
99
+
100
+ for ok in ("true", "false"):
101
+ safety_checks_total.labels(ok=ok).inc(0)
102
+ verifier_checks_total.labels(ok=ok).inc(0)
103
+
104
+ for outcome in ("attempt", "success", "failed"):
105
+ repair_attempts_total.labels(outcome=outcome).inc(0)
106
+
107
+ for status in ("ok", "error", "ambiguous"):
108
+ pipeline_runs_total.labels(status=status).inc(0)
109
+
110
+ for hit in ("true", "false"):
111
+ cache_events_total.labels(hit=hit).inc(0)
nl2sql/verifier.py CHANGED
@@ -5,6 +5,10 @@ import time
5
  from typing import Any, Dict
6
 
7
  from nl2sql.types import StageResult, StageTrace
 
 
 
 
8
 
9
 
10
  class Verifier:
@@ -92,6 +96,7 @@ class Verifier:
92
  # --- pass ---
93
  dt = int(round((time.perf_counter() - t0) * 1000.0))
94
  notes.update({"verified": True, "reason": reason})
 
95
  trace = StageTrace(
96
  stage="verifier",
97
  duration_ms=dt,
@@ -123,6 +128,10 @@ class Verifier:
123
  notes.update({"verified": False, "reason": reason})
124
  if exc_type:
125
  notes["exception_type"] = exc_type
 
 
 
 
126
  trace = StageTrace(
127
  stage="verifier",
128
  duration_ms=dt,
 
5
  from typing import Any, Dict
6
 
7
  from nl2sql.types import StageResult, StageTrace
8
+ from nl2sql.metrics import (
9
+ verifier_checks_total,
10
+ verifier_failures_total,
11
+ )
12
 
13
 
14
  class Verifier:
 
96
  # --- pass ---
97
  dt = int(round((time.perf_counter() - t0) * 1000.0))
98
  notes.update({"verified": True, "reason": reason})
99
+ verifier_checks_total.labels(ok="true").inc()
100
  trace = StageTrace(
101
  stage="verifier",
102
  duration_ms=dt,
 
128
  notes.update({"verified": False, "reason": reason})
129
  if exc_type:
130
  notes["exception_type"] = exc_type
131
+
132
+ verifier_checks_total.labels(ok="false").inc()
133
+ verifier_failures_total.labels(reason=reason).inc()
134
+
135
  trace = StageTrace(
136
  stage="verifier",
137
  duration_ms=dt,
prometheus/rules.yml CHANGED
@@ -1,72 +1,91 @@
1
  groups:
2
  # 1) Recording rules (all derived metric calculations)
3
- - name: nl2sql_derived
4
- interval: 15s
5
- rules:
6
- # p95 latency per stage (ms) — remove *1000 if histogram buckets are already in milliseconds
7
- - record: nl2sql:stage_p95_ms
8
- expr: |
9
- histogram_quantile(
10
- 0.95,
11
- sum by (le, stage) (rate(stage_duration_ms_bucket[5m]))
12
- ) * 1000
13
 
14
- # pipeline success ratio (0..1) — safe division to avoid divide-by-zero
15
- - record: nl2sql:pipeline_success_ratio
16
- expr: |
17
- (
18
- sum(rate(pipeline_runs_total{status="ok"}[5m]))
19
- )
20
- /
21
- clamp_min(sum(rate(pipeline_runs_total[5m])), 1)
22
 
23
- # repair success rate (0..1)
24
- - record: nl2sql:repair_success_rate
25
- expr: |
26
- (sum(increase(repair_attempts_total{outcome="success"}[30m]))) /
27
- clamp_min(sum(increase(repair_attempts_total[30m])), 1)
 
 
 
28
 
29
- # cache hit ratio (0..1)
30
- - record: nl2sql:cache_hit_ratio
31
- expr: |
32
- (
33
- sum(rate(cache_hits_total[5m]))
34
- )
35
- /
36
- clamp_min(
37
- sum(rate(cache_hits_total[5m])) + sum(rate(cache_misses_total[5m])),
38
- 1
39
- )
40
 
41
- # 2) Alerts (must come after recording rules)
42
- - name: nl2sql_alerts
43
- rules:
44
- # Success ratio < 90% for 10 minutes
45
- - alert: PipelineLowSuccessRatio
46
- expr: nl2sql:pipeline_success_ratio < 0.9
47
- for: 10m
48
- labels:
49
- severity: warning
50
- annotations:
51
- summary: "Pipeline success ratio dropped"
52
- description: "Success ratio < 90% over the past 10 minutes"
53
 
54
- # Generator p95 latency > 1.5s for 5 minutes
55
- - alert: GeneratorLatencyHigh
56
- expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
57
- for: 5m
58
- labels:
59
- severity: warning
60
- annotations:
61
- summary: "Generator p95 latency high"
62
- description: "Generator p95 > 1.5s for 5 minutes"
63
 
64
- # Safety blocks spike per minute (not per second)
65
- - alert: SafetyBlocksSpike
66
- expr: rate(safety_blocks_total[5m]) * 60 > 0.5
67
- for: 5m
68
- labels:
69
- severity: info
70
- annotations:
71
- summary: "Unusual Safety block rate"
72
- description: "Safety blocks > 0.5 per minute (5m window)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  groups:
2
  # 1) Recording rules (all derived metric calculations)
3
+ - name: nl2sql_derived
4
+ interval: 15s
5
+ rules:
6
+ # p95 latency per stage (ms) — NOTE: remove the "* 1000" if histogram buckets are already in milliseconds
7
+ - record: nl2sql:stage_p95_ms
8
+ expr: |
9
+ histogram_quantile(
10
+ 0.95,
11
+ sum by (le, stage) (rate(stage_duration_ms_bucket[5m]))
12
+ ) * 1000
13
 
14
+ # pipeline success ratio
15
+ - record: nl2sql:pipeline_success_ratio
16
+ expr: |
17
+ (
18
+ sum(rate(pipeline_runs_total{status="ok"}[5m]))
19
+ )
20
+ /
21
+ clamp_min(sum(rate(pipeline_runs_total[5m])), 1)
22
 
23
+ # repair success rate
24
+ - record: nl2sql:repair_success_rate
25
+ expr: |
26
+ (
27
+ sum(rate(repair_attempts_total{outcome="success"}[5m]))
28
+ )
29
+ /
30
+ clamp_min(sum(rate(repair_attempts_total{outcome="attempt"}[5m])), 1)
31
 
32
+ # cache hit ratio
33
+ - record: nl2sql:cache_hit_ratio
34
+ expr: |
35
+ (
36
+ sum(rate(cache_events_total{hit="true"}[5m]))
37
+ )
38
+ /
39
+ clamp_min(sum(rate(cache_events_total[5m])), 1)
 
 
 
40
 
41
+ # verifier events (split by ok) — NOTE: rate() over a 1m window is per-second; multiply by 60 for a true per-minute value
42
+ - record: nl2sql:verifier_events_per_min
43
+ expr: |
44
+ sum by (ok) (rate(verifier_checks_total[1m]))
 
 
 
 
 
 
 
 
45
 
46
+ # safety blocks — NOTE: rate() over a 1m window is per-second; multiply by 60 for a true per-minute value
47
+ - record: nl2sql:safety_blocks_per_min
48
+ expr: |
49
+ sum(rate(safety_blocks_total[1m]))
 
 
 
 
 
50
 
51
+ # combined safety + verifier failures — NOTE: rate() over a 1m window is per-second; multiply by 60 for a true per-minute value
52
+ - record: nl2sql:safety_verifier_events_per_min
53
+ expr: |
54
+ (
55
+ sum(rate(safety_blocks_total[1m]))
56
+ +
57
+ sum(rate(verifier_failures_total[1m]))
58
+ )
59
+
60
+ # 2) Alerts (must come after recording rules)
61
+ - name: nl2sql_alerts
62
+ rules:
63
+ # Success ratio < 90% for 10 minutes
64
+ - alert: PipelineLowSuccessRatio
65
+ expr: nl2sql:pipeline_success_ratio < 0.9
66
+ for: 10m
67
+ labels:
68
+ severity: warning
69
+ annotations:
70
+ summary: "Pipeline success ratio dropped"
71
+ description: "Success ratio < 90% over the past 10 minutes"
72
+
73
+ # Generator p95 latency > 1.5s for 5 minutes
74
+ - alert: GeneratorLatencyHigh
75
+ expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
76
+ for: 5m
77
+ labels:
78
+ severity: warning
79
+ annotations:
80
+ summary: "Generator p95 latency high"
81
+ description: "Generator p95 > 1.5s for 5 minutes"
82
+
83
+ # Safety blocks spike — per minute (not per second)
84
+ - alert: SafetyBlocksSpike
85
+ expr: rate(safety_blocks_total[5m]) * 60 > 0.5
86
+ for: 5m
87
+ labels:
88
+ severity: info
89
+ annotations:
90
+ summary: "Unusual Safety block rate"
91
+ description: "Safety blocks > 0.5 per minute (5m window)"