Initial commit — basic SLO eval working
This commit is contained in:
commit
79c18a58f2
4 changed files with 109 additions and 0 deletions
23
README.md
Normal file
23
README.md
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
# slo-tracker
|
||||||
|
|
||||||
|
Minimal CLI for tracking SLO burn rates from Prometheus.
|
||||||
|
Outputs a quick summary to stdout — useful for morning standups and incident reviews.
|
||||||
|
|
||||||
|
Built for my own use. Tested on the Nexus prod setup.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
python slo_tracker.py --config slos.yml --window 7d
|
||||||
|
|
||||||
|
## Config format
|
||||||
|
|
||||||
|
slos:
|
||||||
|
- name: "API availability"
|
||||||
|
query: 'sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m]))'
|
||||||
|
target: 0.999
|
||||||
|
window: 30d
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
pip install prometheus-api-client pyyaml rich
|
||||||
|
|
||||||
3
requirements.txt
Normal file
3
requirements.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
prometheus-api-client==0.5.4
|
||||||
|
pyyaml==6.0.1
|
||||||
|
rich==13.7.1
|
||||||
64
slo_tracker.py
Normal file
64
slo_tracker.py
Normal file
|
|
@ -0,0 +1,64 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Minimal SLO burn rate tracker for Prometheus."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from prometheus_api_client import PrometheusConnect
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
# Module-level Rich console; main() prints the results table through it.
console = Console()
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(path: str) -> dict:
    """Load the SLO configuration from a YAML file.

    Args:
        path: Filesystem path of the YAML configuration file.

    Returns:
        The parsed top-level mapping. An empty file yields ``{}`` rather
        than ``None`` so callers can use ``.get`` without a guard.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the top-level YAML node is not a mapping.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # which differs between platforms.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty (or comment-only) document.
    if data is None:
        return {}
    if not isinstance(data, dict):
        raise ValueError(
            f"{path}: expected a mapping at the top level, "
            f"got {type(data).__name__}"
        )
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_slo(prom: PrometheusConnect, slo: dict, window: str) -> dict:
    """Run one SLO query and compare the current value against its target.

    Args:
        prom: Client used to run the query; only ``custom_query`` is called.
        slo: SLO definition with the keys ``name``, ``query`` and ``target``.
            An optional ``comparison`` key selects how the value is checked:
            ``">="`` (the default) means the SLO is met when the value is at
            least the target, which suits availability ratios; ``"<="`` means
            it is met when the value is at most the target, which suits
            latency-style objectives.
        window: Currently unused by the evaluation; kept so existing callers
            keep working.

    Returns:
        A dict with the keys ``name``, ``value``, ``target`` and ``ok``.
        ``value`` is ``None`` and ``ok`` is ``False`` when the query returns
        no samples.

    Raises:
        KeyError: If ``slo`` lacks ``name``, ``query`` or ``target``.
        ValueError: If ``comparison`` is neither ``">="`` nor ``"<="``.
    """
    # Validate the definition before doing any network round-trip.
    comparison = slo.get("comparison", ">=")
    if comparison not in (">=", "<="):
        raise ValueError(
            f"SLO {slo['name']!r}: comparison must be '>=' or '<=', "
            f"got {comparison!r}"
        )
    target = slo["target"]

    result = prom.custom_query(slo["query"])
    if not result:
        # No samples came back: report the SLO as failing, with no value.
        return {"name": slo["name"], "value": None, "target": target, "ok": False}

    # Only the first returned series is considered; its sample is indexed as
    # [timestamp, value], and the value is converted with float().
    value = float(result[0]["value"][1])
    ok = value >= target if comparison == ">=" else value <= target
    return {
        "name": slo["name"],
        "value": value,
        "target": target,
        "ok": ok,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Evaluate every configured SLO, print a status table, and exit.

    Command-line options:
        --config      Path to the YAML config file (required).
        --prometheus  Prometheus base URL. When omitted, the ``prometheus``
                      key of the config file is used, falling back to
                      ``http://localhost:9090``.
        --window      Window label shown in the table title (default ``7d``).

    The process exits with status 0 when every SLO is OK and 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="SLO burn rate tracker")
    parser.add_argument("--config", required=True)
    parser.add_argument(
        "--prometheus",
        default=None,
        help="Prometheus URL (default: 'prometheus' key in the config, "
             "else http://localhost:9090)",
    )
    parser.add_argument("--window", default="7d")
    args = parser.parse_args()

    # An empty config file parses to None; treat it as "no SLOs configured".
    cfg = load_config(args.config) or {}

    # Precedence: explicit flag, then the config file, then the local default.
    prom_url = args.prometheus or cfg.get("prometheus") or "http://localhost:9090"
    # NOTE(review): disable_ssl=True turns off TLS certificate verification.
    # Kept for backward compatibility; confirm it is wanted for https targets.
    prom = PrometheusConnect(url=prom_url, disable_ssl=True)

    table = Table(title=f"SLO Status ({args.window} window)")
    table.add_column("SLO", style="cyan")
    table.add_column("Target", justify="right")
    table.add_column("Current", justify="right")
    table.add_column("Status", justify="center")

    all_ok = True
    # "slos:" with no entries parses to None, so fall back to an empty list.
    for slo in cfg.get("slos") or []:
        result = evaluate_slo(prom, slo, args.window)
        status = "[green]OK[/green]" if result["ok"] else "[red]BURNING[/red]"
        value_str = f"{result['value']:.4f}" if result["value"] is not None else "n/a"
        table.add_row(result["name"], str(result["target"]), value_str, status)
        if not result["ok"]:
            all_ok = False

    console.print(table)
    # A non-zero exit status lets callers detect a failing SLO.
    sys.exit(0 if all_ok else 1)


if __name__ == "__main__":
    main()
|
||||||
19
slos.yml
Normal file
19
slos.yml
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
# Example SLO config — replace with your Prometheus queries
|
||||||
|
|
||||||
|
prometheus: http://prometheus.nexus.local:9090
|
||||||
|
|
||||||
|
slos:
|
||||||
|
- name: "API availability"
|
||||||
|
query: 'sum(rate(http_requests_total{job="nexus-api",code!~"5.."}[5m])) / sum(rate(http_requests_total{job="nexus-api"}[5m]))'
|
||||||
|
target: 0.999
|
||||||
|
window: 30d
|
||||||
|
|
||||||
|
- name: "API latency p99 < 500ms"
|
||||||
|
query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job="nexus-api"}[5m])) by (le))'
|
||||||
|
target: 0.5
|
||||||
|
window: 7d
|
||||||
|
|
||||||
|
- name: "Auth service availability"
|
||||||
|
query: 'sum(rate(http_requests_total{job="nexus-auth",code!~"5.."}[5m])) / sum(rate(http_requests_total{job="nexus-auth"}[5m]))'
|
||||||
|
target: 0.9999
|
||||||
|
window: 30d
|
||||||
Loading…
Reference in a new issue