Initial commit — basic SLO eval working

This commit is contained in:
Oliver Neumann 2025-05-19 08:30:00 +00:00
commit 79c18a58f2
4 changed files with 109 additions and 0 deletions

23
README.md Normal file
View file

@ -0,0 +1,23 @@
# slo-tracker
Minimal CLI for tracking SLO burn rates from Prometheus.
Outputs a quick summary to stdout — useful for morning standups and incident reviews.
Built for my own use. Tested on the Nexus prod setup.
## Usage
python slo_tracker.py --config slos.yml --window 7d
## Config format
slos:
- name: "API availability"
query: 'sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m]))'
target: 0.999
window: 30d
## Requirements
pip install prometheus-api-client pyyaml rich

3
requirements.txt Normal file
View file

@ -0,0 +1,3 @@
prometheus-api-client==0.5.4
pyyaml==6.0.1
rich==13.7.1

64
slo_tracker.py Normal file
View file

@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Minimal SLO burn rate tracker for Prometheus."""
import argparse
import sys
from datetime import datetime, timedelta
import yaml
from prometheus_api_client import PrometheusConnect
from rich.console import Console
from rich.table import Table
# Module-level rich console; main() prints the SLO status table through it.
console = Console()
def load_config(path: str) -> dict:
    """Load the SLO configuration from a YAML file.

    Args:
        path: Filesystem path of the YAML config file.

    Returns:
        The parsed top-level mapping. An empty file yields an empty dict so
        callers can use ``.get`` without a ``None`` check.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the top-level YAML node is not a mapping.
    """
    # Explicit encoding: the platform default is locale-dependent and can
    # mis-decode non-ASCII SLO names.
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    if cfg is None:
        # safe_load returns None for an empty document.
        return {}
    if not isinstance(cfg, dict):
        raise ValueError(
            f"{path}: expected a YAML mapping at the top level, "
            f"got {type(cfg).__name__}"
        )
    return cfg
def evaluate_slo(prom: "PrometheusConnect", slo: dict, window: str) -> dict:
    """Run one SLO query and judge the result against its target.

    Args:
        prom: Client used to run the query; only ``custom_query`` is called.
        slo: SLO definition with ``name``, ``query`` and ``target`` keys.
            An optional ``comparison`` key selects how the value is judged:
            ``">="`` (default) passes when the value is at or above the
            target (e.g. availability ratios); ``"<="`` passes when the value
            is at or below the target (e.g. latency upper bounds).
        window: Accepted for interface compatibility.
            NOTE(review): not applied here -- the query string is sent to
            Prometheus unchanged; confirm whether it should be.

    Returns:
        A dict with ``name``, ``value`` (float, or ``None`` when the query
        returned nothing), ``target`` and ``ok`` (bool). An empty query
        result is always reported as not ok.

    Raises:
        ValueError: If ``comparison`` is neither ``">="`` nor ``"<="``.
    """
    comparison = slo.get("comparison", ">=")
    if comparison not in (">=", "<="):
        # Fail before hitting the network on a config typo.
        raise ValueError(
            f"SLO {slo.get('name')!r}: unsupported comparison {comparison!r} "
            "(expected '>=' or '<=')"
        )

    target = slo["target"]
    samples = prom.custom_query(slo["query"])
    if not samples:
        return {"name": slo["name"], "value": None, "target": target, "ok": False}

    # Only the first returned series is considered; each sample's "value" is
    # a [timestamp, "<number as string>"] pair.
    value = float(samples[0]["value"][1])
    ok = value >= target if comparison == ">=" else value <= target
    return {
        "name": slo["name"],
        "value": value,
        "target": target,
        "ok": ok,
    }
def main():
    """Evaluate every configured SLO, print a status table, and exit.

    The Prometheus URL is resolved in this order: the ``--prometheus`` flag,
    the ``prometheus`` key of the config file, then ``http://localhost:9090``.

    Exits with status 0 when every SLO is ok and 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="SLO burn rate tracker")
    parser.add_argument("--config", required=True, help="path to the SLO YAML config")
    parser.add_argument(
        "--prometheus",
        default=None,
        help="Prometheus base URL (default: 'prometheus' key in the config, "
        "else http://localhost:9090)",
    )
    parser.add_argument("--window", default="7d", help="window label shown in the table title")
    args = parser.parse_args()

    # An empty config file parses to None; treat it as "no SLOs configured".
    cfg = load_config(args.config) or {}

    # Honor the config's `prometheus` key, which was otherwise ignored.
    prometheus_url = args.prometheus or cfg.get("prometheus") or "http://localhost:9090"
    # NOTE(review): disable_ssl=True turns off TLS certificate verification;
    # harmless for http:// URLs but worth revisiting for https:// endpoints.
    prom = PrometheusConnect(url=prometheus_url, disable_ssl=True)

    table = Table(title=f"SLO Status ({args.window} window)")
    table.add_column("SLO", style="cyan")
    table.add_column("Target", justify="right")
    table.add_column("Current", justify="right")
    table.add_column("Status", justify="center")

    all_ok = True
    # `slos:` with no entries parses to None, hence the `or []`.
    for slo in cfg.get("slos") or []:
        result = evaluate_slo(prom, slo, args.window)
        status = "[green]OK[/green]" if result["ok"] else "[red]BURNING[/red]"
        value_str = f"{result['value']:.4f}" if result["value"] is not None else "n/a"
        table.add_row(result["name"], str(result["target"]), value_str, status)
        if not result["ok"]:
            all_ok = False

    console.print(table)
    # A non-zero exit status lets scripts and CI detect a failing SLO.
    sys.exit(0 if all_ok else 1)


if __name__ == "__main__":
    main()

19
slos.yml Normal file
View file

@ -0,0 +1,19 @@
# Example SLO config — replace with your Prometheus queries
prometheus: http://prometheus.nexus.local:9090
slos:
- name: "API availability"
query: 'sum(rate(http_requests_total{job="nexus-api",code!~"5.."}[5m])) / sum(rate(http_requests_total{job="nexus-api"}[5m]))'
target: 0.999
window: 30d
- name: "API latency p99 < 500ms"
query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job="nexus-api"}[5m])) by (le))'
target: 0.5
window: 7d
- name: "Auth service availability"
query: 'sum(rate(http_requests_total{job="nexus-auth",code!~"5.."}[5m])) / sum(rate(http_requests_total{job="nexus-auth"}[5m]))'
target: 0.9999
window: 30d