Initial commit — basic SLO eval working
This commit is contained in:
commit
79c18a58f2
4 changed files with 109 additions and 0 deletions
23
README.md
Normal file
23
README.md
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
# slo-tracker
|
||||
|
||||
Minimal CLI for tracking SLO burn rates from Prometheus.
|
||||
Outputs a quick summary to stdout — useful for morning standups and incident reviews.
|
||||
|
||||
Built for my own use. Tested on the Nexus prod setup.
|
||||
|
||||
## Usage
|
||||
|
||||
python slo_tracker.py --config slos.yml --prometheus http://localhost:9090 --window 7d
|
||||
|
||||
## Config format
|
||||
|
||||
slos:
|
||||
- name: "API availability"
|
||||
query: 'sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m]))'
|
||||
target: 0.999
|
||||
window: 30d
|
||||
|
||||
## Requirements
|
||||
|
||||
pip install -r requirements.txt  # pinned versions of prometheus-api-client, pyyaml, rich
|
||||
|
||||
3
requirements.txt
Normal file
3
requirements.txt
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
prometheus-api-client==0.5.4
|
||||
pyyaml==6.0.1
|
||||
rich==13.7.1
|
||||
64
slo_tracker.py
Normal file
64
slo_tracker.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Minimal SLO burn rate tracker for Prometheus."""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import yaml
|
||||
from prometheus_api_client import PrometheusConnect
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
# Module-level Rich console; main() prints the SLO status table through it.
console = Console()
|
||||
|
||||
|
||||
def load_config(path: str) -> dict:
    """Load the SLO configuration from a YAML file.

    Args:
        path: Filesystem path of the YAML config file.

    Returns:
        The parsed top-level mapping. An empty file (which YAML parses as
        ``None``) yields an empty dict so callers can always call ``.get()``
        on the result.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the top-level YAML node is not a mapping.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and
    # config files may contain non-ASCII text (e.g. in comments or names).
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    if config is None:
        return {}
    if not isinstance(config, dict):
        raise ValueError(
            f"{path}: expected a mapping at the top level, "
            f"got {type(config).__name__}"
        )
    return config
|
||||
|
||||
|
||||
def evaluate_slo(prom: PrometheusConnect, slo: dict, window: str) -> dict:
    """Run one SLO's query and compare the result with its target.

    Args:
        prom: Client used to run the query via ``custom_query``.
        slo: SLO definition. Requires ``name``, ``query`` and ``target``.
            An optional ``comparison`` key selects how the measured value
            is checked against the target: ``">="`` (default, "higher is
            better", e.g. availability ratios) or ``"<="`` ("lower is
            better", e.g. latency quantiles).
        window: Evaluation window requested by the caller. Currently unused:
            the query string is sent as-is, so its own range selectors
            determine the time span that is actually evaluated.

    Returns:
        A dict with keys ``name``, ``value`` (float, or ``None`` when the
        query returned no series), ``target`` and ``ok``. ``ok`` is ``False``
        when there is no data.

    Raises:
        KeyError: If a required key is missing from ``slo``.
        ValueError: If ``comparison`` is neither ``">="`` nor ``"<="``.
    """
    target = slo["target"]

    # Validate the direction before querying so a config typo fails fast.
    comparison = slo.get("comparison", ">=")
    if comparison not in (">=", "<="):
        raise ValueError(
            f"SLO {slo['name']!r}: unsupported comparison {comparison!r} "
            "(expected '>=' or '<=')"
        )

    result = prom.custom_query(slo["query"])
    if not result:
        # No series came back: report "no data" and treat it as not OK.
        return {"name": slo["name"], "value": None, "target": target, "ok": False}

    # Only the first returned series is considered; its sample is read as
    # a (timestamp, value-as-string) pair.
    value = float(result[0]["value"][1])
    if comparison == ">=":
        ok = value >= target
    else:
        ok = value <= target

    return {
        "name": slo["name"],
        "value": value,
        "target": target,
        "ok": ok,
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: evaluate every configured SLO and print a table.

    The Prometheus URL is resolved in this order: the ``--prometheus`` flag,
    the top-level ``prometheus`` key of the config file, then
    ``http://localhost:9090``.

    Exits with status 0 when every SLO is OK (or none are configured) and
    1 when at least one SLO is not OK or returned no data.
    """
    parser = argparse.ArgumentParser(description="SLO burn rate tracker")
    parser.add_argument("--config", required=True)
    # Default is None rather than the localhost URL so that an explicit flag
    # can be told apart from "not given" and the config file can supply it.
    parser.add_argument(
        "--prometheus",
        default=None,
        help="Prometheus base URL (default: 'prometheus' key in the config, "
        "else http://localhost:9090)",
    )
    parser.add_argument("--window", default="7d")
    args = parser.parse_args()

    # An empty YAML file parses to None; fall back to an empty config.
    cfg = load_config(args.config) or {}
    prometheus_url = (
        args.prometheus or cfg.get("prometheus") or "http://localhost:9090"
    )
    # NOTE(review): disable_ssl=True appears to turn off TLS certificate
    # verification in prometheus-api-client. Kept as-is for compatibility;
    # confirm this is intended before pointing the tool at an https endpoint.
    prom = PrometheusConnect(url=prometheus_url, disable_ssl=True)

    table = Table(title=f"SLO Status ({args.window} window)")
    table.add_column("SLO", style="cyan")
    table.add_column("Target", justify="right")
    table.add_column("Current", justify="right")
    table.add_column("Status", justify="center")

    all_ok = True
    # `slos:` with no entries parses to None, hence `or []` instead of a
    # .get() default (which only applies when the key is absent).
    for slo in cfg.get("slos") or []:
        # The CLI-level window is passed through; a per-SLO `window` key in
        # the config is not consulted here.
        result = evaluate_slo(prom, slo, args.window)
        status = "[green]OK[/green]" if result["ok"] else "[red]BURNING[/red]"
        value_str = f"{result['value']:.4f}" if result["value"] is not None else "n/a"
        table.add_row(result["name"], str(result["target"]), value_str, status)
        if not result["ok"]:
            all_ok = False

    console.print(table)
    # Non-zero exit lets shell scripts and CI react to a failing SLO.
    sys.exit(0 if all_ok else 1)


if __name__ == "__main__":
    main()
|
||||
19
slos.yml
Normal file
19
slos.yml
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# Example SLO config — replace with your Prometheus queries
|
||||
|
||||
prometheus: http://prometheus.nexus.local:9090
|
||||
|
||||
slos:
|
||||
- name: "API availability"
|
||||
query: 'sum(rate(http_requests_total{job="nexus-api",code!~"5.."}[5m])) / sum(rate(http_requests_total{job="nexus-api"}[5m]))'
|
||||
target: 0.999
|
||||
window: 30d
|
||||
|
||||
- name: "API latency p99 < 500ms"
|
||||
query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job="nexus-api"}[5m])) by (le))'
|
||||
target: 0.5
|
||||
window: 7d
|
||||
|
||||
- name: "Auth service availability"
|
||||
query: 'sum(rate(http_requests_total{job="nexus-auth",code!~"5.."}[5m])) / sum(rate(http_requests_total{job="nexus-auth"}[5m]))'
|
||||
target: 0.9999
|
||||
window: 30d
|
||||
Loading…
Reference in a new issue