# stacks/monitoring/prometheus/rules/alerts.yml
# Alert rules. Anything that pages should be in 'critical' severity.
# Warnings route to email only (see alertmanager.yml).
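#
# For reference, the routing this assumes (a sketch only; the receiver names
# "email" and "pager" are placeholders - the real tree lives in alertmanager.yml
# and may differ):
#   route:
#     receiver: email
#     routes:
#       - matchers:
#           - severity = critical
#         receiver: pager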
groups:
  - name: host
    rules:
      - alert: HostDiskFillingUp
        expr: |
          instance:node_filesystem_used:ratio > 0.85
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Disk {{ $labels.mountpoint }} > 85% on {{ $labels.instance }}"
          description: "Filesystem usage {{ $value | humanizePercentage }}."
      - alert: HostDiskCritical
        expr: |
          instance:node_filesystem_used:ratio > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk critical on {{ $labels.instance }} ({{ $labels.mountpoint }})"
      - alert: HostHighCPU
        expr: |
          instance:node_cpu_usage:ratio_avg1m > 0.90
        for: 20m
        labels:
          severity: warning
        annotations:
          summary: "CPU > 90% for 20m on {{ $labels.instance }}"
      - alert: HostOutOfMemory
        expr: |
          instance:node_memory_used:ratio > 0.92
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Memory > 92% on {{ $labels.instance }}"
      - alert: HostDown
        expr: up{job="node"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node exporter unreachable on {{ $labels.instance }}"
  - name: containers
    rules:
      - alert: ContainerRestartLoop
        expr: service:container_restarts:rate10m > 3
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.service }} restarting repeatedly"
          description: "{{ $value }} restarts in last 10m."
      - alert: ContainerHighMemory
        # Aggregating the limit per service collapses per-container series so
        # the on(service) match is one-to-one; the > 0 filter drops zero-valued
        # limits (no limit set), which would otherwise make the ratio infinite.
        expr: |
          service:container_memory_rss:bytes
            / on(service)
          max by (service) (container_spec_memory_limit_bytes{service!=""} > 0)
          > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.service }} > 90% of memory limit"
  - name: http
    rules:
      - alert: EndpointDown
        expr: probe_success{job="blackbox_http"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Endpoint {{ $labels.instance }} failing health probe"
      - alert: EndpointSlow
        expr: job:blackbox_http_duration:avg5m > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Endpoint {{ $labels.instance }} slow ({{ $value }}s avg)"
  - name: tls
    rules:
      - alert: CertExpiringSoon
        expr: |
          probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Cert for {{ $labels.instance }} expires in < 14d"
      - alert: CertExpired
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Cert for {{ $labels.instance }} is expired"
  - name: monitoring
    rules:
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Target {{ $labels.job }}/{{ $labels.instance }} missing"
      - alert: PrometheusRuleFailures
        expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus rule evaluation failing"