stacks/monitoring/prometheus/rules/alerts.yml

# stacks/monitoring/prometheus/rules/alerts.yml
# Alert rules. Anything that should page belongs in severity 'critical';
# warnings route to email only (see alertmanager.yml).
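#
# The instance:* / service:* / job:* expressions below are recording
# rules, assumed to be defined in a sibling rules file (e.g. records.yml
# loaded by the same rule_files glob).
# Validate after editing: promtool check rules alerts.yml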

groups:
  - name: host
    rules:
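      # Two-tier disk alerting: warn at 85% (email), page at 95%.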
      - alert: HostDiskFillingUp
        expr: |
          instance:node_filesystem_used:ratio > 0.85
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Disk {{ $labels.mountpoint }} > 85% on {{ $labels.instance }}"
          description: "Filesystem usage {{ $value | humanizePercentage }}."

      - alert: HostDiskCritical
        expr: |
          instance:node_filesystem_used:ratio > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk critical on {{ $labels.instance }} ({{ $labels.mountpoint }})"

      - alert: HostHighCPU
        expr: |
          instance:node_cpu_usage:ratio_avg1m > 0.90
        for: 20m
        labels:
          severity: warning
        annotations:
          summary: "CPU > 90% for 20m on {{ $labels.instance }}"

      - alert: HostOutOfMemory
        expr: |
          instance:node_memory_used:ratio > 0.92
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Memory > 92% on {{ $labels.instance }}"

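      # Raw 'up' rather than a recording rule; 'for: 5m' absorbs brief
      # scrape hiccups before paging.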
      - alert: HostDown
        expr: up{job="node"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node exporter unreachable on {{ $labels.instance }}"

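  # Container metrics are assumed to come from cAdvisor, with the
  # 'service' label attached via relabeling (e.g. compose service name).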
  - name: containers
    rules:
      - alert: ContainerRestartLoop
        # Assumes the :rate10m recording rule is a per-second rate();
        # * 600 converts it to restarts over the 10m window.
        expr: service:container_restarts:rate10m * 600 > 3
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.service }} restarting repeatedly"
          description: "{{ $value }} restarts in last 10m."

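      # Fires when a service's RSS sits above 90% of its configured
      # memory limit for 15m.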
      - alert: ContainerHighMemory
        expr: |
          # Aggregate the limit so the match is one-to-one per service;
          # the '> 0' guard skips containers with no limit configured.
          service:container_memory_rss:bytes
            / on(service)
          max by(service) (container_spec_memory_limit_bytes{service!=""} > 0)
          > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.service }} > 90% of memory limit"

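  # Endpoints probed via the blackbox exporter. The avg5m recording rule
  # is assumed to keep the 'instance' label the summaries reference.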
  - name: http
    rules:
      - alert: EndpointDown
        expr: probe_success{job="blackbox_http"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Endpoint {{ $labels.instance }} failing health probe"

      - alert: EndpointSlow
        expr: job:blackbox_http_duration:avg5m > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Endpoint {{ $labels.instance }} slow ({{ $value }}s avg)"

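  # probe_ssl_earliest_cert_expiry is the Unix timestamp of the
  # soonest-expiring certificate in the served chain, hence the time()
  # arithmetic below (14 * 24 * 3600 = 14 days in seconds).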
  - name: tls
    rules:
      - alert: CertExpiringSoon
        expr: |
          probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Cert for {{ $labels.instance }} expires in < 14d"

      - alert: CertExpired
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Cert for {{ $labels.instance }} is expired"

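  # Self-monitoring catch-all: 'up == 0' across every job, warning-only
  # by design; page-worthy outages are covered by HostDown/EndpointDown.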
  - name: monitoring
    rules:
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Target {{ $labels.job }}/{{ $labels.instance }} missing"

      - alert: PrometheusRuleFailures
        expr: rate(prometheus_rule_evaluation_failures_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus rule evaluation failing"