scripts/health-check-all.sh

#!/usr/bin/env bash
# scripts/health-check-all.sh
# Iterate every stack and probe each service it exposes. Exits 0 when every
# check passes, 1 on any failure, and 2 when there are only warnings (so cron
# emails but does not alert-page).
#
# Docs: mercemay.top/src/homelab-compose/

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory holding this script, so the helper library
# is found regardless of the caller's working directory.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# The log_info / log_warn / log_err helpers used below are not defined in
# this file; they are expected to come from this library.
# shellcheck source=/dev/null
source "${HERE}/lib/log.sh"

# Service name -> URL probed by check(). One assignment per service keeps
# diffs small when an endpoint is added or removed.
declare -A TARGETS=()
TARGETS[auth]="https://auth.home.arpa"
TARGETS[jellyfin]="https://jellyfin.home.arpa/System/Info/Public"
TARGETS[gitea]="https://gitea.home.arpa/api/v1/version"
TARGETS[grafana]="https://grafana.home.arpa/api/health"
TARGETS[prometheus]="https://prometheus.home.arpa/-/healthy"
TARGETS[paperless]="https://paperless.home.arpa/api/"
TARGETS[immich]="https://immich.home.arpa/api/server-info/ping"
TARGETS[sonarr]="https://sonarr.home.arpa/ping"
TARGETS[radarr]="https://radarr.home.arpa/ping"
TARGETS[prowlarr]="https://prowlarr.home.arpa/ping"
TARGETS[bazarr]="https://bazarr.home.arpa/api/system/status"

# Running tallies updated by check(); they decide the exit status at the end.
warn=0 fail=0

#######################################
# Probe one service URL and classify the HTTP status it answers with.
# Globals:
#   fail - incremented when there is no response or the status is 5xx
#   warn - incremented for any other status outside the accepted set
# Arguments:
#   $1 - service name used in the log line
#   $2 - URL to request
# Outputs:
#   One line through log_info, log_warn or log_err.
#######################################
check() {
    local name="$1" url="$2"
    local code
    # curl's --write-out already prints "000" when no HTTP response arrived.
    # Echoing a fallback inside the command substitution would therefore
    # yield "000000" and misfile an outage as a warning, so the exit status
    # is tested out here instead. Any transport-level failure (DNS, connect,
    # TLS, timeout) is reported as no-response.
    if ! code=$(curl -k -s -o /dev/null -w '%{http_code}' --max-time 8 "${url}"); then
        code="000"
    fi
    # Defensive: anything that is not a three-digit status is no-response.
    [[ "${code}" =~ ^[0-9]{3}$ ]] || code="000"
    case "${code}" in
        # Redirects and 401 still prove the service is up and answering.
        200|204|301|302|401)
            log_info "OK   ${name} ${code}"
            ;;
        000)
            log_err "FAIL ${name} no-response"
            fail=$((fail + 1))
            ;;
        5??)
            log_err "FAIL ${name} ${code}"
            fail=$((fail + 1))
            ;;
        *)
            log_warn "WARN ${name} ${code}"
            warn=$((warn + 1))
            ;;
    esac
}

# Probe every configured endpoint; check() logs each outcome and updates the
# warn/fail tallies.
for svc in "${!TARGETS[@]}"; do
    check "${svc}" "${TARGETS[${svc}]}"
done

# Informational container-status listing for every stack directory.
log_info "docker compose ps summary:"
for stack in /srv/homelab/stacks/*/docker-compose.yml; do
    # When no stack matches, the glob stays literal; skip it rather than
    # trying to cd into a directory that does not exist.
    [[ -e "${stack}" ]] || continue
    # Under `set -e` a failing subshell would abort the script right here,
    # before the exit status is derived from the tallies. Record the problem
    # as a warning and carry on with the remaining stacks.
    if ! (
        cd "$(dirname "${stack}")" &&
        docker compose ps --format 'table {{.Service}}\t{{.Status}}'
    ); then
        log_warn "WARN compose-ps ${stack} failed"
        warn=$((warn + 1))
    fi
done

# Translate the tallies into the exit status. Failures take precedence:
#   0 - every check passed
#   2 - warnings only
#   1 - at least one failure
exit_code=0
if (( warn > 0 )); then
    exit_code=2
fi
if (( fail > 0 )); then
    exit_code=1
fi
exit "${exit_code}"