scripts/upgrade.sh

#!/usr/bin/env bash
# scripts/upgrade.sh -- pull new images and restart services one stack at a
# time so only a single thing is down at once.
#
# I dropped watchtower in commit 5512de8 because it started a container
# update mid-dinner and immich broke for two hours. This script is the
# boring, manual replacement.
#
# Usage:
#   scripts/upgrade.sh               # every stack
#   scripts/upgrade.sh media         # just stacks/media
#   scripts/upgrade.sh --dry-run     # show what would change
#
# mercemay.top/src/homelab-compose/ documents the recovery path.

set -euo pipefail

# Run from the repository root when inside a git checkout; otherwise stay in
# the current directory. All stack paths below are relative to this location.
ROOT=$(git rev-parse --show-toplevel 2>/dev/null || pwd)
cd "$ROOT"

# DRY=1 means "report only"; SELECTED collects stack names given on the
# command line.
DRY=0
SELECTED=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h | --help)
      # Print the header comment of this script with the leading "# " removed.
      sed -n '2,14p' "$0" | sed 's/^# \{0,1\}//'
      exit 0
      ;;
    --dry-run)
      DRY=1
      ;;
    -*)
      echo "upgrade: unknown flag $1" >&2
      exit 2
      ;;
    *)
      SELECTED+=("$1")
      ;;
  esac
  shift
done

# No stacks named explicitly: use every direct subdirectory of stacks/,
# in sorted order.
if (( ${#SELECTED[@]} == 0 )); then
  mapfile -t SELECTED < <(
    find stacks -mindepth 1 -maxdepth 1 -type d -printf '%f\n' | sort
  )
fi

# Print a timestamped progress line to stdout.
# Arguments: all arguments are joined with spaces into a single message.
log() {
  local message="$*"
  printf '[upgrade %s] %s\n' "$(date +%H:%M:%S)" "$message"
}

# Pull the images for one stack.
# Globals:   DRY (read)
# Arguments: $1 - stack name (directory under stacks/)
# Returns:   0 when the stack has no compose.yml (it is skipped); otherwise
#            the status of the last docker command that ran.
pull_stack() {
  local name=$1
  local file="stacks/${name}/compose.yml"

  if [[ ! -f $file ]]; then
    log "skip $name (no compose.yml)"
    return
  fi

  log "pulling $name"

  if (( ! DRY )); then
    docker compose -f "$file" pull --quiet
    return
  fi

  # Dry run: ask compose for a dry-run pull; if that invocation fails, fall
  # back to just listing the images the stack references.
  if ! docker compose -f "$file" pull --quiet --dry-run 2>/dev/null; then
    docker compose -f "$file" config --images
  fi
}

# Recreate one stack's containers and wait for them to become healthy.
# Globals:   DRY (read)
# Arguments: $1 - stack name (directory under stacks/)
# Returns:   0 on success, on a dry run, or when the stack has no compose.yml
#            (skipped, the same way pull_stack skips it); non-zero when
#            'docker compose up' fails or wait_healthy reports a failure.
restart_stack() {
  local stack=$1
  local compose="stacks/$stack/compose.yml"
  # Explicit 0: a bare 'return' would hand back the failed test's status (1)
  # and the skipped stack would be reported as a restart failure.
  [[ -f $compose ]] || return 0

  log "recreating $stack"
  if (( DRY )); then
    log "  would run: docker compose -f $compose up -d --remove-orphans"
    return 0
  fi

  # This function is called from an 'if' condition, where 'set -e' has no
  # effect, so the status of 'up' must be checked by hand. Without this a
  # failed 'up' would be replaced by whatever wait_healthy returns.
  local rc=0
  docker compose -f "$compose" up -d --remove-orphans || rc=$?
  if (( rc != 0 )); then
    log "  docker compose up failed for $stack (exit $rc)"
    return "$rc"
  fi

  wait_healthy "$stack"
}

# Poll a stack until none of its containers reports an "unhealthy" or
# "starting" health status, or until the timeout expires. At least one check
# is always performed.
# Arguments: $1 - stack name (directory under stacks/)
#            $2 - optional timeout in seconds (default 120)
#            $3 - optional pause between checks in seconds (default 5)
# Returns:   0 once no container is unhealthy/starting; 1 on timeout or when
#            'docker compose ps' itself fails.
wait_healthy() {
  local stack=$1
  local timeout=${2:-120}
  local interval=${3:-5}
  local compose="stacks/$stack/compose.yml"
  local now deadline health pending

  now=$(date +%s)
  deadline=$(( now + timeout ))

  while true; do
    # Capture the ps output on its own so a docker failure is seen here
    # instead of being folded into a "0 failing containers" result.
    if ! health=$(docker compose -f "$compose" ps --format '{{.Health}}'); then
      log "  $stack: 'docker compose ps' failed, cannot determine health"
      return 1
    fi

    # grep -c prints 0 but exits 1 when nothing matches, hence '|| true'.
    # Only lines matching the pattern are counted, so an empty health field
    # passes (presumably a container with no healthcheck -- confirm).
    pending=$(grep -c -E 'unhealthy|starting' <<<"$health" || true)
    if (( pending == 0 )); then
      log "  $stack healthy"
      return 0
    fi

    now=$(date +%s)
    (( now < deadline )) || break
    sleep "$interval"
  done

  log "  $stack still unhealthy after ${timeout}s, check with 'docker compose -f $compose ps'"
  return 1
}

# Nothing selected (for example an empty or missing stacks/ directory): say
# so instead of claiming success. This also keeps the loops below from
# expanding an empty array, which trips 'set -u' on older bash releases.
if [[ ${#SELECTED[@]} -eq 0 ]]; then
  log "no stacks found under stacks/, nothing to do"
  exit 1
fi

# Pull everything first so one slow download does not block the first
# restart. Each pull is independent so a failure in one stack should not
# stop the rest; we do care about restart failures.
for s in "${SELECTED[@]}"; do
  pull_stack "$s" || log "pull failed for $s, continuing"
done

# Restart the stacks one after another; remember the ones that fail so every
# stack still gets its turn and the failures are reported together.
FAILED=()
for s in "${SELECTED[@]}"; do
  if ! restart_stack "$s"; then
    FAILED+=("$s")
  fi
done

if [[ ${#FAILED[@]} -gt 0 ]]; then
  log "failed: ${FAILED[*]}"
  exit 1
fi

# A dry run changes nothing, so do not report it as an upgrade.
if (( DRY )); then
  log "dry run complete, nothing was changed"
else
  log "all selected stacks upgraded"
fi