stacks/monitoring/exporters/smart-exporter.sh

#!/usr/bin/env bash
# stacks/monitoring/exporters/smart-exporter.sh
# Writes a snapshot of smartctl health per disk into a prom textfile.
# Meant to be called from smart-test.timer once an hour, not per scrape.
#
# Docs: mercemay.top/src/homelab-compose/

# Strict mode: abort on unhandled command failure, on use of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# TEXTFILE_DIR may be overridden from the environment; the default is used
# when it is unset or empty.
TEXTFILE_DIR="${TEXTFILE_DIR:-/var/lib/node_exporter/textfile}"
OUT="${TEXTFILE_DIR}/smart.prom"
# Scratch file lives next to OUT (same directory, PID suffix) so the final
# `mv` is a rename within one directory rather than a cross-filesystem copy.
TMP="${OUT}.$$"

# Remove the scratch file on every exit path. Without this, each aborted run
# (set -e, signal-induced exit, ...) leaves a stale smart.prom.<pid> behind in
# the textfile directory. After a successful `mv` the file no longer exists
# and `rm -f` is a no-op.
trap 'rm -f -- "${TMP}"' EXIT

mkdir -p "${TEXTFILE_DIR}"

# Build the list of devices to probe: one /dev/<name> entry per lsblk row
# whose TYPE column is "disk". mapfile replaces any previous contents of the
# array and strips the trailing newline from each entry (-t).
mapfile -t disks < <(
    lsblk -dn -o NAME,TYPE | awk '$2=="disk"{print "/dev/"$1}'
)

# Start the scratch file from scratch (the `>` redirect truncates or creates
# it) and write the HELP/TYPE metadata for every metric family up front; the
# per-disk samples are appended after this.
printf '%s\n' \
    '# HELP smart_device_health 1 if smartctl health PASSED' \
    '# TYPE smart_device_health gauge' \
    '# HELP smart_temperature_celsius current temperature' \
    '# TYPE smart_temperature_celsius gauge' \
    '# HELP smart_power_on_hours total power on hours' \
    '# TYPE smart_power_on_hours counter' \
    '# HELP smart_reallocated_sectors reallocated sector count' \
    '# TYPE smart_reallocated_sectors gauge' \
    '# HELP smart_pending_sectors pending reallocation sectors' \
    '# TYPE smart_pending_sectors gauge' \
    > "${TMP}"

# Escape a string for use inside a double-quoted Prometheus label value:
# backslash and double quote each get a leading backslash.
#   $1 - raw string
#   stdout - escaped string (no trailing newline)
prom_label_escape() {
    local s=$1
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    printf '%s' "${s}"
}

# Read `smartctl -A` output on stdin and print the integer value of the first
# row that matches the awk regex in $1 and carries a usable number.
# Prints nothing when no such row exists.
#   $1 - awk regular expression selecting the attribute row
smart_attr_value() {
    awk -v pat="$1" '
        $0 ~ pat {
            if ($1 ~ /^[0-9]+$/ && NF >= 10) {
                # ATA attribute table row: RAW_VALUE is column 10. Anything
                # after it is annotation, e.g. "38 (0 18 0 0 0)", and the
                # value itself may carry a suffix such as "9123h+12m+01s".
                raw = $10
            } else {
                # "Key: value" row (SCSI / NVMe): take what follows the first
                # colon and drop thousands separators ("1,234").
                raw = $0
                sub(/^[^:]*:[ \t]*/, "", raw)
                gsub(/,/, "", raw)
            }
            if (match(raw, /^[0-9]+/)) {
                print substr(raw, RSTART, RLENGTH)
                exit
            }
        }'
}

# Print all smart_* sample lines for one device on stdout.
#   $1 - device path, e.g. /dev/sda
# smart_device_health is always printed (0 unless the health report contains
# PASSED or OK); the other metrics are printed only when a value was found.
emit_disk_metrics() {
    local disk=$1
    local info health_out attrs model serial labels value i
    local health=0
    local -r passed_re='PASSED|OK'
    local -a names=(
        smart_temperature_celsius
        smart_power_on_hours
        smart_reallocated_sectors
        smart_pending_sectors
    )
    local -a patterns=(
        'Temperature_Celsius|Current Drive Temperature|^Temperature:'
        'Power_On_Hours|Power On Hours'
        'Reallocated_Sector_Ct|Reallocate.*Count'
        'Current_Pending_Sector'
    )

    # smartctl's exit status is a bitmask that is non-zero for failing disks,
    # logged errors or devices it cannot open. Under `set -e -o pipefail` an
    # unguarded `var=$(smartctl ... | awk ...)` would abort the whole run on
    # the first such disk, so capture the output once per report type and
    # deliberately ignore the status; empty output simply yields no metrics.
    info=$(smartctl -i "${disk}" 2>/dev/null) || true
    health_out=$(smartctl -H "${disk}" 2>/dev/null) || true
    attrs=$(smartctl -A "${disk}" 2>/dev/null) || true

    model=$(awk -F: '/Device Model|Model Number/ {gsub(/^ +/,"",$2); print $2; exit}' <<<"${info}")
    serial=$(awk -F: '/Serial Number/ {gsub(/^ +/,"",$2); print $2; exit}' <<<"${info}")
    model=${model:-unknown}
    serial=${serial:-unknown}
    # Spaces become underscores (as before); quotes/backslashes are escaped so
    # an odd model string cannot break the sample line.
    model=$(prom_label_escape "${model// /_}")
    serial=$(prom_label_escape "${serial// /_}")
    labels="device=\"${disk}\",model=\"${model}\",serial=\"${serial}\""

    if [[ "${health_out}" =~ ${passed_re} ]]; then
        health=1
    fi
    printf 'smart_device_health{%s} %s\n' "${labels}" "${health}"

    for i in "${!names[@]}"; do
        value=$(smart_attr_value "${patterns[i]}" <<<"${attrs}")
        if [[ -n "${value}" ]]; then
            printf '%s{%s} %s\n' "${names[i]}" "${labels}" "${value}"
        fi
    done
}

# The ${arr[@]+...} form keeps `set -u` from rejecting an empty disk list on
# bash releases older than 4.4.
for disk in ${disks[@]+"${disks[@]}"}; do
    emit_disk_metrics "${disk}" >> "${TMP}"
done

# Publish the snapshot. Set the final permissions on the scratch file first
# and rename afterwards, so the file at ${OUT} is never visible with
# umask-derived permissions (chmod after the rename left such a window).
chmod 0644 "${TMP}"
mv -f "${TMP}" "${OUT}"