diff --git a/2-nomad-config/1-data/pgadmin.nomad.hcl b/2-nomad-config/1-data/pgadmin.nomad.hcl index 7cb4854..cdcddea 100644 --- a/2-nomad-config/1-data/pgadmin.nomad.hcl +++ b/2-nomad-config/1-data/pgadmin.nomad.hcl @@ -58,8 +58,10 @@ job "pgadmin" { ] check { + name = "alive" type = "http" path = "/" + port = "http" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/1-data/postgres.nomad.hcl b/2-nomad-config/1-data/postgres.nomad.hcl index f51ef77..9ac2b60 100644 --- a/2-nomad-config/1-data/postgres.nomad.hcl +++ b/2-nomad-config/1-data/postgres.nomad.hcl @@ -7,6 +7,10 @@ job "postgres" { connect { sidecar_service {} } + + # Note: TCP checks are not valid for Connect-enabled services (traffic runs through + # the Envoy sidecar). Postgres is a single-writer DB that we never canary, so + # observable health via Consul is a lower priority here than for other services. } task "postgres" { diff --git a/2-nomad-config/2-ingress/authelia.nomad.hcl b/2-nomad-config/2-ingress/authelia.nomad.hcl index 16f3bbf..c562e0b 100644 --- a/2-nomad-config/2-ingress/authelia.nomad.hcl +++ b/2-nomad-config/2-ingress/authelia.nomad.hcl @@ -33,8 +33,10 @@ job "authelia" { } check { + name = "alive" type = "http" path = "/health" + port = "http" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/2-ingress/traefik.nomad.hcl b/2-nomad-config/2-ingress/traefik.nomad.hcl index 90da4c5..e2e44ed 100644 --- a/2-nomad-config/2-ingress/traefik.nomad.hcl +++ b/2-nomad-config/2-ingress/traefik.nomad.hcl @@ -35,7 +35,8 @@ job "traefik" { check { name = "alive" - type = "tcp" + type = "http" + path = "/ping" port = "api" interval = "10s" timeout = "2s" @@ -85,6 +86,9 @@ api: dashboard: true insecure: true +ping: + entryPoint: traefik + providers: file: directory: "/etc/traefik/configs/" diff --git a/2-nomad-config/deluge.nomad.hcl b/2-nomad-config/deluge.nomad.hcl index cb4a625..eff983a 100644 --- a/2-nomad-config/deluge.nomad.hcl +++ b/2-nomad-config/deluge.nomad.hcl @@ -54,8 +54,10 @@ EOH ] check { 
+ name = "alive" type = "http" path = "/" + port = "http" interval = "10s" timeout = "2s" } @@ -78,8 +80,10 @@ EOH } check { + name = "alive" type = "http" path = "/" + port = "http" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/gitea.nomad.hcl b/2-nomad-config/gitea.nomad.hcl index 47db5bc..a9c1eca 100644 --- a/2-nomad-config/gitea.nomad.hcl +++ b/2-nomad-config/gitea.nomad.hcl @@ -32,8 +32,10 @@ job "gitea" { ] check { + name = "alive" type = "http" - path = "/" + path = "/api/healthz" + port = "http" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/glance.nomad.hcl b/2-nomad-config/glance.nomad.hcl index 13e2ee7..9285b6f 100644 --- a/2-nomad-config/glance.nomad.hcl +++ b/2-nomad-config/glance.nomad.hcl @@ -12,7 +12,7 @@ job "glance" { driver = "docker" config { - image = "glanceapp/glance:latest" + image = "glanceapp/glance:v0.7.8" ports = ["http"] volumes = [ "local/glance.yml:/app/config/glance.yml", @@ -30,7 +30,8 @@ job "glance" { check { name = "alive" - type = "tcp" + type = "http" + path = "/" port = "http" interval = "10s" timeout = "2s" diff --git a/2-nomad-config/jellyfin.nomad.hcl b/2-nomad-config/jellyfin.nomad.hcl index 9db474f..dc41d4e 100644 --- a/2-nomad-config/jellyfin.nomad.hcl +++ b/2-nomad-config/jellyfin.nomad.hcl @@ -28,10 +28,11 @@ job "jellyfin" { check { name = "alive" - type = "tcp" + type = "http" + path = "/health" port = "http" interval = "10s" - timeout = "2s" + timeout = "5s" } } diff --git a/2-nomad-config/ntfy.nomad.hcl b/2-nomad-config/ntfy.nomad.hcl index f5560a5..613857c 100644 --- a/2-nomad-config/ntfy.nomad.hcl +++ b/2-nomad-config/ntfy.nomad.hcl @@ -33,8 +33,10 @@ job "ntfy" { ] check { + name = "alive" type = "http" path = "/healthz" + port = "http" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/openreader.nomad.hcl b/2-nomad-config/openreader.nomad.hcl index 7383f34..beb8071 100644 --- a/2-nomad-config/openreader.nomad.hcl +++ b/2-nomad-config/openreader.nomad.hcl @@ -31,8 
+31,10 @@ job "openreader" { ] check { + name = "alive" type = "http" path = "/" + port = "http" interval = "10s" timeout = "2s" } @@ -48,8 +50,10 @@ job "openreader" { } check { + name = "alive" type = "http" path = "/" + port = "http" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/prowlarr.nomad.hcl b/2-nomad-config/prowlarr.nomad.hcl index 15c29c8..69630ab 100644 --- a/2-nomad-config/prowlarr.nomad.hcl +++ b/2-nomad-config/prowlarr.nomad.hcl @@ -34,8 +34,9 @@ job "prowlarr" { ] check { + name = "alive" type = "http" - path = "/" + path = "/ping" interval = "10s" timeout = "2s" } @@ -51,8 +52,9 @@ job "prowlarr" { } check { + name = "alive" type = "http" - path = "/" + path = "/ping" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/sonarr.nomad.hcl b/2-nomad-config/sonarr.nomad.hcl index e646a06..d5ca9dd 100644 --- a/2-nomad-config/sonarr.nomad.hcl +++ b/2-nomad-config/sonarr.nomad.hcl @@ -37,8 +37,9 @@ job "sonarr" { ] check { + name = "alive" type = "http" - path = "/" + path = "/ping" interval = "10s" timeout = "2s" } @@ -54,8 +55,9 @@ job "sonarr" { } check { + name = "alive" type = "http" - path = "/" + path = "/ping" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/transfer.nomad.hcl b/2-nomad-config/transfer.nomad.hcl index 8913752..5779ec9 100644 --- a/2-nomad-config/transfer.nomad.hcl +++ b/2-nomad-config/transfer.nomad.hcl @@ -16,8 +16,10 @@ job "transfer" { ] check { + name = "alive" type = "http" path = "/" + port = "http" interval = "10s" timeout = "2s" } diff --git a/2-nomad-config/unifi.nomad.hcl b/2-nomad-config/unifi.nomad.hcl index e0de2ed..7bbed4f 100644 --- a/2-nomad-config/unifi.nomad.hcl +++ b/2-nomad-config/unifi.nomad.hcl @@ -21,6 +21,26 @@ job "unifi-network" { UNIFI_STDOUT = "true" } + # Register in Consul so Traefik and health checks can find it. + # address_mode=driver uses the macvlan IP (192.168.1.50) rather than the host IP. 
+ service { + name = "unifi-network" + port = 8443 + address_mode = "driver" + + # TCP/HTTP checks from the Consul agent can't reach the macvlan IP (host↔macvlan + # isolation). Use a script check instead — it runs inside the container via docker + # exec against localhost:8443; curl -f makes HTTP 4xx/5xx responses fail the check. + check { + name = "alive" + type = "script" + command = "/usr/bin/curl" + args = ["-fsk", "--max-time", "5", "-o", "/dev/null", "https://localhost:8443"] + interval = "30s" + timeout = "10s" + } + } + volume_mount { volume = "unraid_appdata_unifi_network" destination = "/unifi" # Expected root directory (contains data, log, cert subdirs) diff --git a/cicd-plan.md b/cicd-plan.md index 98416b2..34c1c9f 100644 --- a/cicd-plan.md +++ b/cicd-plan.md @@ -188,8 +188,8 @@ Most jobs already have Consul health checks — these can use `health_check = "c | frigate | ✅ | ✅ `single-node-writer` | ⚠️ same — rolling | | glance | ✅ | no | ✅ yes | | transfer | ✅ | ✅ `single-node-writer` | ⚠️ rolling | -| openreader | ❌ | ✅ `single-node-writer` | ⚠️ add check first, then rolling | -| unifi | ❌ | ✅ `single-node-writer` | ⚠️ add check first, then rolling | +| openreader | ✅ `/` | ✅ `single-node-writer` | ⚠️ rolling | +| unifi | ✅ script | ✅ `single-node-writer` | ⚠️ rolling | | traefik | (ingress) | ✅ | ⚠️ rolling — downtime risk, promote quickly | | authelia | (ingress) | ✅ | ✅ stateless config, canary fine | | renovate | batch job | n/a | n/a — no deployment model | @@ -298,8 +298,8 @@ exit 1 - [x] **Phase 1c**: Add Nomad validate step — add `NOMAD_ADDR` + read-only `NOMAD_TOKEN` to Gitea secrets - [x] **Phase 2**: Add image pull validation step to the workflow - [ ] **Phase 3a**: Add `update` stanzas to ntfy and glance (simplest, no volume conflict) -- [ ] **Phase 3b**: Add rolling `update` stanzas to remaining service jobs (jellyfin, sonarr, etc.) 
-- [ ] **Phase 3c**: Add health checks to openreader and unifi before adding update stanzas +- [ ] **Phase 3b**: Add rolling `update` stanzas to remaining service jobs (jellyfin, sonarr, prowlarr, deluge, gitea, immich, transfer, frigate, openreader, unifi, authelia, traefik) +- [x] **Phase 3c**: Add health checks to openreader and unifi before adding update stanzas - [ ] **Phase 4a**: Add on-push workflow that runs `terraform apply -auto-approve` using full credential set - [ ] **Phase 4b**: Add deployment promotion/revert polling script - [ ] **Phase 4c**: Wire ntfy notifications for promote/revert outcomes