Move frigate into the cluster and enable GPU detector

2026-03-28 17:10:23 +11:00
parent 68cf58ead5
commit d22ea96879
10 changed files with 386 additions and 31 deletions
--- a/2-nomad-config/codeproject-ai.nomad.hcl
+++ b/2-nomad-config/codeproject-ai.nomad.hcl
@@ -0,0 +1,97 @@
+job "codeproject-ai" {
+  # CodeProject.AI (CPAI) server. Pinned to N150 LattePanda nodes for their Intel UHD iGPU (OpenVINO) and dedicated CPU headroom.
+  # The node class matched below is set via the node.tf/configuration.nix templates in 1-nixos-node.
+  constraint {
+    attribute = "${node.class}"
+    value     = "latte-panda-n150"
+  }
+
+  group "codeproject-ai" {
+    count = 1
+
+    network {
+      port "http" {
+        to = 32168
+      }
+    }
+
+    task "codeproject-ai" {
+      driver = "docker"
+
+      config {
+        image = "codeproject/ai-server:latest"
+        ports = ["http"]
+
+        # Pass the host's /dev/dri (Intel iGPU render nodes) into the container so CPAI
+        # can accelerate inference via OpenVINO on the N150's Intel UHD Graphics.
+        # Requires hardware.graphics.enable = true in the NixOS node config
+        # (added automatically when node_class = "latte-panda-n150").
+        devices = [
+          {
+            host_path      = "/dev/dri"
+            container_path = "/dev/dri"
+          }
+        ]
+      }
+
+      service {
+        name = "codeproject-ai"
+        port = "http"
+
+        tags = [
+          "traefik.enable=true",
+          # Auth-gated user-facing UI at codeproject-ai.othrayte.one (generated by defaultRule).
+          "traefik.http.routers.codeproject-ai.middlewares=auth@file",
+          # No-auth bypass for external Frigate access (no middleware tag on this router; the hard-to-guess host is its only gate).
+          # Temporary until Frigate runs in the cluster and uses Consul DNS directly - NOTE(review): confirm it is still needed.
+          "traefik.http.routers.codeproject-ai-token.rule=Host(`c3ll7nbevl5j4j8rcnfxnr95q48fuayz-codeproject-ai.othrayte.one`)",
+        ]
+
+        check {
+          name     = "alive"
+          type     = "http"
+          path     = "/v1/server/status/ping"
+          method   = "GET"
+          port     = "http"
+          interval = "10s"
+          timeout  = "5s"
+        }
+      }
+
+      env {
+        TZ = "Australia/Melbourne"
+      }
+
+      # Persistent storage for downloaded AI modules and their models.
+      # On first start CPAI will download ~1-2 GB of YOLOv5/MobileNet weights
+      # into this volume; subsequent restarts reuse the cached models.
+      volume_mount {
+        volume      = "unraid_appdata_codeproject_ai"
+        destination = "/etc/codeproject/ai"
+        read_only   = false
+      }
+
+      resources {
+        # ~56% of the N150's 7200 MHz allocation - enough for concurrent
+        # object detection requests without starving other jobs on the node.
+        cpu = 4000
+        # YOLOv5-6.2 (default detection module) needs ~900 MB;
+        # leave headroom for a second module (e.g. face detection) and the
+        # CPAI process itself.
+        memory = 3072
+      }
+    }
+
+    volume "unraid_appdata_codeproject_ai" {
+      type            = "csi"
+      read_only       = false
+      source          = "unraid_appdata_codeproject_ai"
+      access_mode     = "single-node-writer"
+      attachment_mode = "file-system"
+
+      mount_options {
+        mount_flags = ["uid=1000", "gid=1000"] # presumably the uid/gid the CPAI container writes as - TODO confirm
+      }
+    }
+  }
+}