From 82f97308d6b98fd2a79980b72e6c069a1ff9abc7 Mon Sep 17 00:00:00 2001
From: Taylor Mutch <taylormutch@gmail.com>
Date: Tue, 5 May 2026 14:04:51 -0700
Subject: [PATCH 1/8] test(e2e): Add a Helm-specific e2e harness and linting
 workflow

---
 .agents/skills/helm-dev-environment/SKILL.md  |  14 +-
 .github/workflows/helm-lint.yml               |  57 ++++
 deploy/helm/openshell/.helmignore             |   6 +-
 .../{ => ci}/values-cert-manager.yaml         |   2 +-
 .../openshell/{ => ci}/values-gateway.yaml    |   2 +-
 .../openshell/{ => ci}/values-keycloak.yaml   |   2 +-
 .../openshell/{ => ci}/values-skaffold.yaml   |   0
 .../openshell/ci/values-tls-disabled.yaml     |  10 +
 deploy/helm/openshell/skaffold.yaml           |  10 +-
 tasks/helm.toml                               |  36 ++-
 tasks/scripts/helm-e2e.sh                     | 291 ++++++++++++++++++
 11 files changed, 408 insertions(+), 22 deletions(-)
 create mode 100644 .github/workflows/helm-lint.yml
 rename deploy/helm/openshell/{ => ci}/values-cert-manager.yaml (84%)
 rename deploy/helm/openshell/{ => ci}/values-gateway.yaml (92%)
 rename deploy/helm/openshell/{ => ci}/values-keycloak.yaml (95%)
 rename deploy/helm/openshell/{ => ci}/values-skaffold.yaml (100%)
 create mode 100644 deploy/helm/openshell/ci/values-tls-disabled.yaml
 create mode 100755 tasks/scripts/helm-e2e.sh

diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md
index 986ce1490..0db44f1f8 100644
--- a/.agents/skills/helm-dev-environment/SKILL.md
+++ b/.agents/skills/helm-dev-environment/SKILL.md
@@ -63,7 +63,7 @@ The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or
 
 ### TLS behaviour
 
-`values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run
+`ci/values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run
 plaintext by default. To test with TLS enabled, comment out that line and redeploy.
 
 | Mode | `server.disableTls` | Gateway scheme |
@@ -160,7 +160,7 @@ imports the openshell realm from `scripts/keycloak-realm.json`, and prints a por
 command for acquiring tokens from the CLI.
 
 Then activate OIDC in the OpenShell Helm chart:
-1. Uncomment `#- values-keycloak.yaml` in `skaffold.yaml`
+1. Uncomment `#- ci/values-keycloak.yaml` in `skaffold.yaml`
 2. Redeploy: `mise run helm:skaffold:run`
 
 To remove Keycloak:
@@ -191,10 +191,12 @@ mise run helm:k3s:status
 |------|---------|
 | `deploy/helm/openshell/skaffold.yaml` | Skaffold config — images, Helm releases, values overlays |
 | `deploy/helm/openshell/values.yaml` | Default Helm values |
-| `deploy/helm/openshell/values-skaffold.yaml` | Dev overrides (image pull policy, local image names) |
-| `deploy/helm/openshell/values-cert-manager.yaml` | cert-manager TLS overlay (opt-in; disables pkiInitJob) |
-| `deploy/helm/openshell/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay |
-| `deploy/helm/openshell/values-keycloak.yaml` | Keycloak OIDC overlay |
+| `deploy/helm/openshell/ci/values-skaffold.yaml` | Dev overrides (image pull policy, TLS disabled for local Skaffold) |
+| `deploy/helm/openshell/ci/values-cert-manager.yaml` | cert-manager PKI overlay (opt-in; disables pkiInitJob) |
+| `deploy/helm/openshell/ci/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay |
+| `deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay |
+| `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) |
 | `deploy/kube/manifests/envoy-gateway-openshell.yaml` | GatewayClass for Envoy Gateway (`mise run helm:gateway:apply`) |
 | `tasks/scripts/helm-k3s-local.sh` | k3d cluster create/delete/start/stop/status |
+| `tasks/scripts/helm-e2e.sh` | Bootstrap k3d cluster and run Rust + Python e2e via Helm |
 | `tasks/scripts/keycloak-k8s-setup.sh` | Keycloak deploy + realm import |
diff --git a/.github/workflows/helm-lint.yml b/.github/workflows/helm-lint.yml
new file mode 100644
index 000000000..8b7184133
--- /dev/null
+++ b/.github/workflows/helm-lint.yml
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+name: Helm Lint
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+    paths:
+      - "deploy/helm/**"
+  workflow_dispatch:
+
+env:
+  MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+permissions:
+  contents: read
+  packages: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  pr_metadata:
+    name: Resolve PR metadata
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: read
+    outputs:
+      should_run: ${{ steps.gate.outputs.should_run }}
+    steps:
+      - uses: actions/checkout@v6
+
+      - id: gate
+        uses: ./.github/actions/pr-gate
+
+  helm-lint:
+    name: Helm Lint
+    needs: pr_metadata
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    runs-on: linux-amd64-cpu8
+    container:
+      image: ghcr.io/nvidia/openshell/ci:latest
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install tools
+        run: mise install --locked
+
+      - name: Lint Helm chart
+        run: mise run helm:lint
diff --git a/deploy/helm/openshell/.helmignore b/deploy/helm/openshell/.helmignore
index 798d0e7c8..a12325802 100644
--- a/deploy/helm/openshell/.helmignore
+++ b/deploy/helm/openshell/.helmignore
@@ -19,8 +19,4 @@
 
 # Ignore development files
 skaffold.yaml
-values-keycloak.yaml
-values-ingress.yaml
-values-gateway.yaml
-values-cert-manager.yaml
-values-skaffold.yaml
+ci/
diff --git a/deploy/helm/openshell/values-cert-manager.yaml b/deploy/helm/openshell/ci/values-cert-manager.yaml
similarity index 84%
rename from deploy/helm/openshell/values-cert-manager.yaml
rename to deploy/helm/openshell/ci/values-cert-manager.yaml
index bb024d716..ed99c8b46 100644
--- a/deploy/helm/openshell/values-cert-manager.yaml
+++ b/deploy/helm/openshell/ci/values-cert-manager.yaml
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Merge after values.yaml when cert-manager CRDs are installed, e.g.:
-#   helm install ... -f values.yaml -f values-cert-manager.yaml
+#   helm install ... -f values.yaml -f ci/values-cert-manager.yaml
 # Or add this file to skaffold manifests.helm.releases[].valuesFiles.
 server:
   disableTls: false
diff --git a/deploy/helm/openshell/values-gateway.yaml b/deploy/helm/openshell/ci/values-gateway.yaml
similarity index 92%
rename from deploy/helm/openshell/values-gateway.yaml
rename to deploy/helm/openshell/ci/values-gateway.yaml
index c43a4cd45..196192213 100644
--- a/deploy/helm/openshell/values-gateway.yaml
+++ b/deploy/helm/openshell/ci/values-gateway.yaml
@@ -5,7 +5,7 @@
 #
 # Requires Envoy Gateway in the cluster (installed via skaffold.yaml).
 # Add this file to the openshell release valuesFiles to activate:
-#   uncomment values-gateway.yaml in deploy/helm/openshell/skaffold.yaml
+#   uncomment ci/values-gateway.yaml in deploy/helm/openshell/skaffold.yaml
 #
 # Envoy Gateway will create an Envoy proxy Deployment and a LoadBalancer
 # Service (named envoy-<namespace>-<gateway-name>-*) in the openshell namespace.
diff --git a/deploy/helm/openshell/values-keycloak.yaml b/deploy/helm/openshell/ci/values-keycloak.yaml
similarity index 95%
rename from deploy/helm/openshell/values-keycloak.yaml
rename to deploy/helm/openshell/ci/values-keycloak.yaml
index 42bb2ad4e..cc6ca658b 100644
--- a/deploy/helm/openshell/values-keycloak.yaml
+++ b/deploy/helm/openshell/ci/values-keycloak.yaml
@@ -8,7 +8,7 @@
 #
 # Then layer this file on top of values.yaml when deploying:
 #   helm upgrade --install openshell . \
-#     -f values.yaml -f values-skaffold.yaml -f values-keycloak.yaml
+#     -f values.yaml -f ci/values-skaffold.yaml -f ci/values-keycloak.yaml
 #
 # Or add this file to skaffold.yaml valuesFiles for iterative dev.
 #
diff --git a/deploy/helm/openshell/values-skaffold.yaml b/deploy/helm/openshell/ci/values-skaffold.yaml
similarity index 100%
rename from deploy/helm/openshell/values-skaffold.yaml
rename to deploy/helm/openshell/ci/values-skaffold.yaml
diff --git a/deploy/helm/openshell/ci/values-tls-disabled.yaml b/deploy/helm/openshell/ci/values-tls-disabled.yaml
new file mode 100644
index 000000000..ea7c7900c
--- /dev/null
+++ b/deploy/helm/openshell/ci/values-tls-disabled.yaml
@@ -0,0 +1,10 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# CI lint target: TLS disabled (plaintext HTTP, no client cert requirement).
+# Typical when a reverse proxy or tunnel terminates TLS at the edge.
+server:
+  disableTls: true
+  disableGatewayAuth: true
+pkiInitJob:
+  enabled: false
diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml
index fe7b96cf2..2de9ee4e6 100644
--- a/deploy/helm/openshell/skaffold.yaml
+++ b/deploy/helm/openshell/skaffold.yaml
@@ -87,16 +87,16 @@ deploy:
         createNamespace: true
         valuesFiles:
           - values.yaml
-          - values-skaffold.yaml
-          # Add values-cert-manager.yaml here (and uncomment the cert-manager
+          - ci/values-skaffold.yaml
+          # Add ci/values-cert-manager.yaml here (and uncomment the cert-manager
           # release above) to switch from pkiInitJob to cert-manager for PKI.
-          #- values-cert-manager.yaml
+          #- ci/values-cert-manager.yaml
           # To enable OIDC with a local Keycloak instance, run the one-time
           # setup task first, then uncomment the line below:
           #   mise run keycloak:k8s:setup
-          #- values-keycloak.yaml
+          #- ci/values-keycloak.yaml
           # To enable the Gateway API HTTPRoute (requires Envoy Gateway above):
-          #- values-gateway.yaml
+          #- ci/values-gateway.yaml
         setValueTemplates:
           image.repository: '{{.IMAGE_REPO_openshell_gateway}}'
           image.tag: '{{.IMAGE_TAG_openshell_gateway}}'
diff --git a/tasks/helm.toml b/tasks/helm.toml
index c7949865b..9ef2ae832 100644
--- a/tasks/helm.toml
+++ b/tasks/helm.toml
@@ -4,9 +4,18 @@
 # Helm chart tasks
 
 ["helm:lint"]
-description = "Lint the openshell helm chart"
-run = "helm lint deploy/helm/openshell"
-hide = true
+description = "Lint the openshell Helm chart (defaults + all CI configuration variants)"
+run = """
+  set -e
+  echo "--- helm lint: defaults ---"
+  helm lint deploy/helm/openshell
+  for f in deploy/helm/openshell/ci/values-*.yaml; do
+    variant=$(basename "$f" .yaml | sed 's/values-//')
+    echo "--- helm lint: $variant ---"
+    helm lint deploy/helm/openshell -f "$f"
+  done
+  echo "All variants passed."
+"""
 
 ["helm:skaffold:dev"]
 description = "Run skaffold dev for deploy/helm/openshell (iterative deploy)"
@@ -59,3 +68,24 @@ hide = true
 ["helm:gateway:apply"]
 description = "Apply the Envoy GatewayClass manifest (run after helm:skaffold:run when gateway routing is enabled)"
 run = "kubectl apply -f deploy/kube/manifests/envoy-gateway-openshell.yaml"
+
+# Helm e2e — boots a k3d cluster via the Helm path and runs the Rust + Python suites
+
+["e2e:helm"]
+description = "Bootstrap Helm k3d cluster and run Rust + Python e2e suites"
+run = "tasks/scripts/helm-e2e.sh"
+
+["e2e:helm:rust"]
+description = "Bootstrap Helm k3d cluster and run Rust e2e only"
+env = { HELM_E2E_SUITE = "rust" }
+run = "tasks/scripts/helm-e2e.sh"
+
+["e2e:helm:python"]
+description = "Bootstrap Helm k3d cluster and run Python e2e only"
+env = { HELM_E2E_SUITE = "python" }
+run = "tasks/scripts/helm-e2e.sh"
+
+["e2e:helm:cert-manager"]
+description = "Bootstrap Helm k3d cluster with cert-manager PKI and run full e2e"
+env = { HELM_E2E_PKI = "cert-manager" }
+run = "tasks/scripts/helm-e2e.sh"
diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh
new file mode 100755
index 000000000..e0514cf66
--- /dev/null
+++ b/tasks/scripts/helm-e2e.sh
@@ -0,0 +1,291 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Run Rust and/or Python e2e tests against a gateway deployed via the Helm chart
+# on a local k3d cluster (k3s backed by Docker).
+#
+# The script follows the same preflight → bootstrap → register → test → cleanup
+# pattern as e2e/rust/e2e-docker.sh, but uses k3d + docker buildx + Helm instead
+# of a standalone gateway process.
+#
+# Usage:
+#   mise run e2e:helm                  # full suite, pkiInitJob PKI
+#   mise run e2e:helm:rust             # Rust only
+#   mise run e2e:helm:python           # Python only
+#   mise run e2e:helm:cert-manager     # full suite, cert-manager PKI
+#
+# Environment variables:
+#   HELM_E2E_SUITE          rust | python | all (default: all)
+#   HELM_E2E_PKI            pki-init | cert-manager (default: pki-init)
+#   HELM_E2E_KEEP_CLUSTER   1 to skip cluster deletion on exit (default: 0)
+#   HELM_E2E_CLUSTER_NAME   override k3d cluster name (default: derived from branch)
+#   KUBECONFIG              path to kubeconfig (default: <repo-root>/kubeconfig)
+#   OPENSHELL_PROVISION_TIMEOUT  sandbox ready timeout in seconds (default: 300)
+
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+SUITE="${HELM_E2E_SUITE:-all}"
+PKI_MODE="${HELM_E2E_PKI:-pki-init}"
+KEEP_CLUSTER="${HELM_E2E_KEEP_CLUSTER:-0}"
+
+# Derive cluster name the same way helm-k3s-local.sh does (last path component of branch).
+_branch_cluster_name() {
+  local branch
+  branch="$(git -C "${ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")"
+  local suffix="${branch##*/}"
+  suffix="${suffix:0:24}"
+  echo "openshell-dev-${suffix}"
+}
+
+CLUSTER_NAME="${HELM_E2E_CLUSTER_NAME:-$(_branch_cluster_name)}"
+export KUBECONFIG="${KUBECONFIG:-${ROOT}/kubeconfig}"
+
+WORKDIR="$(mktemp -d "/tmp/openshell-helm-e2e.XXXXXX")"
+GATEWAY_NAME="openshell-helm-e2e-${CLUSTER_NAME}"
+GATEWAY_CONFIG_DIR="${HOME}/.config/openshell/gateways/${GATEWAY_NAME}"
+PF_PID=""
+PORT=""
+CLUSTER_CREATED=0
+
+cleanup() {
+  local exit_code=$?
+
+  if [ -n "${PF_PID}" ] && kill -0 "${PF_PID}" 2>/dev/null; then
+    echo "Stopping kubectl port-forward (pid ${PF_PID})..."
+    kill "${PF_PID}" 2>/dev/null || true
+    wait "${PF_PID}" 2>/dev/null || true
+  fi
+
+  if [ -d "${GATEWAY_CONFIG_DIR}" ]; then
+    rm -rf "${GATEWAY_CONFIG_DIR}"
+  fi
+
+  if [ "${KEEP_CLUSTER}" = "1" ]; then
+    echo "Keeping cluster '${CLUSTER_NAME}' (HELM_E2E_KEEP_CLUSTER=1)."
+  elif [ "${CLUSTER_CREATED}" = "1" ]; then
+    echo "Deleting cluster '${CLUSTER_NAME}'..."
+    HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \
+      bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" delete 2>/dev/null || true
+  fi
+
+  rm -rf "${WORKDIR}" 2>/dev/null || true
+
+  if [ "${exit_code}" -ne 0 ]; then
+    echo "helm-e2e failed (exit ${exit_code})."
+  fi
+}
+trap cleanup EXIT
+
+# ── Preflight ────────────────────────────────────────────────────────────────
+require_cmd() {
+  if ! command -v "$1" >/dev/null 2>&1; then
+    echo "ERROR: '$1' is required but not found in PATH" >&2
+    exit 2
+  fi
+}
+
+require_cmd k3d
+require_cmd helm
+require_cmd kubectl
+require_cmd docker
+require_cmd openssl
+
+if ! docker info >/dev/null 2>&1; then
+  echo "ERROR: docker daemon is not reachable" >&2
+  exit 2
+fi
+
+echo "=== helm-e2e: suite=${SUITE} pki=${PKI_MODE} cluster=${CLUSTER_NAME} ==="
+
+# ── Cluster ──────────────────────────────────────────────────────────────────
+if k3d cluster get "${CLUSTER_NAME}" >/dev/null 2>&1; then
+  echo "Reusing existing k3d cluster '${CLUSTER_NAME}'."
+  # Refresh kubeconfig in case it's stale.
+  k3d kubeconfig write "${CLUSTER_NAME}" --output "${KUBECONFIG}" >/dev/null
+else
+  echo "Creating k3d cluster '${CLUSTER_NAME}'..."
+  HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \
+    bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" create
+  CLUSTER_CREATED=1
+fi
+
+# ── cert-manager (optional) ──────────────────────────────────────────────────
+if [ "${PKI_MODE}" = "cert-manager" ]; then
+  echo "Installing cert-manager..."
+  helm repo add jetstack https://charts.jetstack.io --force-update >/dev/null 2>&1 || true
+  helm upgrade --install cert-manager jetstack/cert-manager \
+    --namespace cert-manager --create-namespace \
+    --set crds.enabled=true \
+    --wait 2>&1
+fi
+
+# ── Build images ─────────────────────────────────────────────────────────────
+# Use a fixed local tag so the image names are stable across runs and Helm
+# can reference them without Skaffold's digest-based tags.
+GATEWAY_IMAGE="openshell/gateway:helm-e2e"
+SUPERVISOR_IMAGE="openshell/supervisor:helm-e2e"
+
+echo "Building gateway image..."
+docker buildx build \
+  --build-arg BUILD_FROM_SOURCE=1 \
+  --target gateway \
+  --tag "${GATEWAY_IMAGE}" \
+  --load \
+  --file "${ROOT}/deploy/docker/Dockerfile.images" \
+  "${ROOT}" 2>&1
+
+echo "Building supervisor image..."
+docker buildx build \
+  --build-arg BUILD_FROM_SOURCE=1 \
+  --target supervisor \
+  --tag "${SUPERVISOR_IMAGE}" \
+  --load \
+  --file "${ROOT}/deploy/docker/Dockerfile.images" \
+  "${ROOT}" 2>&1
+
+# Load images into the k3d cluster nodes.
+echo "Loading images into k3d cluster..."
+k3d image import "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" -c "${CLUSTER_NAME}" 2>&1
+
+# ── Deploy via Helm ───────────────────────────────────────────────────────────
+HELM_VALUES_FLAGS=(
+  -f "${ROOT}/deploy/helm/openshell/values.yaml"
+)
+if [ "${PKI_MODE}" = "cert-manager" ]; then
+  HELM_VALUES_FLAGS+=(-f "${ROOT}/deploy/helm/openshell/ci/values-cert-manager.yaml")
+fi
+
+echo "Deploying OpenShell via Helm (PKI: ${PKI_MODE})..."
+helm upgrade --install openshell "${ROOT}/deploy/helm/openshell" \
+  --namespace openshell --create-namespace \
+  "${HELM_VALUES_FLAGS[@]}" \
+  --set "image.repository=openshell/gateway" \
+  --set "image.tag=helm-e2e" \
+  --set "image.pullPolicy=Never" \
+  --set "supervisor.image.repository=openshell/supervisor" \
+  --set "supervisor.image.tag=helm-e2e" \
+  --set "supervisor.image.pullPolicy=Never" \
+  --wait --timeout 180s 2>&1
+
+# ── Wait for PKI ─────────────────────────────────────────────────────────────
+if [ "${PKI_MODE}" = "cert-manager" ]; then
+  echo "Waiting for cert-manager certificates to be ready..."
+  kubectl wait --for=condition=Ready certificate/openshell-server certificate/openshell-client \
+    -n openshell --timeout=120s
+else
+  echo "Waiting for pkiInitJob secrets..."
+  elapsed=0
+  while [ "${elapsed}" -lt 60 ]; do
+    if kubectl get secret openshell-client-tls -n openshell >/dev/null 2>&1; then
+      echo "PKI secrets ready after ${elapsed}s."
+      break
+    fi
+    sleep 3
+    elapsed=$((elapsed + 3))
+  done
+  if [ "${elapsed}" -ge 60 ]; then
+    echo "ERROR: pkiInitJob secrets not created within 60s" >&2
+    exit 1
+  fi
+fi
+
+# ── Port-forward ─────────────────────────────────────────────────────────────
+pick_port() {
+  python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()'
+}
+PORT=$(pick_port)
+
+echo "Port-forwarding openshell service → localhost:${PORT}..."
+kubectl port-forward -n openshell svc/openshell "${PORT}:8080" \
+  >"${WORKDIR}/pf.log" 2>&1 &
+PF_PID=$!
+
+# ── Register gateway with CLI ─────────────────────────────────────────────────
+mkdir -p "${GATEWAY_CONFIG_DIR}/mtls"
+
+kubectl get secret openshell-client-tls -n openshell \
+  -o jsonpath='{.data.ca\.crt}'  | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/ca.crt"
+kubectl get secret openshell-client-tls -n openshell \
+  -o jsonpath='{.data.tls\.crt}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/tls.crt"
+kubectl get secret openshell-client-tls -n openshell \
+  -o jsonpath='{.data.tls\.key}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/tls.key"
+
+cat >"${GATEWAY_CONFIG_DIR}/metadata.json" <<EOF
+{
+  "name": "${GATEWAY_NAME}",
+  "gateway_endpoint": "https://127.0.0.1:${PORT}",
+  "is_remote": false,
+  "gateway_port": ${PORT}
+}
+EOF
+
+export OPENSHELL_GATEWAY="${GATEWAY_NAME}"
+export OPENSHELL_PROVISION_TIMEOUT="${OPENSHELL_PROVISION_TIMEOUT:-300}"
+
+# ── Wait for gateway health ───────────────────────────────────────────────────
+CLI_BIN="${ROOT}/target/debug/openshell"
+if [ ! -f "${CLI_BIN}" ]; then
+  echo "Building openshell CLI..."
+  cargo build -p openshell-cli --features openshell-core/dev-settings 2>&1
+fi
+
+echo "Waiting for gateway to become healthy (port ${PORT})..."
+elapsed=0
+timeout=120
+while [ "${elapsed}" -lt "${timeout}" ]; do
+  if ! kill -0 "${PF_PID}" 2>/dev/null; then
+    echo "ERROR: port-forward exited unexpectedly" >&2
+    cat "${WORKDIR}/pf.log" || true
+    exit 1
+  fi
+  if "${CLI_BIN}" status --gateway "${GATEWAY_NAME}" >/dev/null 2>&1; then
+    echo "Gateway healthy after ${elapsed}s."
+    break
+  fi
+  sleep 3
+  elapsed=$((elapsed + 3))
+done
+if [ "${elapsed}" -ge "${timeout}" ]; then
+  echo "ERROR: gateway did not become healthy within ${timeout}s" >&2
+  cat "${WORKDIR}/pf.log" || true
+  exit 1
+fi
+
+# ── Run test suites ───────────────────────────────────────────────────────────
+run_rust() {
+  echo "--- Running Rust e2e ---"
+  cargo build -p openshell-cli --features openshell-core/dev-settings
+  cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- \
+    --skip gateway_resume_scenarios \
+    --skip docker_gpu_sandbox_runs_nvidia_smi \
+    --skip sandbox_from_custom_dockerfile \
+    --skip graphql_l7_enforces_allow_and_deny_rules_on_forward_and_connect_paths \
+    --skip forward_proxy_allows_l7_permitted_request \
+    --skip sandbox_reaches_host_openshell_internal_via_host_gateway_alias \
+    --skip sandbox_inference_local_routes_to_host_openshell_internal \
+    --nocapture
+}
+
+run_python() {
+  echo "--- Running Python e2e ---"
+  mise run --no-deps python:proto
+  UV_NO_SYNC=1 PYTHONPATH=python uv run pytest \
+    -o python_files='test_*.py' \
+    -m 'not gpu' \
+    -n "${E2E_PARALLEL:-5}" \
+    e2e/python
+}
+
+case "${SUITE}" in
+  rust)   run_rust ;;
+  python) run_python ;;
+  all)    run_rust; run_python ;;
+  *)
+    echo "ERROR: unknown HELM_E2E_SUITE '${SUITE}' (must be rust, python, or all)" >&2
+    exit 2
+    ;;
+esac
+
+echo "=== helm-e2e: all suites passed ==="

From 56d6faed3c6478118650204548bc062549514061 Mon Sep 17 00:00:00 2001
From: Taylor Mutch <tmutch@nvidia.com>
Date: Tue, 5 May 2026 14:29:49 -0700
Subject: [PATCH 2/8] ci: add Branch Helm E2E workflow with test:e2e-helm gate
 (#1162)

* feat: add kubernetes local-dev environment

* Add support for grpcRoute from Kubernetes Gateway API spec
* Add pkiInitJob to initialize mTLS resources
* Add sshHandshake init job
* Test integration with Envoy Gateway
* Add keycloak integration testing with Skaffold

* docs(helm-dev-environment): document TLS toggle and mTLS port-forward setup

Add a TLS behaviour section explaining that values-skaffold.yaml disables
TLS by default, and a port-forward connection guide covering both plaintext
and mTLS modes with the exact commands to extract client certs from the
cluster PKI secret.

* chore(helm): clarify TLS toggle in values-skaffold.yaml

* chore(helm): remove leftover cert-manager references

* feat(helm): restore cert-manager PKI support alongside pkiInitJob

Re-add the openshell.issuerSelfSigned helper, the mutual-exclusion guard
in pki-hook.yaml, and the certManager condition in the statefulset volume
mount. Add server.disableTls: false to values-cert-manager.yaml so the
overlay correctly overrides the skaffold dev default. Tested end-to-end
with cert-manager issuing mTLS certs and sandbox create over port-forward.

* fix(helm): fix port-forward collision and pki idempotency check

Use port 8090 for direct port-forward to avoid colliding with the k3d
LB binding on 8080 when Envoy Gateway is active.

Check both server and client TLS secrets before skipping PKI generation.
Previously only the server secret was checked, which would silently skip
generation if a partial cleanup left one half of the pair behind. Now
emits a clear error with a recovery command when partial state is detected.

* feat(helm): add lint matrix and Helm e2e test harness

Consolidates values overlays into deploy/helm/openshell/ci/, adds a
helm:lint matrix task that validates all configuration variants, and
introduces a helm-e2e.sh script that creates a k3d cluster, builds
images via docker buildx, deploys via Helm, and runs the Rust and
Python e2e suites. Tests that require Docker-native host networking
(host.openshell.internal SSRF) are skipped on the Kubernetes path.

* ci: add helm lint workflow triggered on helm chart changes

* ci: add helm lint workflow triggered on helm chart changes

* chore: trigger helm lint CI test

* Revert "chore: trigger helm lint CI test"

This reverts commit 6b6b0a5808b0b5a318c8bce5178921fdf40d967e.

* ci: add Branch Helm E2E workflow with test:e2e-helm gate
---
 .github/workflows/branch-helm-e2e.yml | 96 +++++++++++++++++++++++++++
 .github/workflows/e2e-gate.yml        | 14 +++-
 .github/workflows/e2e-label-help.yml  |  6 +-
 3 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/branch-helm-e2e.yml

diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml
new file mode 100644
index 000000000..908b4d7d5
--- /dev/null
+++ b/.github/workflows/branch-helm-e2e.yml
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+name: Branch Helm E2E
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+  workflow_dispatch: {}
+
+permissions: {}
+
+jobs:
+  pr_metadata:
+    name: Resolve PR metadata
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: read
+    outputs:
+      should_run: ${{ steps.gate.outputs.should_run }}
+    steps:
+      - uses: actions/checkout@v6
+
+      - id: gate
+        uses: ./.github/actions/pr-gate
+        with:
+          required_label: test:e2e-helm
+
+  helm-e2e-rust:
+    name: Helm E2E (rust)
+    needs: [pr_metadata]
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    runs-on: linux-amd64-cpu8
+    timeout-minutes: 60
+    permissions:
+      contents: read
+      packages: read
+    container:
+      image: ghcr.io/nvidia/openshell/ci:latest
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --privileged
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    env:
+      MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      HELM_E2E_CLUSTER_NAME: openshell-helm-e2e-${{ github.run_id }}-rust
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Mark workspace safe for git
+        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Install tools
+        run: mise install --locked
+
+      - name: Run Helm E2E (Rust)
+        run: mise run e2e:helm:rust
+
+  helm-e2e-python:
+    name: Helm E2E (python)
+    needs: [pr_metadata]
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    runs-on: linux-amd64-cpu8
+    timeout-minutes: 60
+    permissions:
+      contents: read
+      packages: read
+    container:
+      image: ghcr.io/nvidia/openshell/ci:latest
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --privileged
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    env:
+      MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      HELM_E2E_CLUSTER_NAME: openshell-helm-e2e-${{ github.run_id }}-python
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Mark workspace safe for git
+        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Install tools
+        run: mise install --locked
+
+      - name: Install Python dependencies
+        run: uv sync --frozen && mise run --no-deps python:proto
+
+      - name: Run Helm E2E (Python)
+        run: mise run e2e:helm:python
diff --git a/.github/workflows/e2e-gate.yml b/.github/workflows/e2e-gate.yml
index 67959fa8d..0155a13d2 100644
--- a/.github/workflows/e2e-gate.yml
+++ b/.github/workflows/e2e-gate.yml
@@ -4,7 +4,7 @@ on:
   pull_request:
     types: [opened, synchronize, reopened, labeled, unlabeled, ready_for_review]
   workflow_run:
-    workflows: ["Branch E2E Checks", "GPU Test"]
+    workflows: ["Branch E2E Checks", "GPU Test", "Branch Helm E2E"]
     types: [completed]
 
 permissions: {}
@@ -36,6 +36,18 @@ jobs:
       required_label: test:e2e-gpu
       workflow_file: test-gpu.yml
 
+  helm-e2e:
+    name: Helm E2E
+    if: github.event_name == 'pull_request'
+    permissions:
+      contents: read
+      pull-requests: read
+      actions: read
+    uses: ./.github/workflows/e2e-gate-check.yml
+    with:
+      required_label: test:e2e-helm
+      workflow_file: branch-helm-e2e.yml
+
   # When the guarded workflow finishes, GitHub fires `workflow_run` in the
   # default-branch context — any check posted from here would land on `main`,
   # not on the PR. Instead, find the latest `pull_request`-triggered gate run
diff --git a/.github/workflows/e2e-label-help.yml b/.github/workflows/e2e-label-help.yml
index 2a61660d2..9d534b0ed 100644
--- a/.github/workflows/e2e-label-help.yml
+++ b/.github/workflows/e2e-label-help.yml
@@ -19,7 +19,10 @@ permissions: {}
 jobs:
   hint:
     name: Post next-step hint for E2E label
-    if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu'
+    if: |
+      github.event.label.name == 'test:e2e' ||
+      github.event.label.name == 'test:e2e-gpu' ||
+      github.event.label.name == 'test:e2e-helm'
     runs-on: ubuntu-latest
     permissions:
       pull-requests: write
@@ -40,6 +43,7 @@ jobs:
           case "$LABEL_NAME" in
             test:e2e) workflow_file=branch-e2e.yml; workflow_name="Branch E2E Checks" ;;
             test:e2e-gpu) workflow_file=test-gpu.yml; workflow_name="GPU Test" ;;
+            test:e2e-helm) workflow_file=branch-helm-e2e.yml; workflow_name="Branch Helm E2E" ;;
             *) echo "Unrecognized label $LABEL_NAME"; exit 1 ;;
           esac
 

From 680fec853a8f487356407ede21b36f1fc8b16aa4 Mon Sep 17 00:00:00 2001
From: Taylor Mutch <taylormutch@gmail.com>
Date: Tue, 5 May 2026 14:58:02 -0700
Subject: [PATCH 3/8] fix(e2e): shorten Helm e2e cluster names to fit k3d
 32-char limit

CI run IDs combined with the openshell-helm-e2e- prefix exceeded k3d's
32-character cluster-name limit (e.g. openshell-helm-e2e-25403379605-python
is 37 chars). Shorten the workflow prefix to helm-e2e- and tighten the
local-dev suffix truncation so both paths stay within the limit.
---
 .github/workflows/branch-helm-e2e.yml | 4 ++--
 tasks/scripts/helm-e2e.sh             | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml
index 908b4d7d5..4165208eb 100644
--- a/.github/workflows/branch-helm-e2e.yml
+++ b/.github/workflows/branch-helm-e2e.yml
@@ -47,7 +47,7 @@ jobs:
         - /var/run/docker.sock:/var/run/docker.sock
     env:
       MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      HELM_E2E_CLUSTER_NAME: openshell-helm-e2e-${{ github.run_id }}-rust
+      HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-rust
     steps:
       - uses: actions/checkout@v6
 
@@ -79,7 +79,7 @@ jobs:
         - /var/run/docker.sock:/var/run/docker.sock
     env:
       MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      HELM_E2E_CLUSTER_NAME: openshell-helm-e2e-${{ github.run_id }}-python
+      HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-python
     steps:
       - uses: actions/checkout@v6
 
diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh
index e0514cf66..c84a6c00b 100755
--- a/tasks/scripts/helm-e2e.sh
+++ b/tasks/scripts/helm-e2e.sh
@@ -35,7 +35,7 @@ _branch_cluster_name() {
   local branch
   branch="$(git -C "${ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")"
   local suffix="${branch##*/}"
-  suffix="${suffix:0:24}"
+  suffix="${suffix:0:18}"
   echo "openshell-dev-${suffix}"
 }
 

From d663a7f40de8f08ff349a872314c734d744ac528 Mon Sep 17 00:00:00 2001
From: Taylor Mutch <taylormutch@gmail.com>
Date: Tue, 5 May 2026 15:18:33 -0700
Subject: [PATCH 4/8] ci(helm-e2e): reuse docker-build.yml for gateway and
 supervisor images

The Helm e2e jobs were rebuilding gateway and supervisor images from
source inside each container, duplicating the work docker-build.yml
already does on every PR. Add build-gateway and build-supervisor
reusable-workflow calls (linux/amd64 to match the runner) and have the
e2e jobs pull the resulting GHCR images via a new HELM_E2E_IMAGE_TAG
env var. The local-dev buildx path is preserved as the fallback when
the tag is unset, so 'mise run e2e:helm:*' still works without CI.
---
 .github/workflows/branch-helm-e2e.yml | 34 ++++++++++++++++--
 tasks/scripts/helm-e2e.sh             | 51 ++++++++++++++++++---------
 2 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml
index 4165208eb..1489477fa 100644
--- a/.github/workflows/branch-helm-e2e.yml
+++ b/.github/workflows/branch-helm-e2e.yml
@@ -28,9 +28,31 @@ jobs:
         with:
           required_label: test:e2e-helm
 
+  build-gateway:
+    needs: [pr_metadata]
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    permissions:
+      contents: read
+      packages: write
+    uses: ./.github/workflows/docker-build.yml
+    with:
+      component: gateway
+      platform: linux/amd64
+
+  build-supervisor:
+    needs: [pr_metadata]
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    permissions:
+      contents: read
+      packages: write
+    uses: ./.github/workflows/docker-build.yml
+    with:
+      component: supervisor
+      platform: linux/amd64
+
   helm-e2e-rust:
     name: Helm E2E (rust)
-    needs: [pr_metadata]
+    needs: [pr_metadata, build-gateway, build-supervisor]
     if: needs.pr_metadata.outputs.should_run == 'true'
     runs-on: linux-amd64-cpu8
     timeout-minutes: 60
@@ -48,12 +70,16 @@ jobs:
     env:
       MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-rust
+      HELM_E2E_IMAGE_TAG: ${{ github.sha }}
     steps:
       - uses: actions/checkout@v6
 
       - name: Mark workspace safe for git
         run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
 
+      - name: Log in to GHCR
+        run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
+
       - name: Install tools
         run: mise install --locked
 
@@ -62,7 +88,7 @@ jobs:
 
   helm-e2e-python:
     name: Helm E2E (python)
-    needs: [pr_metadata]
+    needs: [pr_metadata, build-gateway, build-supervisor]
     if: needs.pr_metadata.outputs.should_run == 'true'
     runs-on: linux-amd64-cpu8
     timeout-minutes: 60
@@ -80,12 +106,16 @@ jobs:
     env:
       MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-python
+      HELM_E2E_IMAGE_TAG: ${{ github.sha }}
     steps:
       - uses: actions/checkout@v6
 
       - name: Mark workspace safe for git
         run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
 
+      - name: Log in to GHCR
+        run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
+
       - name: Install tools
         run: mise install --locked
 
diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh
index c84a6c00b..c27c0c306 100755
--- a/tasks/scripts/helm-e2e.sh
+++ b/tasks/scripts/helm-e2e.sh
@@ -20,6 +20,12 @@
 #   HELM_E2E_PKI            pki-init | cert-manager (default: pki-init)
 #   HELM_E2E_KEEP_CLUSTER   1 to skip cluster deletion on exit (default: 0)
 #   HELM_E2E_CLUSTER_NAME   override k3d cluster name (default: derived from branch)
+#   HELM_E2E_IMAGE_TAG      if set, pull gateway+supervisor images from
+#                           HELM_E2E_IMAGE_REGISTRY at this tag instead of
+#                           building them locally (used by CI to reuse the
+#                           images produced by docker-build.yml)
+#   HELM_E2E_IMAGE_REGISTRY registry to pull pre-built images from
+#                           (default: ghcr.io/nvidia/openshell)
 #   KUBECONFIG              path to kubeconfig (default: <repo-root>/kubeconfig)
 #   OPENSHELL_PROVISION_TIMEOUT  sandbox ready timeout in seconds (default: 300)
 
@@ -127,23 +133,34 @@ fi
 GATEWAY_IMAGE="openshell/gateway:helm-e2e"
 SUPERVISOR_IMAGE="openshell/supervisor:helm-e2e"
 
-echo "Building gateway image..."
-docker buildx build \
-  --build-arg BUILD_FROM_SOURCE=1 \
-  --target gateway \
-  --tag "${GATEWAY_IMAGE}" \
-  --load \
-  --file "${ROOT}/deploy/docker/Dockerfile.images" \
-  "${ROOT}" 2>&1
-
-echo "Building supervisor image..."
-docker buildx build \
-  --build-arg BUILD_FROM_SOURCE=1 \
-  --target supervisor \
-  --tag "${SUPERVISOR_IMAGE}" \
-  --load \
-  --file "${ROOT}/deploy/docker/Dockerfile.images" \
-  "${ROOT}" 2>&1
+if [ -n "${HELM_E2E_IMAGE_TAG:-}" ]; then
+  REGISTRY="${HELM_E2E_IMAGE_REGISTRY:-ghcr.io/nvidia/openshell}"
+  echo "Pulling pre-built gateway image (${REGISTRY}/gateway:${HELM_E2E_IMAGE_TAG})..."
+  docker pull "${REGISTRY}/gateway:${HELM_E2E_IMAGE_TAG}"
+  docker tag "${REGISTRY}/gateway:${HELM_E2E_IMAGE_TAG}" "${GATEWAY_IMAGE}"
+
+  echo "Pulling pre-built supervisor image (${REGISTRY}/supervisor:${HELM_E2E_IMAGE_TAG})..."
+  docker pull "${REGISTRY}/supervisor:${HELM_E2E_IMAGE_TAG}"
+  docker tag "${REGISTRY}/supervisor:${HELM_E2E_IMAGE_TAG}" "${SUPERVISOR_IMAGE}"
+else
+  echo "Building gateway image..."
+  docker buildx build \
+    --build-arg BUILD_FROM_SOURCE=1 \
+    --target gateway \
+    --tag "${GATEWAY_IMAGE}" \
+    --load \
+    --file "${ROOT}/deploy/docker/Dockerfile.images" \
+    "${ROOT}" 2>&1
+
+  echo "Building supervisor image..."
+  docker buildx build \
+    --build-arg BUILD_FROM_SOURCE=1 \
+    --target supervisor \
+    --tag "${SUPERVISOR_IMAGE}" \
+    --load \
+    --file "${ROOT}/deploy/docker/Dockerfile.images" \
+    "${ROOT}" 2>&1
+fi
 
 # Load images into the k3d cluster nodes.
 echo "Loading images into k3d cluster..."

From 95eac834fc54cff4ae802953fba1d7fa0d096e30 Mon Sep 17 00:00:00 2001
From: Taylor Mutch <taylormutch@gmail.com>
Date: Tue, 5 May 2026 15:35:49 -0700
Subject: [PATCH 5/8] fix(helm-e2e): rewrite kubeconfig server when running
 inside a container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When helm-k3s-local.sh runs inside a Docker container that mounts the
host's docker socket (e.g., a GitHub Actions `container:` job), k3d
creates the cluster on the host's daemon and publishes the API server
on `0.0.0.0:<port>` of the host. From inside the CI container that
address is unreachable, so kubectl (and helm OpenAPI validation) fail
with 'dial tcp 0.0.0.0:<port>: connect: connection refused'.

After merging the kubeconfig, detect that we're in a container via
/.dockerenv and rewrite the server URL to the default-route gateway
(which routes to the docker host on standard sibling-container setups).
The API cert isn't signed for the gateway IP, so also mark the cluster
insecure-skip-tls-verify and clear the embedded CA — CI-only path; the
local-dev case where 0.0.0.0 already works is unchanged.
---
 tasks/scripts/helm-k3s-local.sh | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh
index 3f268c2dc..9dda86b61 100755
--- a/tasks/scripts/helm-k3s-local.sh
+++ b/tasks/scripts/helm-k3s-local.sh
@@ -114,6 +114,34 @@ merge_kubeconfig() {
   rm -f "${tmp}"
 
   kubectl --kubeconfig="${KUBECONFIG_TARGET}" config use-context "$(k3d_context_name)"
+
+  # When this script runs inside a container (e.g., a GitHub Actions
+  # `container:` job mounting /var/run/docker.sock), k3d publishes the API
+  # server on the host's `0.0.0.0:<port>` but `0.0.0.0` from inside the
+  # container is not the host. Rewrite the server URL to the default-route
+  # gateway, which routes to the docker host. The API server cert is signed
+  # for `0.0.0.0` / `127.0.0.1` and won't have the gateway IP as a SAN, so
+  # mark the cluster insecure-skip-tls-verify (CI-only path; local dev keeps
+  # the default secure setup).
+  if [[ -f /.dockerenv ]]; then
+    local context old_server new_server host_addr
+    context="$(k3d_context_name)"
+    old_server=$(kubectl --kubeconfig="${KUBECONFIG_TARGET}" config view --raw \
+      -o "jsonpath={.clusters[?(@.name=='${context}')].cluster.server}")
+    if [[ "${old_server}" == https://0.0.0.0:* ]]; then
+      host_addr=$(ip route show default 2>/dev/null | awk '/default/ {print $3; exit}')
+      if [[ -n "${host_addr}" ]]; then
+        new_server="${old_server//0.0.0.0/${host_addr}}"
+        echo "Inside container; rewriting kubeconfig server ${old_server} -> ${new_server} (insecure-skip-tls-verify)."
+        kubectl --kubeconfig="${KUBECONFIG_TARGET}" config unset \
+          "clusters.${context}.certificate-authority-data" >/dev/null 2>&1 || true
+        kubectl --kubeconfig="${KUBECONFIG_TARGET}" config set-cluster "${context}" \
+          --server="${new_server}" --insecure-skip-tls-verify=true >/dev/null
+      else
+        echo "warning: running inside a container but could not detect a default-route gateway; kubectl may fail to reach the API server." >&2
+      fi
+    fi
+  fi
 }
 
 apply_base_manifests() {

From d138767f5ff03c9a8dde53e4e8b4a88f1a663505 Mon Sep 17 00:00:00 2001
From: Taylor Mutch <taylormutch@gmail.com>
Date: Tue, 5 May 2026 15:47:23 -0700
Subject: [PATCH 6/8] fix(docker): copy providers/ into in-Docker rust-builder
 stage

PR #1037 added include_str!("../../../providers/*.yaml") in
crates/openshell-providers/src/profiles.rs, but the BUILD_FROM_SOURCE=1
path of Dockerfile.images only copies Cargo.toml/Cargo.lock, crates/,
and proto/. With providers/ missing, the cargo build inside the
rust-builder stage fails to read the embedded YAML. The release path is
unaffected because it copies pre-built binaries from
deploy/docker/.build/prebuilt-binaries/.

This breaks 'mise run e2e:helm:*' and any other workflow that builds
images from source via this Dockerfile (e.g., the local helm-e2e
harness). Add 'COPY providers/ providers/' alongside the other source
inputs.
---
 deploy/docker/Dockerfile.images | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images
index 359789ffd..746aaad4a 100644
--- a/deploy/docker/Dockerfile.images
+++ b/deploy/docker/Dockerfile.images
@@ -48,6 +48,7 @@ WORKDIR /build
 COPY Cargo.toml Cargo.lock ./
 COPY crates/ crates/
 COPY proto/ proto/
+COPY providers/ providers/
 
 RUN --mount=type=cache,target=/usr/local/cargo/registry \
     --mount=type=cache,target=/build/target \

From ecfd83b23524f108d1950af5fbe8cc010cbf711b Mon Sep 17 00:00:00 2001
From: Taylor Mutch <taylormutch@gmail.com>
Date: Tue, 5 May 2026 15:54:19 -0700
Subject: [PATCH 7/8] fix(helm-e2e): read default gateway from /proc/net/route,
 not iproute2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CI container (ghcr.io/nvidia/openshell/ci:latest) does not have the
`ip` command installed, so the kubeconfig-rewrite block exited 127 with
`set -euo pipefail`. Read the default gateway directly from
/proc/net/route instead — that file is always present on Linux and
needs no extra package. Decode the gateway field as a little-endian
32-bit hex string into dotted decimal.
---
 tasks/scripts/helm-k3s-local.sh | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh
index 9dda86b61..8c181bc67 100755
--- a/tasks/scripts/helm-k3s-local.sh
+++ b/tasks/scripts/helm-k3s-local.sh
@@ -129,7 +129,19 @@ merge_kubeconfig() {
     old_server=$(kubectl --kubeconfig="${KUBECONFIG_TARGET}" config view --raw \
       -o "jsonpath={.clusters[?(@.name=='${context}')].cluster.server}")
     if [[ "${old_server}" == https://0.0.0.0:* ]]; then
-      host_addr=$(ip route show default 2>/dev/null | awk '/default/ {print $3; exit}')
+      # Read the default-route gateway from /proc/net/route directly to avoid
+      # depending on the `ip` command, which is not in the CI image. The
+      # gateway field is a little-endian 32-bit hex value, so we read pairs
+      # of hex digits in reverse and format as dotted decimal.
+      host_addr=$(awk '$2=="00000000" {
+        gw = $3
+        printf "%d.%d.%d.%d",
+          strtonum("0x" substr(gw,7,2)),
+          strtonum("0x" substr(gw,5,2)),
+          strtonum("0x" substr(gw,3,2)),
+          strtonum("0x" substr(gw,1,2))
+        exit
+      }' /proc/net/route 2>/dev/null) || host_addr=""
       if [[ -n "${host_addr}" ]]; then
         new_server="${old_server//0.0.0.0/${host_addr}}"
         echo "Inside container; rewriting kubeconfig server ${old_server} -> ${new_server} (insecure-skip-tls-verify)."

From 24b1d0f1dd8039f973a2ba18817e68a5fb7b1ed1 Mon Sep 17 00:00:00 2001
From: Taylor Mutch <taylormutch@gmail.com>
Date: Tue, 5 May 2026 16:26:49 -0700
Subject: [PATCH 8/8] ci(helm-e2e): use kind in CI, keep k3d for local dev

The previous attempts to make the in-container kubectl reach the host's
k3d API server kept hitting tooling gaps (missing iproute2, gawk-only
strtonum). Step back and follow the conventional pattern instead:

- Drop the `container:` block from the helm-e2e jobs and run on the
  bare runner. Install mise via `curl https://mise.run | sh`.
- Use `helm/kind-action` to provision a kind cluster on the runner.
  Because the workflow steps run on the runner directly, the kind API
  server is reachable through the standard kubeconfig the action writes.
- Add HELM_E2E_SKIP_CLUSTER and HELM_E2E_IMAGE_LOADER env vars to
  helm-e2e.sh so it can drive the existing flow against either a
  self-managed k3d cluster (default; what 'mise run e2e:helm:*' uses
  locally) or a caller-managed kind cluster (CI). Image loading switches
  between 'k3d image import' and 'kind load docker-image' accordingly
  (or is skipped entirely with HELM_E2E_IMAGE_LOADER=none).
- Revert the in-container kubeconfig-rewrite hacks in helm-k3s-local.sh;
  they are no longer needed once CI runs on the bare runner.
---
 .github/workflows/branch-helm-e2e.yml | 50 +++++++++++++++------------
 tasks/scripts/helm-e2e.sh             | 50 +++++++++++++++++++++++----
 tasks/scripts/helm-k3s-local.sh       | 40 ---------------------
 3 files changed, 72 insertions(+), 68 deletions(-)

diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml
index 1489477fa..e804fafd5 100644
--- a/.github/workflows/branch-helm-e2e.yml
+++ b/.github/workflows/branch-helm-e2e.yml
@@ -59,30 +59,33 @@ jobs:
     permissions:
       contents: read
       packages: read
-    container:
-      image: ghcr.io/nvidia/openshell/ci:latest
-      credentials:
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-      options: --privileged
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
     env:
       MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-rust
       HELM_E2E_IMAGE_TAG: ${{ github.sha }}
+      HELM_E2E_SKIP_CLUSTER: "1"
+      HELM_E2E_IMAGE_LOADER: kind
     steps:
       - uses: actions/checkout@v6
 
-      - name: Mark workspace safe for git
-        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
-
       - name: Log in to GHCR
         run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
 
+      - name: Install mise
+        run: |
+          curl https://mise.run | sh
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+          echo "$HOME/.local/share/mise/shims" >> "$GITHUB_PATH"
+
       - name: Install tools
         run: mise install --locked
 
+      - name: Create kind cluster
+        uses: helm/kind-action@v1
+        with:
+          cluster_name: ${{ env.HELM_E2E_CLUSTER_NAME }}
+          wait: 120s
+
       - name: Run Helm E2E (Rust)
         run: mise run e2e:helm:rust
 
@@ -95,32 +98,35 @@ jobs:
     permissions:
       contents: read
       packages: read
-    container:
-      image: ghcr.io/nvidia/openshell/ci:latest
-      credentials:
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-      options: --privileged
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
     env:
       MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-python
       HELM_E2E_IMAGE_TAG: ${{ github.sha }}
+      HELM_E2E_SKIP_CLUSTER: "1"
+      HELM_E2E_IMAGE_LOADER: kind
     steps:
       - uses: actions/checkout@v6
 
-      - name: Mark workspace safe for git
-        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
-
       - name: Log in to GHCR
         run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
 
+      - name: Install mise
+        run: |
+          curl https://mise.run | sh
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+          echo "$HOME/.local/share/mise/shims" >> "$GITHUB_PATH"
+
       - name: Install tools
         run: mise install --locked
 
       - name: Install Python dependencies
         run: uv sync --frozen && mise run --no-deps python:proto
 
+      - name: Create kind cluster
+        uses: helm/kind-action@v1
+        with:
+          cluster_name: ${{ env.HELM_E2E_CLUSTER_NAME }}
+          wait: 120s
+
       - name: Run Helm E2E (Python)
         run: mise run e2e:helm:python
diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh
index c27c0c306..1d453ef66 100755
--- a/tasks/scripts/helm-e2e.sh
+++ b/tasks/scripts/helm-e2e.sh
@@ -19,7 +19,15 @@
 #   HELM_E2E_SUITE          rust | python | all (default: all)
 #   HELM_E2E_PKI            pki-init | cert-manager (default: pki-init)
 #   HELM_E2E_KEEP_CLUSTER   1 to skip cluster deletion on exit (default: 0)
-#   HELM_E2E_CLUSTER_NAME   override k3d cluster name (default: derived from branch)
+#   HELM_E2E_CLUSTER_NAME   override cluster name (default: derived from branch)
+#   HELM_E2E_SKIP_CLUSTER   1 if the caller has already provisioned the cluster
+#                           (and KUBECONFIG points at it). The script will not
+#                           create or delete the cluster. Used by CI, where
+#                           helm/kind-action provisions a kind cluster before
+#                           this script runs.
+#   HELM_E2E_IMAGE_LOADER   k3d | kind | none — which loader to use to import
+#                           the gateway and supervisor images into the cluster
+#                           (default: k3d for local dev; CI sets kind)
 #   HELM_E2E_IMAGE_TAG      if set, pull gateway+supervisor images from
 #                           HELM_E2E_IMAGE_REGISTRY at this tag instead of
 #                           building them locally (used by CI to reuse the
@@ -35,6 +43,8 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
 SUITE="${HELM_E2E_SUITE:-all}"
 PKI_MODE="${HELM_E2E_PKI:-pki-init}"
 KEEP_CLUSTER="${HELM_E2E_KEEP_CLUSTER:-0}"
+SKIP_CLUSTER="${HELM_E2E_SKIP_CLUSTER:-0}"
+IMAGE_LOADER="${HELM_E2E_IMAGE_LOADER:-k3d}"
 
 # Derive cluster name the same way helm-k3s-local.sh does (last path component of branch).
 _branch_cluster_name() {
@@ -92,12 +102,27 @@ require_cmd() {
   fi
 }
 
-require_cmd k3d
 require_cmd helm
 require_cmd kubectl
 require_cmd docker
 require_cmd openssl
 
+# k3d is only needed when this script manages the cluster lifecycle. CI hands
+# us a pre-existing kind cluster via HELM_E2E_SKIP_CLUSTER=1.
+if [ "${SKIP_CLUSTER}" != "1" ]; then
+  require_cmd k3d
+fi
+case "${IMAGE_LOADER}" in
+  k3d|kind|none) ;;
+  *)
+    echo "ERROR: unknown HELM_E2E_IMAGE_LOADER '${IMAGE_LOADER}' (must be k3d, kind, or none)" >&2
+    exit 2
+    ;;
+esac
+if [ "${IMAGE_LOADER}" = "kind" ]; then
+  require_cmd kind
+fi
+
 if ! docker info >/dev/null 2>&1; then
   echo "ERROR: docker daemon is not reachable" >&2
   exit 2
@@ -106,7 +131,9 @@ fi
 echo "=== helm-e2e: suite=${SUITE} pki=${PKI_MODE} cluster=${CLUSTER_NAME} ==="
 
 # ── Cluster ──────────────────────────────────────────────────────────────────
-if k3d cluster get "${CLUSTER_NAME}" >/dev/null 2>&1; then
+if [ "${SKIP_CLUSTER}" = "1" ]; then
+  echo "Using pre-existing cluster '${CLUSTER_NAME}' (HELM_E2E_SKIP_CLUSTER=1)."
+elif k3d cluster get "${CLUSTER_NAME}" >/dev/null 2>&1; then
   echo "Reusing existing k3d cluster '${CLUSTER_NAME}'."
   # Refresh kubeconfig in case it's stale.
   k3d kubeconfig write "${CLUSTER_NAME}" --output "${KUBECONFIG}" >/dev/null
@@ -162,9 +189,20 @@ else
     "${ROOT}" 2>&1
 fi
 
-# Load images into the k3d cluster nodes.
-echo "Loading images into k3d cluster..."
-k3d image import "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" -c "${CLUSTER_NAME}" 2>&1
+# Load images into the cluster nodes.
+case "${IMAGE_LOADER}" in
+  k3d)
+    echo "Loading images into k3d cluster '${CLUSTER_NAME}'..."
+    k3d image import "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" -c "${CLUSTER_NAME}" 2>&1
+    ;;
+  kind)
+    echo "Loading images into kind cluster '${CLUSTER_NAME}'..."
+    kind load docker-image "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" --name "${CLUSTER_NAME}" 2>&1
+    ;;
+  none)
+    echo "Skipping image load (HELM_E2E_IMAGE_LOADER=none); the cluster must already have ${GATEWAY_IMAGE} and ${SUPERVISOR_IMAGE}."
+    ;;
+esac
 
 # ── Deploy via Helm ───────────────────────────────────────────────────────────
 HELM_VALUES_FLAGS=(
diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh
index 8c181bc67..3f268c2dc 100755
--- a/tasks/scripts/helm-k3s-local.sh
+++ b/tasks/scripts/helm-k3s-local.sh
@@ -114,46 +114,6 @@ merge_kubeconfig() {
   rm -f "${tmp}"
 
   kubectl --kubeconfig="${KUBECONFIG_TARGET}" config use-context "$(k3d_context_name)"
-
-  # When this script runs inside a container (e.g., a GitHub Actions
-  # `container:` job mounting /var/run/docker.sock), k3d publishes the API
-  # server on the host's `0.0.0.0:<port>` but `0.0.0.0` from inside the
-  # container is not the host. Rewrite the server URL to the default-route
-  # gateway, which routes to the docker host. The API server cert is signed
-  # for `0.0.0.0` / `127.0.0.1` and won't have the gateway IP as a SAN, so
-  # mark the cluster insecure-skip-tls-verify (CI-only path; local dev keeps
-  # the default secure setup).
-  if [[ -f /.dockerenv ]]; then
-    local context old_server new_server host_addr
-    context="$(k3d_context_name)"
-    old_server=$(kubectl --kubeconfig="${KUBECONFIG_TARGET}" config view --raw \
-      -o "jsonpath={.clusters[?(@.name=='${context}')].cluster.server}")
-    if [[ "${old_server}" == https://0.0.0.0:* ]]; then
-      # Read the default-route gateway from /proc/net/route directly to avoid
-      # depending on the `ip` command, which is not in the CI image. The
-      # gateway field is a little-endian 32-bit hex value, so we read pairs
-      # of hex digits in reverse and format as dotted decimal.
-      host_addr=$(awk '$2=="00000000" {
-        gw = $3
-        printf "%d.%d.%d.%d",
-          strtonum("0x" substr(gw,7,2)),
-          strtonum("0x" substr(gw,5,2)),
-          strtonum("0x" substr(gw,3,2)),
-          strtonum("0x" substr(gw,1,2))
-        exit
-      }' /proc/net/route 2>/dev/null) || host_addr=""
-      if [[ -n "${host_addr}" ]]; then
-        new_server="${old_server//0.0.0.0/${host_addr}}"
-        echo "Inside container; rewriting kubeconfig server ${old_server} -> ${new_server} (insecure-skip-tls-verify)."
-        kubectl --kubeconfig="${KUBECONFIG_TARGET}" config unset \
-          "clusters.${context}.certificate-authority-data" >/dev/null 2>&1 || true
-        kubectl --kubeconfig="${KUBECONFIG_TARGET}" config set-cluster "${context}" \
-          --server="${new_server}" --insecure-skip-tls-verify=true >/dev/null
-      else
-        echo "warning: running inside a container but could not detect a default-route gateway; kubectl may fail to reach the API server." >&2
-      fi
-    fi
-  fi
 }
 
 apply_base_manifests() {