From 82f97308d6b98fd2a79980b72e6c069a1ff9abc7 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 5 May 2026 14:04:51 -0700 Subject: [PATCH 1/8] test(e2e): Add a Helm specific e2e harness and linting workflow --- .agents/skills/helm-dev-environment/SKILL.md | 14 +- .github/workflows/helm-lint.yml | 57 ++++ deploy/helm/openshell/.helmignore | 6 +- .../{ => ci}/values-cert-manager.yaml | 2 +- .../openshell/{ => ci}/values-gateway.yaml | 2 +- .../openshell/{ => ci}/values-keycloak.yaml | 2 +- .../openshell/{ => ci}/values-skaffold.yaml | 0 .../openshell/ci/values-tls-disabled.yaml | 10 + deploy/helm/openshell/skaffold.yaml | 10 +- tasks/helm.toml | 36 ++- tasks/scripts/helm-e2e.sh | 291 ++++++++++++++++++ 11 files changed, 408 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/helm-lint.yml rename deploy/helm/openshell/{ => ci}/values-cert-manager.yaml (84%) rename deploy/helm/openshell/{ => ci}/values-gateway.yaml (92%) rename deploy/helm/openshell/{ => ci}/values-keycloak.yaml (95%) rename deploy/helm/openshell/{ => ci}/values-skaffold.yaml (100%) create mode 100644 deploy/helm/openshell/ci/values-tls-disabled.yaml create mode 100755 tasks/scripts/helm-e2e.sh diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index 986ce1490..0db44f1f8 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -63,7 +63,7 @@ The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or ### TLS behaviour -`values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run +`ci/values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run plaintext by default. To test with TLS enabled, comment out that line and redeploy. 
| Mode | `server.disableTls` | Gateway scheme | @@ -160,7 +160,7 @@ imports the openshell realm from `scripts/keycloak-realm.json`, and prints a por command for acquiring tokens from the CLI. Then activate OIDC in the OpenShell Helm chart: -1. Uncomment `#- values-keycloak.yaml` in `skaffold.yaml` +1. Uncomment `#- ci/values-keycloak.yaml` in `skaffold.yaml` 2. Redeploy: `mise run helm:skaffold:run` To remove Keycloak: @@ -191,10 +191,12 @@ mise run helm:k3s:status |------|---------| | `deploy/helm/openshell/skaffold.yaml` | Skaffold config — images, Helm releases, values overlays | | `deploy/helm/openshell/values.yaml` | Default Helm values | -| `deploy/helm/openshell/values-skaffold.yaml` | Dev overrides (image pull policy, local image names) | -| `deploy/helm/openshell/values-cert-manager.yaml` | cert-manager TLS overlay (opt-in; disables pkiInitJob) | -| `deploy/helm/openshell/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | -| `deploy/helm/openshell/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-skaffold.yaml` | Dev overrides (image pull policy, TLS disabled for local Skaffold) | +| `deploy/helm/openshell/ci/values-cert-manager.yaml` | cert-manager PKI overlay (opt-in; disables pkiInitJob) | +| `deploy/helm/openshell/ci/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | +| `deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) | | `deploy/kube/manifests/envoy-gateway-openshell.yaml` | GatewayClass for Envoy Gateway (`mise run helm:gateway:apply`) | | `tasks/scripts/helm-k3s-local.sh` | k3d cluster create/delete/start/stop/status | +| `tasks/scripts/helm-e2e.sh` | Bootstrap k3d cluster and run Rust + Python e2e via Helm | | `tasks/scripts/keycloak-k8s-setup.sh` | Keycloak deploy + realm import | diff --git a/.github/workflows/helm-lint.yml 
b/.github/workflows/helm-lint.yml new file mode 100644 index 000000000..8b7184133 --- /dev/null +++ b/.github/workflows/helm-lint.yml @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: Helm Lint + +on: + push: + branches: + - "pull-request/[0-9]+" + paths: + - "deploy/helm/**" + workflow_dispatch: + +env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + +permissions: + contents: read + packages: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - uses: actions/checkout@v6 + + - id: gate + uses: ./.github/actions/pr-gate + + helm-lint: + name: Helm Lint + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v6 + + - name: Install tools + run: mise install --locked + + - name: Lint Helm chart + run: mise run helm:lint diff --git a/deploy/helm/openshell/.helmignore b/deploy/helm/openshell/.helmignore index 798d0e7c8..a12325802 100644 --- a/deploy/helm/openshell/.helmignore +++ b/deploy/helm/openshell/.helmignore @@ -19,8 +19,4 @@ # Ignore development files skaffold.yaml -values-keycloak.yaml -values-ingress.yaml -values-gateway.yaml -values-cert-manager.yaml -values-skaffold.yaml +ci/ diff --git a/deploy/helm/openshell/values-cert-manager.yaml b/deploy/helm/openshell/ci/values-cert-manager.yaml similarity index 84% rename from deploy/helm/openshell/values-cert-manager.yaml rename to deploy/helm/openshell/ci/values-cert-manager.yaml index bb024d716..ed99c8b46 100644 --- 
a/deploy/helm/openshell/values-cert-manager.yaml +++ b/deploy/helm/openshell/ci/values-cert-manager.yaml @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # Merge after values.yaml when cert-manager CRDs are installed, e.g.: -# helm install ... -f values.yaml -f values-cert-manager.yaml +# helm install ... -f values.yaml -f ci/values-cert-manager.yaml # Or add this file to skaffold manifests.helm.releases[].valuesFiles. server: disableTls: false diff --git a/deploy/helm/openshell/values-gateway.yaml b/deploy/helm/openshell/ci/values-gateway.yaml similarity index 92% rename from deploy/helm/openshell/values-gateway.yaml rename to deploy/helm/openshell/ci/values-gateway.yaml index c43a4cd45..196192213 100644 --- a/deploy/helm/openshell/values-gateway.yaml +++ b/deploy/helm/openshell/ci/values-gateway.yaml @@ -5,7 +5,7 @@ # # Requires Envoy Gateway in the cluster (installed via skaffold.yaml). # Add this file to the openshell release valuesFiles to activate: -# uncomment values-gateway.yaml in deploy/helm/openshell/skaffold.yaml +# uncomment ci/values-gateway.yaml in deploy/helm/openshell/skaffold.yaml # # Envoy Gateway will create an Envoy proxy Deployment and a LoadBalancer # Service (named envoy---*) in the openshell namespace. diff --git a/deploy/helm/openshell/values-keycloak.yaml b/deploy/helm/openshell/ci/values-keycloak.yaml similarity index 95% rename from deploy/helm/openshell/values-keycloak.yaml rename to deploy/helm/openshell/ci/values-keycloak.yaml index 42bb2ad4e..cc6ca658b 100644 --- a/deploy/helm/openshell/values-keycloak.yaml +++ b/deploy/helm/openshell/ci/values-keycloak.yaml @@ -8,7 +8,7 @@ # # Then layer this file on top of values.yaml when deploying: # helm upgrade --install openshell . \ -# -f values.yaml -f values-skaffold.yaml -f values-keycloak.yaml +# -f values.yaml -f ci/values-skaffold.yaml -f ci/values-keycloak.yaml # # Or add this file to skaffold.yaml valuesFiles for iterative dev. 
# diff --git a/deploy/helm/openshell/values-skaffold.yaml b/deploy/helm/openshell/ci/values-skaffold.yaml similarity index 100% rename from deploy/helm/openshell/values-skaffold.yaml rename to deploy/helm/openshell/ci/values-skaffold.yaml diff --git a/deploy/helm/openshell/ci/values-tls-disabled.yaml b/deploy/helm/openshell/ci/values-tls-disabled.yaml new file mode 100644 index 000000000..ea7c7900c --- /dev/null +++ b/deploy/helm/openshell/ci/values-tls-disabled.yaml @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# CI lint target: TLS disabled (plaintext HTTP, no client cert requirement). +# Typical when a reverse proxy or tunnel terminates TLS at the edge. +server: + disableTls: true + disableGatewayAuth: true +pkiInitJob: + enabled: false diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index fe7b96cf2..2de9ee4e6 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -87,16 +87,16 @@ deploy: createNamespace: true valuesFiles: - values.yaml - - values-skaffold.yaml - # Add values-cert-manager.yaml here (and uncomment the cert-manager + - ci/values-skaffold.yaml + # Add ci/values-cert-manager.yaml here (and uncomment the cert-manager # release above) to switch from pkiInitJob to cert-manager for PKI. 
- #- values-cert-manager.yaml + #- ci/values-cert-manager.yaml # To enable OIDC with a local Keycloak instance, run the one-time # setup task first, then uncomment the line below: # mise run keycloak:k8s:setup - #- values-keycloak.yaml + #- ci/values-keycloak.yaml # To enable the Gateway API HTTPRoute (requires Envoy Gateway above): - #- values-gateway.yaml + #- ci/values-gateway.yaml setValueTemplates: image.repository: '{{.IMAGE_REPO_openshell_gateway}}' image.tag: '{{.IMAGE_TAG_openshell_gateway}}' diff --git a/tasks/helm.toml b/tasks/helm.toml index c7949865b..9ef2ae832 100644 --- a/tasks/helm.toml +++ b/tasks/helm.toml @@ -4,9 +4,18 @@ # Helm chart tasks ["helm:lint"] -description = "Lint the openshell helm chart" -run = "helm lint deploy/helm/openshell" -hide = true +description = "Lint the openshell Helm chart (defaults + all CI configuration variants)" +run = """ + set -e + echo "--- helm lint: defaults ---" + helm lint deploy/helm/openshell + for f in deploy/helm/openshell/ci/values-*.yaml; do + variant=$(basename "$f" .yaml | sed 's/values-//') + echo "--- helm lint: $variant ---" + helm lint deploy/helm/openshell -f "$f" + done + echo "All variants passed." 
+""" ["helm:skaffold:dev"] description = "Run skaffold dev for deploy/helm/openshell (iterative deploy)" @@ -59,3 +68,24 @@ hide = true ["helm:gateway:apply"] description = "Apply the Envoy GatewayClass manifest (run after helm:skaffold:run when gateway routing is enabled)" run = "kubectl apply -f deploy/kube/manifests/envoy-gateway-openshell.yaml" + +# Helm e2e — boots a k3d cluster via the Helm path and runs the Rust + Python suites + +["e2e:helm"] +description = "Bootstrap Helm k3d cluster and run Rust + Python e2e suites" +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:rust"] +description = "Bootstrap Helm k3d cluster and run Rust e2e only" +env = { HELM_E2E_SUITE = "rust" } +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:python"] +description = "Bootstrap Helm k3d cluster and run Python e2e only" +env = { HELM_E2E_SUITE = "python" } +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:cert-manager"] +description = "Bootstrap Helm k3d cluster with cert-manager PKI and run full e2e" +env = { HELM_E2E_PKI = "cert-manager" } +run = "tasks/scripts/helm-e2e.sh" diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh new file mode 100755 index 000000000..e0514cf66 --- /dev/null +++ b/tasks/scripts/helm-e2e.sh @@ -0,0 +1,291 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run Rust and/or Python e2e tests against a gateway deployed via the Helm chart +# on a local k3d cluster (k3s backed by Docker). +# +# The script follows the same preflight → bootstrap → register → test → cleanup +# pattern as e2e/rust/e2e-docker.sh, but uses k3d + Skaffold + Helm instead of +# a standalone gateway process. 
+# +# Usage: +# mise run e2e:helm # full suite, pkiInitJob PKI +# mise run e2e:helm:rust # Rust only +# mise run e2e:helm:python # Python only +# mise run e2e:helm:cert-manager # full suite, cert-manager PKI +# +# Environment variables: +# HELM_E2E_SUITE rust | python | all (default: all) +# HELM_E2E_PKI pki-init | cert-manager (default: pki-init) +# HELM_E2E_KEEP_CLUSTER 1 to skip cluster deletion on exit (default: 0) +# HELM_E2E_CLUSTER_NAME override k3d cluster name (default: derived from branch) +# KUBECONFIG path to kubeconfig (default: /kubeconfig) +# OPENSHELL_PROVISION_TIMEOUT sandbox ready timeout in seconds (default: 300) + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +SUITE="${HELM_E2E_SUITE:-all}" +PKI_MODE="${HELM_E2E_PKI:-pki-init}" +KEEP_CLUSTER="${HELM_E2E_KEEP_CLUSTER:-0}" + +# Derive cluster name the same way helm-k3s-local.sh does (last path component of branch). +_branch_cluster_name() { + local branch + branch="$(git -C "${ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")" + local suffix="${branch##*/}" + suffix="${suffix:0:24}" + echo "openshell-dev-${suffix}" +} + +CLUSTER_NAME="${HELM_E2E_CLUSTER_NAME:-$(_branch_cluster_name)}" +export KUBECONFIG="${KUBECONFIG:-${ROOT}/kubeconfig}" + +WORKDIR="$(mktemp -d "/tmp/openshell-helm-e2e.XXXXXX")" +GATEWAY_NAME="openshell-helm-e2e-${CLUSTER_NAME}" +GATEWAY_CONFIG_DIR="${HOME}/.config/openshell/gateways/${GATEWAY_NAME}" +PF_PID="" +PORT="" +CLUSTER_CREATED=0 + +cleanup() { + local exit_code=$? + + if [ -n "${PF_PID}" ] && kill -0 "${PF_PID}" 2>/dev/null; then + echo "Stopping kubectl port-forward (pid ${PF_PID})..." + kill "${PF_PID}" 2>/dev/null || true + wait "${PF_PID}" 2>/dev/null || true + fi + + if [ -d "${GATEWAY_CONFIG_DIR}" ]; then + rm -rf "${GATEWAY_CONFIG_DIR}" + fi + + if [ "${KEEP_CLUSTER}" = "1" ]; then + echo "Keeping cluster '${CLUSTER_NAME}' (HELM_E2E_KEEP_CLUSTER=1)." 
+ elif [ "${CLUSTER_CREATED}" = "1" ]; then + echo "Deleting cluster '${CLUSTER_NAME}'..." + HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \ + bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" delete 2>/dev/null || true + fi + + rm -rf "${WORKDIR}" 2>/dev/null || true + + if [ "${exit_code}" -ne 0 ]; then + echo "helm-e2e failed (exit ${exit_code})." + fi +} +trap cleanup EXIT + +# ── Preflight ──────────────────────────────────────────────────────────────── +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "ERROR: '$1' is required but not found in PATH" >&2 + exit 2 + fi +} + +require_cmd k3d +require_cmd helm +require_cmd kubectl +require_cmd docker +require_cmd openssl + +if ! docker info >/dev/null 2>&1; then + echo "ERROR: docker daemon is not reachable" >&2 + exit 2 +fi + +echo "=== helm-e2e: suite=${SUITE} pki=${PKI_MODE} cluster=${CLUSTER_NAME} ===" + +# ── Cluster ────────────────────────────────────────────────────────────────── +if k3d cluster get "${CLUSTER_NAME}" >/dev/null 2>&1; then + echo "Reusing existing k3d cluster '${CLUSTER_NAME}'." + # Refresh kubeconfig in case it's stale. + k3d kubeconfig write "${CLUSTER_NAME}" --output "${KUBECONFIG}" >/dev/null +else + echo "Creating k3d cluster '${CLUSTER_NAME}'..." + HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \ + bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" create + CLUSTER_CREATED=1 +fi + +# ── cert-manager (optional) ────────────────────────────────────────────────── +if [ "${PKI_MODE}" = "cert-manager" ]; then + echo "Installing cert-manager..." 
+ helm repo add jetstack https://charts.jetstack.io --force-update >/dev/null 2>&1 || true + helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager --create-namespace \ + --set crds.enabled=true \ + --wait 2>&1 +fi + +# ── Build images ───────────────────────────────────────────────────────────── +# Use a fixed local tag so the image names are stable across runs and Helm +# can reference them without Skaffold's digest-based tags. +GATEWAY_IMAGE="openshell/gateway:helm-e2e" +SUPERVISOR_IMAGE="openshell/supervisor:helm-e2e" + +echo "Building gateway image..." +docker buildx build \ + --build-arg BUILD_FROM_SOURCE=1 \ + --target gateway \ + --tag "${GATEWAY_IMAGE}" \ + --load \ + --file "${ROOT}/deploy/docker/Dockerfile.images" \ + "${ROOT}" 2>&1 + +echo "Building supervisor image..." +docker buildx build \ + --build-arg BUILD_FROM_SOURCE=1 \ + --target supervisor \ + --tag "${SUPERVISOR_IMAGE}" \ + --load \ + --file "${ROOT}/deploy/docker/Dockerfile.images" \ + "${ROOT}" 2>&1 + +# Load images into the k3d cluster nodes. +echo "Loading images into k3d cluster..." +k3d image import "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" -c "${CLUSTER_NAME}" 2>&1 + +# ── Deploy via Helm ─────────────────────────────────────────────────────────── +HELM_VALUES_FLAGS=( + -f "${ROOT}/deploy/helm/openshell/values.yaml" +) +if [ "${PKI_MODE}" = "cert-manager" ]; then + HELM_VALUES_FLAGS+=(-f "${ROOT}/deploy/helm/openshell/ci/values-cert-manager.yaml") +fi + +echo "Deploying OpenShell via Helm (PKI: ${PKI_MODE})..." 
+helm upgrade --install openshell "${ROOT}/deploy/helm/openshell" \ + --namespace openshell --create-namespace \ + "${HELM_VALUES_FLAGS[@]}" \ + --set "image.repository=openshell/gateway" \ + --set "image.tag=helm-e2e" \ + --set "image.pullPolicy=Never" \ + --set "supervisor.image.repository=openshell/supervisor" \ + --set "supervisor.image.tag=helm-e2e" \ + --set "supervisor.image.pullPolicy=Never" \ + --wait --timeout 180s 2>&1 + +# ── Wait for PKI ───────────────────────────────────────────────────────────── +if [ "${PKI_MODE}" = "cert-manager" ]; then + echo "Waiting for cert-manager certificates to be ready..." + kubectl wait --for=condition=Ready certificate/openshell-server certificate/openshell-client \ + -n openshell --timeout=120s +else + echo "Waiting for pkiInitJob secrets..." + elapsed=0 + while [ "${elapsed}" -lt 60 ]; do + if kubectl get secret openshell-client-tls -n openshell >/dev/null 2>&1; then + echo "PKI secrets ready after ${elapsed}s." + break + fi + sleep 3 + elapsed=$((elapsed + 3)) + done + if [ "${elapsed}" -ge 60 ]; then + echo "ERROR: pkiInitJob secrets not created within 60s" >&2 + exit 1 + fi +fi + +# ── Port-forward ───────────────────────────────────────────────────────────── +pick_port() { + python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()' +} +PORT=$(pick_port) + +echo "Port-forwarding openshell service → localhost:${PORT}..." +kubectl port-forward -n openshell svc/openshell "${PORT}:8080" \ + >"${WORKDIR}/pf.log" 2>&1 & +PF_PID=$! 
+ +# ── Register gateway with CLI ───────────────────────────────────────────────── +mkdir -p "${GATEWAY_CONFIG_DIR}/mtls" + +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.ca\.crt}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/ca.crt" +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.tls\.crt}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/tls.crt" +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.tls\.key}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/tls.key" + +cat >"${GATEWAY_CONFIG_DIR}/metadata.json" <&1 +fi + +echo "Waiting for gateway to become healthy (port ${PORT})..." +elapsed=0 +timeout=120 +while [ "${elapsed}" -lt "${timeout}" ]; do + if ! kill -0 "${PF_PID}" 2>/dev/null; then + echo "ERROR: port-forward exited unexpectedly" >&2 + cat "${WORKDIR}/pf.log" || true + exit 1 + fi + if "${CLI_BIN}" status --gateway "${GATEWAY_NAME}" >/dev/null 2>&1; then + echo "Gateway healthy after ${elapsed}s." + break + fi + sleep 3 + elapsed=$((elapsed + 3)) +done +if [ "${elapsed}" -ge "${timeout}" ]; then + echo "ERROR: gateway did not become healthy within ${timeout}s" >&2 + cat "${WORKDIR}/pf.log" || true + exit 1 +fi + +# ── Run test suites ─────────────────────────────────────────────────────────── +run_rust() { + echo "--- Running Rust e2e ---" + cargo build -p openshell-cli --features openshell-core/dev-settings + cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- \ + --skip gateway_resume_scenarios \ + --skip docker_gpu_sandbox_runs_nvidia_smi \ + --skip sandbox_from_custom_dockerfile \ + --skip graphql_l7_enforces_allow_and_deny_rules_on_forward_and_connect_paths \ + --skip forward_proxy_allows_l7_permitted_request \ + --skip sandbox_reaches_host_openshell_internal_via_host_gateway_alias \ + --skip sandbox_inference_local_routes_to_host_openshell_internal \ + --nocapture +} + +run_python() { + echo "--- Running Python e2e ---" + mise run --no-deps python:proto + 
UV_NO_SYNC=1 PYTHONPATH=python uv run pytest \ + -o python_files='test_*.py' \ + -m 'not gpu' \ + -n "${E2E_PARALLEL:-5}" \ + e2e/python +} + +case "${SUITE}" in + rust) run_rust ;; + python) run_python ;; + all) run_rust; run_python ;; + *) + echo "ERROR: unknown HELM_E2E_SUITE '${SUITE}' (must be rust, python, or all)" >&2 + exit 2 + ;; +esac + +echo "=== helm-e2e: all suites passed ===" From 56d6faed3c6478118650204548bc062549514061 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 5 May 2026 14:29:49 -0700 Subject: [PATCH 2/8] ci: add Branch Helm E2E workflow with test:e2e-helm gate (#1162) * feat: add kubernetes local-dev environment * Add support for grpcRoute from Kubernetes Gateway API spec * Add pkiInitJob to initialize mTLS resources * Add sshHandshake init job * Test integration with Envoy Gateway * Add keycloak integration testing with Skaffold * docs(helm-dev-environment): document TLS toggle and mTLS port-forward setup Add a TLS behaviour section explaining that values-skaffold.yaml disables TLS by default, and a port-forward connection guide covering both plaintext and mTLS modes with the exact commands to extract client certs from the cluster PKI secret. * chore(helm): clarify TLS toggle in values-skaffold.yaml * chore(helm): remove leftover cert-manager references * feat(helm): restore cert-manager PKI support alongside pkiInitJob Re-add the openshell.issuerSelfSigned helper, the mutual-exclusion guard in pki-hook.yaml, and the certManager condition in the statefulset volume mount. Add server.disableTls: false to values-cert-manager.yaml so the overlay correctly overrides the skaffold dev default. Tested end-to-end with cert-manager issuing mTLS certs and sandbox create over port-forward. * fix(helm): fix port-forward collision and pki idempotency check Use port 8090 for direct port-forward to avoid colliding with the k3d LB binding on 8080 when Envoy Gateway is active. Check both server and client TLS secrets before skipping PKI generation. 
Previously only the server secret was checked, which would silently skip generation if a partial cleanup left one half of the pair behind. Now emits a clear error with a recovery command when partial state is detected. * feat(helm): add lint matrix and Helm e2e test harness Consolidates values overlays into deploy/helm/openshell/ci/, adds a helm:lint matrix task that validates all configuration variants, and introduces a helm-e2e.sh script that creates a k3d cluster, builds images via docker buildx, deploys via Helm, and runs the Rust and Python e2e suites. Tests that require Docker-native host networking (host.openshell.internal SSRF) are skipped on the Kubernetes path. * ci: add helm lint workflow triggered on helm chart changes * ci: add helm lint workflow triggered on helm chart changes * chore: trigger helm lint CI test * Revert "chore: trigger helm lint CI test" This reverts commit 6b6b0a5808b0b5a318c8bce5178921fdf40d967e. * ci: add Branch Helm E2E workflow with test:e2e-helm gate --- .github/workflows/branch-helm-e2e.yml | 96 +++++++++++++++++++++++++++ .github/workflows/e2e-gate.yml | 14 +++- .github/workflows/e2e-label-help.yml | 6 +- 3 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/branch-helm-e2e.yml diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml new file mode 100644 index 000000000..908b4d7d5 --- /dev/null +++ b/.github/workflows/branch-helm-e2e.yml @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +name: Branch Helm E2E + +on: + push: + branches: + - "pull-request/[0-9]+" + workflow_dispatch: {} + +permissions: {} + +jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - uses: actions/checkout@v6 + + - id: gate + uses: ./.github/actions/pr-gate + with: + required_label: test:e2e-helm + + helm-e2e-rust: + name: Helm E2E (rust) + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 + timeout-minutes: 60 + permissions: + contents: read + packages: read + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --privileged + volumes: + - /var/run/docker.sock:/var/run/docker.sock + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HELM_E2E_CLUSTER_NAME: openshell-helm-e2e-${{ github.run_id }}-rust + steps: + - uses: actions/checkout@v6 + + - name: Mark workspace safe for git + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Install tools + run: mise install --locked + + - name: Run Helm E2E (Rust) + run: mise run e2e:helm:rust + + helm-e2e-python: + name: Helm E2E (python) + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 + timeout-minutes: 60 + permissions: + contents: read + packages: read + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --privileged + volumes: + - /var/run/docker.sock:/var/run/docker.sock + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HELM_E2E_CLUSTER_NAME: openshell-helm-e2e-${{ github.run_id }}-python + steps: + - uses: actions/checkout@v6 + + - name: Mark workspace safe for git + run: git config --global 
--add safe.directory "$GITHUB_WORKSPACE" + + - name: Install tools + run: mise install --locked + + - name: Install Python dependencies + run: uv sync --frozen && mise run --no-deps python:proto + + - name: Run Helm E2E (Python) + run: mise run e2e:helm:python diff --git a/.github/workflows/e2e-gate.yml b/.github/workflows/e2e-gate.yml index 67959fa8d..0155a13d2 100644 --- a/.github/workflows/e2e-gate.yml +++ b/.github/workflows/e2e-gate.yml @@ -4,7 +4,7 @@ on: pull_request: types: [opened, synchronize, reopened, labeled, unlabeled, ready_for_review] workflow_run: - workflows: ["Branch E2E Checks", "GPU Test"] + workflows: ["Branch E2E Checks", "GPU Test", "Branch Helm E2E"] types: [completed] permissions: {} @@ -36,6 +36,18 @@ jobs: required_label: test:e2e-gpu workflow_file: test-gpu.yml + helm-e2e: + name: Helm E2E + if: github.event_name == 'pull_request' + permissions: + contents: read + pull-requests: read + actions: read + uses: ./.github/workflows/e2e-gate-check.yml + with: + required_label: test:e2e-helm + workflow_file: branch-helm-e2e.yml + # When the guarded workflow finishes, GitHub fires `workflow_run` in the # default-branch context — any check posted from here would land on `main`, # not on the PR. 
Instead, find the latest `pull_request`-triggered gate run diff --git a/.github/workflows/e2e-label-help.yml b/.github/workflows/e2e-label-help.yml index 2a61660d2..9d534b0ed 100644 --- a/.github/workflows/e2e-label-help.yml +++ b/.github/workflows/e2e-label-help.yml @@ -19,7 +19,10 @@ permissions: {} jobs: hint: name: Post next-step hint for E2E label - if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu' + if: | + github.event.label.name == 'test:e2e' || + github.event.label.name == 'test:e2e-gpu' || + github.event.label.name == 'test:e2e-helm' runs-on: ubuntu-latest permissions: pull-requests: write @@ -40,6 +43,7 @@ jobs: case "$LABEL_NAME" in test:e2e) workflow_file=branch-e2e.yml; workflow_name="Branch E2E Checks" ;; test:e2e-gpu) workflow_file=test-gpu.yml; workflow_name="GPU Test" ;; + test:e2e-helm) workflow_file=branch-helm-e2e.yml; workflow_name="Branch Helm E2E" ;; *) echo "Unrecognized label $LABEL_NAME"; exit 1 ;; esac From 680fec853a8f487356407ede21b36f1fc8b16aa4 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 5 May 2026 14:58:02 -0700 Subject: [PATCH 3/8] fix(e2e): shorten Helm e2e cluster names to fit k3d 32-char limit CI run ids combined with the openshell-helm-e2e- prefix exceeded k3d's 32-character cluster-name limit (e.g. openshell-helm-e2e-25403379605-python is 37 chars). Shorten the workflow prefix to helm-e2e- and tighten the local-dev suffix truncation so both paths stay under the limit. 
--- .github/workflows/branch-helm-e2e.yml | 4 ++-- tasks/scripts/helm-e2e.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml index 908b4d7d5..4165208eb 100644 --- a/.github/workflows/branch-helm-e2e.yml +++ b/.github/workflows/branch-helm-e2e.yml @@ -47,7 +47,7 @@ jobs: - /var/run/docker.sock:/var/run/docker.sock env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - HELM_E2E_CLUSTER_NAME: openshell-helm-e2e-${{ github.run_id }}-rust + HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-rust steps: - uses: actions/checkout@v6 @@ -79,7 +79,7 @@ jobs: - /var/run/docker.sock:/var/run/docker.sock env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - HELM_E2E_CLUSTER_NAME: openshell-helm-e2e-${{ github.run_id }}-python + HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-python steps: - uses: actions/checkout@v6 diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh index e0514cf66..c84a6c00b 100755 --- a/tasks/scripts/helm-e2e.sh +++ b/tasks/scripts/helm-e2e.sh @@ -35,7 +35,7 @@ _branch_cluster_name() { local branch branch="$(git -C "${ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")" local suffix="${branch##*/}" - suffix="${suffix:0:24}" + suffix="${suffix:0:18}" echo "openshell-dev-${suffix}" } From d663a7f40de8f08ff349a872314c734d744ac528 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 5 May 2026 15:18:33 -0700 Subject: [PATCH 4/8] ci(helm-e2e): reuse docker-build.yml for gateway and supervisor images The Helm e2e jobs were rebuilding gateway and supervisor images from source inside each container, duplicating the work docker-build.yml already does on every PR. Add build-gateway and build-supervisor reusable-workflow calls (linux/amd64 to match the runner) and have the e2e jobs pull the resulting GHCR images via a new HELM_E2E_IMAGE_TAG env var. 
The local-dev buildx path is preserved as the fallback when the tag is unset, so 'mise run e2e:helm:*' still works without CI. --- .github/workflows/branch-helm-e2e.yml | 34 ++++++++++++++++-- tasks/scripts/helm-e2e.sh | 51 ++++++++++++++++++--------- 2 files changed, 66 insertions(+), 19 deletions(-) diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml index 4165208eb..1489477fa 100644 --- a/.github/workflows/branch-helm-e2e.yml +++ b/.github/workflows/branch-helm-e2e.yml @@ -28,9 +28,31 @@ jobs: with: required_label: test:e2e-helm + build-gateway: + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + permissions: + contents: read + packages: write + uses: ./.github/workflows/docker-build.yml + with: + component: gateway + platform: linux/amd64 + + build-supervisor: + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + permissions: + contents: read + packages: write + uses: ./.github/workflows/docker-build.yml + with: + component: supervisor + platform: linux/amd64 + helm-e2e-rust: name: Helm E2E (rust) - needs: [pr_metadata] + needs: [pr_metadata, build-gateway, build-supervisor] if: needs.pr_metadata.outputs.should_run == 'true' runs-on: linux-amd64-cpu8 timeout-minutes: 60 @@ -48,12 +70,16 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-rust + HELM_E2E_IMAGE_TAG: ${{ github.sha }} steps: - uses: actions/checkout@v6 - name: Mark workspace safe for git run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + - name: Install tools run: mise install --locked @@ -62,7 +88,7 @@ jobs: helm-e2e-python: name: Helm E2E (python) - needs: [pr_metadata] + needs: [pr_metadata, build-gateway, build-supervisor] if: needs.pr_metadata.outputs.should_run == 'true' runs-on: linux-amd64-cpu8 
timeout-minutes: 60 @@ -80,12 +106,16 @@ jobs: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-python + HELM_E2E_IMAGE_TAG: ${{ github.sha }} steps: - uses: actions/checkout@v6 - name: Mark workspace safe for git run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + - name: Install tools run: mise install --locked diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh index c84a6c00b..c27c0c306 100755 --- a/tasks/scripts/helm-e2e.sh +++ b/tasks/scripts/helm-e2e.sh @@ -20,6 +20,12 @@ # HELM_E2E_PKI pki-init | cert-manager (default: pki-init) # HELM_E2E_KEEP_CLUSTER 1 to skip cluster deletion on exit (default: 0) # HELM_E2E_CLUSTER_NAME override k3d cluster name (default: derived from branch) +# HELM_E2E_IMAGE_TAG if set, pull gateway+supervisor images from +# HELM_E2E_IMAGE_REGISTRY at this tag instead of +# building them locally (used by CI to reuse the +# images produced by docker-build.yml) +# HELM_E2E_IMAGE_REGISTRY registry to pull pre-built images from +# (default: ghcr.io/nvidia/openshell) # KUBECONFIG path to kubeconfig (default: /kubeconfig) # OPENSHELL_PROVISION_TIMEOUT sandbox ready timeout in seconds (default: 300) @@ -127,23 +133,34 @@ fi GATEWAY_IMAGE="openshell/gateway:helm-e2e" SUPERVISOR_IMAGE="openshell/supervisor:helm-e2e" -echo "Building gateway image..." -docker buildx build \ - --build-arg BUILD_FROM_SOURCE=1 \ - --target gateway \ - --tag "${GATEWAY_IMAGE}" \ - --load \ - --file "${ROOT}/deploy/docker/Dockerfile.images" \ - "${ROOT}" 2>&1 - -echo "Building supervisor image..." 
-docker buildx build \ - --build-arg BUILD_FROM_SOURCE=1 \ - --target supervisor \ - --tag "${SUPERVISOR_IMAGE}" \ - --load \ - --file "${ROOT}/deploy/docker/Dockerfile.images" \ - "${ROOT}" 2>&1 +if [ -n "${HELM_E2E_IMAGE_TAG:-}" ]; then + REGISTRY="${HELM_E2E_IMAGE_REGISTRY:-ghcr.io/nvidia/openshell}" + echo "Pulling pre-built gateway image (${REGISTRY}/gateway:${HELM_E2E_IMAGE_TAG})..." + docker pull "${REGISTRY}/gateway:${HELM_E2E_IMAGE_TAG}" + docker tag "${REGISTRY}/gateway:${HELM_E2E_IMAGE_TAG}" "${GATEWAY_IMAGE}" + + echo "Pulling pre-built supervisor image (${REGISTRY}/supervisor:${HELM_E2E_IMAGE_TAG})..." + docker pull "${REGISTRY}/supervisor:${HELM_E2E_IMAGE_TAG}" + docker tag "${REGISTRY}/supervisor:${HELM_E2E_IMAGE_TAG}" "${SUPERVISOR_IMAGE}" +else + echo "Building gateway image..." + docker buildx build \ + --build-arg BUILD_FROM_SOURCE=1 \ + --target gateway \ + --tag "${GATEWAY_IMAGE}" \ + --load \ + --file "${ROOT}/deploy/docker/Dockerfile.images" \ + "${ROOT}" 2>&1 + + echo "Building supervisor image..." + docker buildx build \ + --build-arg BUILD_FROM_SOURCE=1 \ + --target supervisor \ + --tag "${SUPERVISOR_IMAGE}" \ + --load \ + --file "${ROOT}/deploy/docker/Dockerfile.images" \ + "${ROOT}" 2>&1 +fi # Load images into the k3d cluster nodes. echo "Loading images into k3d cluster..." From 95eac834fc54cff4ae802953fba1d7fa0d096e30 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 5 May 2026 15:35:49 -0700 Subject: [PATCH 5/8] fix(helm-e2e): rewrite kubeconfig server when running inside a container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When helm-k3s-local.sh runs inside a Docker container that mounts the host's docker socket (e.g., a GitHub Actions `container:` job), k3d creates the cluster on the host's daemon and publishes the API server on `0.0.0.0:` of the host. 
From inside the CI container that address is unreachable, so kubectl (and helm OpenAPI validation) fail with 'dial tcp 0.0.0.0:<port>: connect: connection refused'. After merging the kubeconfig, detect that we're in a container via /.dockerenv and rewrite the server URL to the default-route gateway (which routes to the docker host on standard sibling-container setups). The API cert isn't signed for the gateway IP, so also mark the cluster insecure-skip-tls-verify and clear the embedded CA — CI-only path; the local-dev case where 0.0.0.0 already works is unchanged. --- tasks/scripts/helm-k3s-local.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh index 3f268c2dc..9dda86b61 100755 --- a/tasks/scripts/helm-k3s-local.sh +++ b/tasks/scripts/helm-k3s-local.sh @@ -114,6 +114,34 @@ merge_kubeconfig() { rm -f "${tmp}" kubectl --kubeconfig="${KUBECONFIG_TARGET}" config use-context "$(k3d_context_name)" + + # When this script runs inside a container (e.g., a GitHub Actions + # `container:` job mounting /var/run/docker.sock), k3d publishes the API + # server on the host's `0.0.0.0:` but `0.0.0.0` from inside the + # container is not the host. Rewrite the server URL to the default-route + # gateway, which routes to the docker host. The API server cert is signed + # for `0.0.0.0` / `127.0.0.1` and won't have the gateway IP as a SAN, so + # mark the cluster insecure-skip-tls-verify (CI-only path; local dev keeps + # the default secure setup).
+ if [[ -f /.dockerenv ]]; then + local context old_server new_server host_addr + context="$(k3d_context_name)" + old_server=$(kubectl --kubeconfig="${KUBECONFIG_TARGET}" config view --raw \ + -o "jsonpath={.clusters[?(@.name=='${context}')].cluster.server}") + if [[ "${old_server}" == https://0.0.0.0:* ]]; then + host_addr=$(ip route show default 2>/dev/null | awk '/default/ {print $3; exit}') + if [[ -n "${host_addr}" ]]; then + new_server="${old_server//0.0.0.0/${host_addr}}" + echo "Inside container; rewriting kubeconfig server ${old_server} -> ${new_server} (insecure-skip-tls-verify)." + kubectl --kubeconfig="${KUBECONFIG_TARGET}" config unset \ + "clusters.${context}.certificate-authority-data" >/dev/null 2>&1 || true + kubectl --kubeconfig="${KUBECONFIG_TARGET}" config set-cluster "${context}" \ + --server="${new_server}" --insecure-skip-tls-verify=true >/dev/null + else + echo "warning: running inside a container but could not detect a default-route gateway; kubectl may fail to reach the API server." >&2 + fi + fi + fi } apply_base_manifests() { From d138767f5ff03c9a8dde53e4e8b4a88f1a663505 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 5 May 2026 15:47:23 -0700 Subject: [PATCH 6/8] fix(docker): copy providers/ into in-Docker rust-builder stage PR #1037 added include_str!("../../../providers/*.yaml") in crates/openshell-providers/src/profiles.rs, but the BUILD_FROM_SOURCE=1 path of Dockerfile.images only COPY's Cargo.toml/Cargo.lock, crates/, and proto/. With providers/ missing the cargo build inside the rust- builder stage fails to read the embedded YAML. The release path is unaffected because it copies pre-built binaries from deploy/docker/.build/prebuilt-binaries/. This breaks 'mise run e2e:helm:*' and any other workflow that builds images from source via this Dockerfile (e.g., the local helm-e2e harness). Add 'COPY providers/ providers/' alongside the other source inputs. 
--- deploy/docker/Dockerfile.images | 1 + 1 file changed, 1 insertion(+) diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 359789ffd..746aaad4a 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -48,6 +48,7 @@ WORKDIR /build COPY Cargo.toml Cargo.lock ./ COPY crates/ crates/ COPY proto/ proto/ +COPY providers/ providers/ RUN --mount=type=cache,target=/usr/local/cargo/registry \ --mount=type=cache,target=/build/target \ From ecfd83b23524f108d1950af5fbe8cc010cbf711b Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 5 May 2026 15:54:19 -0700 Subject: [PATCH 7/8] fix(helm-e2e): read default gateway from /proc/net/route, not iproute2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI container (ghcr.io/nvidia/openshell/ci:latest) does not have the `ip` command installed, so the kubeconfig-rewrite block exited 127 with `set -euo pipefail`. Read the default gateway directly from /proc/net/route instead — that file is always present on Linux and needs no extra package. Decode the gateway field as a little-endian 32-bit hex string into dotted decimal. --- tasks/scripts/helm-k3s-local.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh index 9dda86b61..8c181bc67 100755 --- a/tasks/scripts/helm-k3s-local.sh +++ b/tasks/scripts/helm-k3s-local.sh @@ -129,7 +129,19 @@ merge_kubeconfig() { old_server=$(kubectl --kubeconfig="${KUBECONFIG_TARGET}" config view --raw \ -o "jsonpath={.clusters[?(@.name=='${context}')].cluster.server}") if [[ "${old_server}" == https://0.0.0.0:* ]]; then - host_addr=$(ip route show default 2>/dev/null | awk '/default/ {print $3; exit}') + # Read the default-route gateway from /proc/net/route directly to avoid + # depending on the `ip` command, which is not in the CI image. 
The + # gateway field is a little-endian 32-bit hex value, so we read pairs + # of hex digits in reverse and format as dotted decimal. + host_addr=$(awk '$2=="00000000" { + gw = $3 + printf "%d.%d.%d.%d", + strtonum("0x" substr(gw,7,2)), + strtonum("0x" substr(gw,5,2)), + strtonum("0x" substr(gw,3,2)), + strtonum("0x" substr(gw,1,2)) + exit + }' /proc/net/route 2>/dev/null) || host_addr="" if [[ -n "${host_addr}" ]]; then new_server="${old_server//0.0.0.0/${host_addr}}" echo "Inside container; rewriting kubeconfig server ${old_server} -> ${new_server} (insecure-skip-tls-verify)." From 24b1d0f1dd8039f973a2ba18817e68a5fb7b1ed1 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 5 May 2026 16:26:49 -0700 Subject: [PATCH 8/8] ci(helm-e2e): use kind in CI, keep k3d for local dev The previous attempts to make the in-container kubectl reach the host's k3d API server kept hitting tooling gaps (missing iproute2, gawk-only strtonum). Step back and follow the conventional pattern instead: - Drop the `container:` block from the helm-e2e jobs and run on the bare runner. Install mise via `curl https://mise.run | sh`. - Use `helm/kind-action` to provision a kind cluster on the runner. Because the workflow steps run on the runner directly, the kind API server is reachable through the standard kubeconfig the action writes. - Add HELM_E2E_SKIP_CLUSTER and HELM_E2E_IMAGE_LOADER env vars to helm-e2e.sh so it can drive the existing flow against either a self- managed k3d cluster (default; what 'mise run e2e:helm:*' uses locally) or a caller-managed kind cluster (CI). Image loading switches between 'k3d image import' and 'kind load docker-image' accordingly. - Revert the in-container kubeconfig-rewrite hacks in helm-k3s-local.sh; they are no longer needed once CI runs on the bare runner. 
--- .github/workflows/branch-helm-e2e.yml | 50 +++++++++++++++------------ tasks/scripts/helm-e2e.sh | 50 +++++++++++++++++++++++---- tasks/scripts/helm-k3s-local.sh | 40 --------------------- 3 files changed, 72 insertions(+), 68 deletions(-) diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml index 1489477fa..e804fafd5 100644 --- a/.github/workflows/branch-helm-e2e.yml +++ b/.github/workflows/branch-helm-e2e.yml @@ -59,30 +59,33 @@ jobs: permissions: contents: read packages: read - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - volumes: - - /var/run/docker.sock:/var/run/docker.sock env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-rust HELM_E2E_IMAGE_TAG: ${{ github.sha }} + HELM_E2E_SKIP_CLUSTER: "1" + HELM_E2E_IMAGE_LOADER: kind steps: - uses: actions/checkout@v6 - - name: Mark workspace safe for git - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - name: Log in to GHCR run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + - name: Install mise + run: | + curl https://mise.run | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + echo "$HOME/.local/share/mise/shims" >> "$GITHUB_PATH" + - name: Install tools run: mise install --locked + - name: Create kind cluster + uses: helm/kind-action@v1 + with: + cluster_name: ${{ env.HELM_E2E_CLUSTER_NAME }} + wait: 120s + - name: Run Helm E2E (Rust) run: mise run e2e:helm:rust @@ -95,32 +98,35 @@ jobs: permissions: contents: read packages: read - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - volumes: - - /var/run/docker.sock:/var/run/docker.sock env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} HELM_E2E_CLUSTER_NAME: 
helm-e2e-${{ github.run_id }}-python HELM_E2E_IMAGE_TAG: ${{ github.sha }} + HELM_E2E_SKIP_CLUSTER: "1" + HELM_E2E_IMAGE_LOADER: kind steps: - uses: actions/checkout@v6 - - name: Mark workspace safe for git - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - name: Log in to GHCR run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + - name: Install mise + run: | + curl https://mise.run | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + echo "$HOME/.local/share/mise/shims" >> "$GITHUB_PATH" + - name: Install tools run: mise install --locked - name: Install Python dependencies run: uv sync --frozen && mise run --no-deps python:proto + - name: Create kind cluster + uses: helm/kind-action@v1 + with: + cluster_name: ${{ env.HELM_E2E_CLUSTER_NAME }} + wait: 120s + - name: Run Helm E2E (Python) run: mise run e2e:helm:python diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh index c27c0c306..1d453ef66 100755 --- a/tasks/scripts/helm-e2e.sh +++ b/tasks/scripts/helm-e2e.sh @@ -19,7 +19,15 @@ # HELM_E2E_SUITE rust | python | all (default: all) # HELM_E2E_PKI pki-init | cert-manager (default: pki-init) # HELM_E2E_KEEP_CLUSTER 1 to skip cluster deletion on exit (default: 0) -# HELM_E2E_CLUSTER_NAME override k3d cluster name (default: derived from branch) +# HELM_E2E_CLUSTER_NAME override cluster name (default: derived from branch) +# HELM_E2E_SKIP_CLUSTER 1 if the caller has already provisioned the cluster +# (and KUBECONFIG points at it). The script will not +# create or delete the cluster. Used by CI, where +# helm/kind-action provisions a kind cluster before +# this script runs. 
+# HELM_E2E_IMAGE_LOADER k3d | kind | none — which loader to use to import +# the gateway and supervisor images into the cluster +# (default: k3d for local dev; CI sets kind) # HELM_E2E_IMAGE_TAG if set, pull gateway+supervisor images from # HELM_E2E_IMAGE_REGISTRY at this tag instead of # building them locally (used by CI to reuse the @@ -35,6 +43,8 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" SUITE="${HELM_E2E_SUITE:-all}" PKI_MODE="${HELM_E2E_PKI:-pki-init}" KEEP_CLUSTER="${HELM_E2E_KEEP_CLUSTER:-0}" +SKIP_CLUSTER="${HELM_E2E_SKIP_CLUSTER:-0}" +IMAGE_LOADER="${HELM_E2E_IMAGE_LOADER:-k3d}" # Derive cluster name the same way helm-k3s-local.sh does (last path component of branch). _branch_cluster_name() { @@ -92,12 +102,27 @@ require_cmd() { fi } -require_cmd k3d require_cmd helm require_cmd kubectl require_cmd docker require_cmd openssl +# k3d is only needed when this script manages the cluster lifecycle. CI hands +# us a pre-existing kind cluster via HELM_E2E_SKIP_CLUSTER=1. +if [ "${SKIP_CLUSTER}" != "1" ]; then + require_cmd k3d +fi +case "${IMAGE_LOADER}" in + k3d|kind|none) ;; + *) + echo "ERROR: unknown HELM_E2E_IMAGE_LOADER '${IMAGE_LOADER}' (must be k3d, kind, or none)" >&2 + exit 2 + ;; +esac +if [ "${IMAGE_LOADER}" = "kind" ]; then + require_cmd kind +fi + if ! docker info >/dev/null 2>&1; then echo "ERROR: docker daemon is not reachable" >&2 exit 2 @@ -106,7 +131,9 @@ fi echo "=== helm-e2e: suite=${SUITE} pki=${PKI_MODE} cluster=${CLUSTER_NAME} ===" # ── Cluster ────────────────────────────────────────────────────────────────── -if k3d cluster get "${CLUSTER_NAME}" >/dev/null 2>&1; then +if [ "${SKIP_CLUSTER}" = "1" ]; then + echo "Using pre-existing cluster '${CLUSTER_NAME}' (HELM_E2E_SKIP_CLUSTER=1)." +elif k3d cluster get "${CLUSTER_NAME}" >/dev/null 2>&1; then echo "Reusing existing k3d cluster '${CLUSTER_NAME}'." # Refresh kubeconfig in case it's stale. 
k3d kubeconfig write "${CLUSTER_NAME}" --output "${KUBECONFIG}" >/dev/null @@ -162,9 +189,20 @@ else "${ROOT}" 2>&1 fi -# Load images into the k3d cluster nodes. -echo "Loading images into k3d cluster..." -k3d image import "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" -c "${CLUSTER_NAME}" 2>&1 +# Load images into the cluster nodes. +case "${IMAGE_LOADER}" in + k3d) + echo "Loading images into k3d cluster '${CLUSTER_NAME}'..." + k3d image import "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" -c "${CLUSTER_NAME}" 2>&1 + ;; + kind) + echo "Loading images into kind cluster '${CLUSTER_NAME}'..." + kind load docker-image "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" --name "${CLUSTER_NAME}" 2>&1 + ;; + none) + echo "Skipping image load (HELM_E2E_IMAGE_LOADER=none); the cluster must already have ${GATEWAY_IMAGE} and ${SUPERVISOR_IMAGE}." + ;; +esac # ── Deploy via Helm ─────────────────────────────────────────────────────────── HELM_VALUES_FLAGS=( diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh index 8c181bc67..3f268c2dc 100755 --- a/tasks/scripts/helm-k3s-local.sh +++ b/tasks/scripts/helm-k3s-local.sh @@ -114,46 +114,6 @@ merge_kubeconfig() { rm -f "${tmp}" kubectl --kubeconfig="${KUBECONFIG_TARGET}" config use-context "$(k3d_context_name)" - - # When this script runs inside a container (e.g., a GitHub Actions - # `container:` job mounting /var/run/docker.sock), k3d publishes the API - # server on the host's `0.0.0.0:` but `0.0.0.0` from inside the - # container is not the host. Rewrite the server URL to the default-route - # gateway, which routes to the docker host. The API server cert is signed - # for `0.0.0.0` / `127.0.0.1` and won't have the gateway IP as a SAN, so - # mark the cluster insecure-skip-tls-verify (CI-only path; local dev keeps - # the default secure setup). 
- if [[ -f /.dockerenv ]]; then - local context old_server new_server host_addr - context="$(k3d_context_name)" - old_server=$(kubectl --kubeconfig="${KUBECONFIG_TARGET}" config view --raw \ - -o "jsonpath={.clusters[?(@.name=='${context}')].cluster.server}") - if [[ "${old_server}" == https://0.0.0.0:* ]]; then - # Read the default-route gateway from /proc/net/route directly to avoid - # depending on the `ip` command, which is not in the CI image. The - # gateway field is a little-endian 32-bit hex value, so we read pairs - # of hex digits in reverse and format as dotted decimal. - host_addr=$(awk '$2=="00000000" { - gw = $3 - printf "%d.%d.%d.%d", - strtonum("0x" substr(gw,7,2)), - strtonum("0x" substr(gw,5,2)), - strtonum("0x" substr(gw,3,2)), - strtonum("0x" substr(gw,1,2)) - exit - }' /proc/net/route 2>/dev/null) || host_addr="" - if [[ -n "${host_addr}" ]]; then - new_server="${old_server//0.0.0.0/${host_addr}}" - echo "Inside container; rewriting kubeconfig server ${old_server} -> ${new_server} (insecure-skip-tls-verify)." - kubectl --kubeconfig="${KUBECONFIG_TARGET}" config unset \ - "clusters.${context}.certificate-authority-data" >/dev/null 2>&1 || true - kubectl --kubeconfig="${KUBECONFIG_TARGET}" config set-cluster "${context}" \ - --server="${new_server}" --insecure-skip-tls-verify=true >/dev/null - else - echo "warning: running inside a container but could not detect a default-route gateway; kubectl may fail to reach the API server." >&2 - fi - fi - fi } apply_base_manifests() {