diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index 986ce1490..0db44f1f8 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -63,7 +63,7 @@ The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or ### TLS behaviour -`values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run +`ci/values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run plaintext by default. To test with TLS enabled, comment out that line and redeploy. | Mode | `server.disableTls` | Gateway scheme | @@ -160,7 +160,7 @@ imports the openshell realm from `scripts/keycloak-realm.json`, and prints a por command for acquiring tokens from the CLI. Then activate OIDC in the OpenShell Helm chart: -1. Uncomment `#- values-keycloak.yaml` in `skaffold.yaml` +1. Uncomment `#- ci/values-keycloak.yaml` in `skaffold.yaml` 2. Redeploy: `mise run helm:skaffold:run` To remove Keycloak: @@ -191,10 +191,12 @@ mise run helm:k3s:status |------|---------| | `deploy/helm/openshell/skaffold.yaml` | Skaffold config — images, Helm releases, values overlays | | `deploy/helm/openshell/values.yaml` | Default Helm values | -| `deploy/helm/openshell/values-skaffold.yaml` | Dev overrides (image pull policy, local image names) | -| `deploy/helm/openshell/values-cert-manager.yaml` | cert-manager TLS overlay (opt-in; disables pkiInitJob) | -| `deploy/helm/openshell/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | -| `deploy/helm/openshell/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-skaffold.yaml` | Dev overrides (image pull policy, TLS disabled for local Skaffold) | +| `deploy/helm/openshell/ci/values-cert-manager.yaml` | cert-manager PKI overlay (opt-in; disables pkiInitJob) | +| `deploy/helm/openshell/ci/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | +| 
`deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) | | `deploy/kube/manifests/envoy-gateway-openshell.yaml` | GatewayClass for Envoy Gateway (`mise run helm:gateway:apply`) | | `tasks/scripts/helm-k3s-local.sh` | k3d cluster create/delete/start/stop/status | +| `tasks/scripts/helm-e2e.sh` | Bootstrap k3d cluster and run Rust + Python e2e via Helm | | `tasks/scripts/keycloak-k8s-setup.sh` | Keycloak deploy + realm import | diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml new file mode 100644 index 000000000..e804fafd5 --- /dev/null +++ b/.github/workflows/branch-helm-e2e.yml @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: Branch Helm E2E + +on: + push: + branches: + - "pull-request/[0-9]+" + workflow_dispatch: {} + +permissions: {} + +jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - uses: actions/checkout@v6 + + - id: gate + uses: ./.github/actions/pr-gate + with: + required_label: test:e2e-helm + + build-gateway: + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + permissions: + contents: read + packages: write + uses: ./.github/workflows/docker-build.yml + with: + component: gateway + platform: linux/amd64 + + build-supervisor: + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + permissions: + contents: read + packages: write + uses: ./.github/workflows/docker-build.yml + with: + component: supervisor + platform: linux/amd64 + + helm-e2e-rust: + name: Helm E2E (rust) + needs: [pr_metadata, build-gateway, build-supervisor] + if: 
needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 + timeout-minutes: 60 + permissions: + contents: read + packages: read + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-rust + HELM_E2E_IMAGE_TAG: ${{ github.sha }} + HELM_E2E_SKIP_CLUSTER: "1" + HELM_E2E_IMAGE_LOADER: kind + steps: + - uses: actions/checkout@v6 + + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Install mise + run: | + curl https://mise.run | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + echo "$HOME/.local/share/mise/shims" >> "$GITHUB_PATH" + + - name: Install tools + run: mise install --locked + + - name: Create kind cluster + uses: helm/kind-action@v1 + with: + cluster_name: ${{ env.HELM_E2E_CLUSTER_NAME }} + wait: 120s + + - name: Run Helm E2E (Rust) + run: mise run e2e:helm:rust + + helm-e2e-python: + name: Helm E2E (python) + needs: [pr_metadata, build-gateway, build-supervisor] + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 + timeout-minutes: 60 + permissions: + contents: read + packages: read + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HELM_E2E_CLUSTER_NAME: helm-e2e-${{ github.run_id }}-python + HELM_E2E_IMAGE_TAG: ${{ github.sha }} + HELM_E2E_SKIP_CLUSTER: "1" + HELM_E2E_IMAGE_LOADER: kind + steps: + - uses: actions/checkout@v6 + + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Install mise + run: | + curl https://mise.run | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + echo "$HOME/.local/share/mise/shims" >> "$GITHUB_PATH" + + - name: Install tools + run: mise install --locked + + - name: Install Python dependencies + run: uv sync --frozen && mise run --no-deps python:proto + + - name: Create kind cluster + uses: helm/kind-action@v1 + with: + cluster_name: ${{ 
env.HELM_E2E_CLUSTER_NAME }} + wait: 120s + + - name: Run Helm E2E (Python) + run: mise run e2e:helm:python diff --git a/.github/workflows/e2e-gate.yml b/.github/workflows/e2e-gate.yml index 67959fa8d..0155a13d2 100644 --- a/.github/workflows/e2e-gate.yml +++ b/.github/workflows/e2e-gate.yml @@ -4,7 +4,7 @@ on: pull_request: types: [opened, synchronize, reopened, labeled, unlabeled, ready_for_review] workflow_run: - workflows: ["Branch E2E Checks", "GPU Test"] + workflows: ["Branch E2E Checks", "GPU Test", "Branch Helm E2E"] types: [completed] permissions: {} @@ -36,6 +36,18 @@ jobs: required_label: test:e2e-gpu workflow_file: test-gpu.yml + helm-e2e: + name: Helm E2E + if: github.event_name == 'pull_request' + permissions: + contents: read + pull-requests: read + actions: read + uses: ./.github/workflows/e2e-gate-check.yml + with: + required_label: test:e2e-helm + workflow_file: branch-helm-e2e.yml + # When the guarded workflow finishes, GitHub fires `workflow_run` in the # default-branch context — any check posted from here would land on `main`, # not on the PR. 
Instead, find the latest `pull_request`-triggered gate run diff --git a/.github/workflows/e2e-label-help.yml b/.github/workflows/e2e-label-help.yml index 2a61660d2..9d534b0ed 100644 --- a/.github/workflows/e2e-label-help.yml +++ b/.github/workflows/e2e-label-help.yml @@ -19,7 +19,10 @@ permissions: {} jobs: hint: name: Post next-step hint for E2E label - if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu' + if: | + github.event.label.name == 'test:e2e' || + github.event.label.name == 'test:e2e-gpu' || + github.event.label.name == 'test:e2e-helm' runs-on: ubuntu-latest permissions: pull-requests: write @@ -40,6 +43,7 @@ jobs: case "$LABEL_NAME" in test:e2e) workflow_file=branch-e2e.yml; workflow_name="Branch E2E Checks" ;; test:e2e-gpu) workflow_file=test-gpu.yml; workflow_name="GPU Test" ;; + test:e2e-helm) workflow_file=branch-helm-e2e.yml; workflow_name="Branch Helm E2E" ;; *) echo "Unrecognized label $LABEL_NAME"; exit 1 ;; esac diff --git a/.github/workflows/helm-lint.yml b/.github/workflows/helm-lint.yml new file mode 100644 index 000000000..8b7184133 --- /dev/null +++ b/.github/workflows/helm-lint.yml @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +name: Helm Lint + +on: + push: + branches: + - "pull-request/[0-9]+" + paths: + - "deploy/helm/**" + workflow_dispatch: + +env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + +permissions: + contents: read + packages: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - uses: actions/checkout@v6 + + - id: gate + uses: ./.github/actions/pr-gate + + helm-lint: + name: Helm Lint + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v6 + + - name: Install tools + run: mise install --locked + + - name: Lint Helm chart + run: mise run helm:lint diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 359789ffd..746aaad4a 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -48,6 +48,7 @@ WORKDIR /build COPY Cargo.toml Cargo.lock ./ COPY crates/ crates/ COPY proto/ proto/ +COPY providers/ providers/ RUN --mount=type=cache,target=/usr/local/cargo/registry \ --mount=type=cache,target=/build/target \ diff --git a/deploy/helm/openshell/.helmignore b/deploy/helm/openshell/.helmignore index 798d0e7c8..a12325802 100644 --- a/deploy/helm/openshell/.helmignore +++ b/deploy/helm/openshell/.helmignore @@ -19,8 +19,4 @@ # Ignore development files skaffold.yaml -values-keycloak.yaml -values-ingress.yaml -values-gateway.yaml -values-cert-manager.yaml -values-skaffold.yaml +ci/ diff --git a/deploy/helm/openshell/values-cert-manager.yaml b/deploy/helm/openshell/ci/values-cert-manager.yaml similarity index 84% 
rename from deploy/helm/openshell/values-cert-manager.yaml rename to deploy/helm/openshell/ci/values-cert-manager.yaml index bb024d716..ed99c8b46 100644 --- a/deploy/helm/openshell/values-cert-manager.yaml +++ b/deploy/helm/openshell/ci/values-cert-manager.yaml @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # Merge after values.yaml when cert-manager CRDs are installed, e.g.: -# helm install ... -f values.yaml -f values-cert-manager.yaml +# helm install ... -f values.yaml -f ci/values-cert-manager.yaml # Or add this file to skaffold manifests.helm.releases[].valuesFiles. server: disableTls: false diff --git a/deploy/helm/openshell/values-gateway.yaml b/deploy/helm/openshell/ci/values-gateway.yaml similarity index 92% rename from deploy/helm/openshell/values-gateway.yaml rename to deploy/helm/openshell/ci/values-gateway.yaml index c43a4cd45..196192213 100644 --- a/deploy/helm/openshell/values-gateway.yaml +++ b/deploy/helm/openshell/ci/values-gateway.yaml @@ -5,7 +5,7 @@ # # Requires Envoy Gateway in the cluster (installed via skaffold.yaml). # Add this file to the openshell release valuesFiles to activate: -# uncomment values-gateway.yaml in deploy/helm/openshell/skaffold.yaml +# uncomment ci/values-gateway.yaml in deploy/helm/openshell/skaffold.yaml # # Envoy Gateway will create an Envoy proxy Deployment and a LoadBalancer # Service (named envoy---*) in the openshell namespace. diff --git a/deploy/helm/openshell/values-keycloak.yaml b/deploy/helm/openshell/ci/values-keycloak.yaml similarity index 95% rename from deploy/helm/openshell/values-keycloak.yaml rename to deploy/helm/openshell/ci/values-keycloak.yaml index 42bb2ad4e..cc6ca658b 100644 --- a/deploy/helm/openshell/values-keycloak.yaml +++ b/deploy/helm/openshell/ci/values-keycloak.yaml @@ -8,7 +8,7 @@ # # Then layer this file on top of values.yaml when deploying: # helm upgrade --install openshell . 
\ -# -f values.yaml -f values-skaffold.yaml -f values-keycloak.yaml +# -f values.yaml -f ci/values-skaffold.yaml -f ci/values-keycloak.yaml # # Or add this file to skaffold.yaml valuesFiles for iterative dev. # diff --git a/deploy/helm/openshell/values-skaffold.yaml b/deploy/helm/openshell/ci/values-skaffold.yaml similarity index 100% rename from deploy/helm/openshell/values-skaffold.yaml rename to deploy/helm/openshell/ci/values-skaffold.yaml diff --git a/deploy/helm/openshell/ci/values-tls-disabled.yaml b/deploy/helm/openshell/ci/values-tls-disabled.yaml new file mode 100644 index 000000000..ea7c7900c --- /dev/null +++ b/deploy/helm/openshell/ci/values-tls-disabled.yaml @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# CI lint target: TLS disabled (plaintext HTTP, no client cert requirement). +# Typical when a reverse proxy or tunnel terminates TLS at the edge. +server: + disableTls: true + disableGatewayAuth: true +pkiInitJob: + enabled: false diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index fe7b96cf2..2de9ee4e6 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -87,16 +87,16 @@ deploy: createNamespace: true valuesFiles: - values.yaml - - values-skaffold.yaml - # Add values-cert-manager.yaml here (and uncomment the cert-manager + - ci/values-skaffold.yaml + # Add ci/values-cert-manager.yaml here (and uncomment the cert-manager # release above) to switch from pkiInitJob to cert-manager for PKI. 
- #- values-cert-manager.yaml + #- ci/values-cert-manager.yaml # To enable OIDC with a local Keycloak instance, run the one-time # setup task first, then uncomment the line below: # mise run keycloak:k8s:setup - #- values-keycloak.yaml + #- ci/values-keycloak.yaml # To enable the Gateway API HTTPRoute (requires Envoy Gateway above): - #- values-gateway.yaml + #- ci/values-gateway.yaml setValueTemplates: image.repository: '{{.IMAGE_REPO_openshell_gateway}}' image.tag: '{{.IMAGE_TAG_openshell_gateway}}' diff --git a/tasks/helm.toml b/tasks/helm.toml index c7949865b..9ef2ae832 100644 --- a/tasks/helm.toml +++ b/tasks/helm.toml @@ -4,9 +4,18 @@ # Helm chart tasks ["helm:lint"] -description = "Lint the openshell helm chart" -run = "helm lint deploy/helm/openshell" -hide = true +description = "Lint the openshell Helm chart (defaults + all CI configuration variants)" +run = """ + set -e + echo "--- helm lint: defaults ---" + helm lint deploy/helm/openshell + for f in deploy/helm/openshell/ci/values-*.yaml; do + variant=$(basename "$f" .yaml | sed 's/values-//') + echo "--- helm lint: $variant ---" + helm lint deploy/helm/openshell -f "$f" + done + echo "All variants passed." 
+""" ["helm:skaffold:dev"] description = "Run skaffold dev for deploy/helm/openshell (iterative deploy)" @@ -59,3 +68,24 @@ hide = true ["helm:gateway:apply"] description = "Apply the Envoy GatewayClass manifest (run after helm:skaffold:run when gateway routing is enabled)" run = "kubectl apply -f deploy/kube/manifests/envoy-gateway-openshell.yaml" + +# Helm e2e — boots a k3d cluster via the Helm path and runs the Rust + Python suites + +["e2e:helm"] +description = "Bootstrap Helm k3d cluster and run Rust + Python e2e suites" +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:rust"] +description = "Bootstrap Helm k3d cluster and run Rust e2e only" +env = { HELM_E2E_SUITE = "rust" } +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:python"] +description = "Bootstrap Helm k3d cluster and run Python e2e only" +env = { HELM_E2E_SUITE = "python" } +run = "tasks/scripts/helm-e2e.sh" + +["e2e:helm:cert-manager"] +description = "Bootstrap Helm k3d cluster with cert-manager PKI and run full e2e" +env = { HELM_E2E_PKI = "cert-manager" } +run = "tasks/scripts/helm-e2e.sh" diff --git a/tasks/scripts/helm-e2e.sh b/tasks/scripts/helm-e2e.sh new file mode 100755 index 000000000..1d453ef66 --- /dev/null +++ b/tasks/scripts/helm-e2e.sh @@ -0,0 +1,346 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run Rust and/or Python e2e tests against a gateway deployed via the Helm chart +# on a local k3d cluster (k3s backed by Docker). +# +# The script follows the same preflight → bootstrap → register → test → cleanup +# pattern as e2e/rust/e2e-docker.sh, but uses k3d + Skaffold + Helm instead of +# a standalone gateway process. 
#
# Usage:
#   mise run e2e:helm                # full suite, pkiInitJob PKI
#   mise run e2e:helm:rust           # Rust only
#   mise run e2e:helm:python         # Python only
#   mise run e2e:helm:cert-manager   # full suite, cert-manager PKI
#
# Environment variables:
#   HELM_E2E_SUITE           rust | python | all  (default: all)
#   HELM_E2E_PKI             pki-init | cert-manager  (default: pki-init)
#   HELM_E2E_KEEP_CLUSTER    1 to skip cluster deletion on exit  (default: 0)
#   HELM_E2E_CLUSTER_NAME    override cluster name  (default: derived from branch)
#   HELM_E2E_SKIP_CLUSTER    1 if the caller has already provisioned the cluster
#                            (and KUBECONFIG points at it). The script will not
#                            create or delete the cluster. Used by CI, where
#                            helm/kind-action provisions a kind cluster before
#                            this script runs.
#   HELM_E2E_IMAGE_LOADER    k3d | kind | none — which loader to use to import
#                            the gateway and supervisor images into the cluster
#                            (default: k3d for local dev; CI sets kind)
#   HELM_E2E_IMAGE_TAG       if set, pull gateway+supervisor images from
#                            HELM_E2E_IMAGE_REGISTRY at this tag instead of
#                            building them locally (used by CI to reuse the
#                            images produced by docker-build.yml)
#   HELM_E2E_IMAGE_REGISTRY  registry to pull pre-built images from
#                            (default: ghcr.io/nvidia/openshell)
#   KUBECONFIG               path to kubeconfig
#                            (default: the file "kubeconfig" in the repository
#                            root, i.e. ${ROOT}/kubeconfig)
#   E2E_PARALLEL             value passed to "pytest -n" for the Python suite
#                            (default: 5)
#   OPENSHELL_PROVISION_TIMEOUT  sandbox ready timeout in seconds  (default: 300)

# Abort on command failure, on use of an unset variable, and on failure of any
# stage of a pipeline.
set -euo pipefail

# Repository root: two directories above this script (tasks/scripts/).
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Run configuration, each with the default documented in the header above.
SUITE="${HELM_E2E_SUITE:-all}"
PKI_MODE="${HELM_E2E_PKI:-pki-init}"
KEEP_CLUSTER="${HELM_E2E_KEEP_CLUSTER:-0}"
SKIP_CLUSTER="${HELM_E2E_SKIP_CLUSTER:-0}"
IMAGE_LOADER="${HELM_E2E_IMAGE_LOADER:-k3d}"

# Derive cluster name the same way helm-k3s-local.sh does (last path component of branch).
# Prints "openshell-dev-SUFFIX", where SUFFIX is the text after the last "/" of
# the current branch name, truncated to 18 characters. Falls back to the
# literal "unknown" when git cannot resolve the branch.
_branch_cluster_name() {
  local branch
  branch="$(git -C "${ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")"
  local suffix="${branch##*/}"
  suffix="${suffix:0:18}"
  echo "openshell-dev-${suffix}"
}

CLUSTER_NAME="${HELM_E2E_CLUSTER_NAME:-$(_branch_cluster_name)}"
export KUBECONFIG="${KUBECONFIG:-${ROOT}/kubeconfig}"

# Per-run state. PF_PID, PORT and CLUSTER_CREATED start empty/zero and are
# filled in later by the script; cleanup() reads them to decide what to undo.
WORKDIR="$(mktemp -d "/tmp/openshell-helm-e2e.XXXXXX")"
GATEWAY_NAME="openshell-helm-e2e-${CLUSTER_NAME}"
GATEWAY_CONFIG_DIR="${HOME}/.config/openshell/gateways/${GATEWAY_NAME}"
PF_PID=""
PORT=""
CLUSTER_CREATED=0

# EXIT handler: stops the port-forward, removes the gateway registration and
# the scratch directory, and deletes the cluster only if this run created it
# and HELM_E2E_KEEP_CLUSTER is not 1. Every step is best-effort so that one
# failing step cannot prevent the remaining ones from running.
cleanup() {
  local exit_code=$?

  if [ -n "${PF_PID}" ] && kill -0 "${PF_PID}" 2>/dev/null; then
    echo "Stopping kubectl port-forward (pid ${PF_PID})..."
    kill "${PF_PID}" 2>/dev/null || true
    wait "${PF_PID}" 2>/dev/null || true
  fi

  if [ -d "${GATEWAY_CONFIG_DIR}" ]; then
    # Guarded: with `set -e` an unguarded failure here would abort the handler
    # before the cluster and scratch directory are cleaned up.
    rm -rf -- "${GATEWAY_CONFIG_DIR}" || true
  fi

  if [ "${KEEP_CLUSTER}" = "1" ]; then
    echo "Keeping cluster '${CLUSTER_NAME}' (HELM_E2E_KEEP_CLUSTER=1)."
  elif [ "${CLUSTER_CREATED}" = "1" ]; then
    echo "Deleting cluster '${CLUSTER_NAME}'..."
    HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \
      bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" delete 2>/dev/null || true
  fi

  rm -rf -- "${WORKDIR}" 2>/dev/null || true

  if [ "${exit_code}" -ne 0 ]; then
    # Diagnostics go to stderr, like the other error messages in this script.
    echo "helm-e2e failed (exit ${exit_code})." >&2
  fi
}
trap cleanup EXIT

# ── Preflight ────────────────────────────────────────────────────────────────
# Exits with status 2 unless the command named by $1 is found in PATH.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "ERROR: '$1' is required but not found in PATH" >&2
    exit 2
  fi
}

require_cmd helm
require_cmd kubectl
require_cmd docker
require_cmd openssl
# pick_port shells out to python3; check for it here so a missing interpreter
# is reported before the cluster is created and the images are built.
require_cmd python3

# k3d is only needed when this script manages the cluster lifecycle. CI hands
# us a pre-existing kind cluster via HELM_E2E_SKIP_CLUSTER=1.
# k3d is required when this script creates/reuses the cluster itself, and also
# whenever k3d is the image loader, because `k3d image import` runs later even
# if the cluster was provisioned by the caller.
if [ "${SKIP_CLUSTER}" != "1" ] || [ "${IMAGE_LOADER}" = "k3d" ]; then
  require_cmd k3d
fi
case "${IMAGE_LOADER}" in
  k3d|kind|none) ;;
  *)
    echo "ERROR: unknown HELM_E2E_IMAGE_LOADER '${IMAGE_LOADER}' (must be k3d, kind, or none)" >&2
    exit 2
    ;;
esac
if [ "${IMAGE_LOADER}" = "kind" ]; then
  require_cmd kind
fi

# Reject bad suite / PKI selections now instead of after the cluster has been
# bootstrapped and the images built. The suite message and exit status match
# the check performed where the suites are dispatched.
case "${SUITE}" in
  rust|python|all) ;;
  *)
    echo "ERROR: unknown HELM_E2E_SUITE '${SUITE}' (must be rust, python, or all)" >&2
    exit 2
    ;;
esac
case "${PKI_MODE}" in
  pki-init|cert-manager) ;;
  *)
    # Without this check any unrecognised value silently behaves as pki-init.
    echo "ERROR: unknown HELM_E2E_PKI '${PKI_MODE}' (must be pki-init or cert-manager)" >&2
    exit 2
    ;;
esac

if ! docker info >/dev/null 2>&1; then
  echo "ERROR: docker daemon is not reachable" >&2
  exit 2
fi

echo "=== helm-e2e: suite=${SUITE} pki=${PKI_MODE} cluster=${CLUSTER_NAME} ==="

# ── Cluster ──────────────────────────────────────────────────────────────────
# Three cases: caller-provided cluster, existing k3d cluster to reuse, or a
# fresh k3d cluster created (and later deleted) by this run.
if [ "${SKIP_CLUSTER}" = "1" ]; then
  echo "Using pre-existing cluster '${CLUSTER_NAME}' (HELM_E2E_SKIP_CLUSTER=1)."
elif k3d cluster get "${CLUSTER_NAME}" >/dev/null 2>&1; then
  echo "Reusing existing k3d cluster '${CLUSTER_NAME}'."
  # Refresh kubeconfig in case it's stale.
  k3d kubeconfig write "${CLUSTER_NAME}" --output "${KUBECONFIG}" >/dev/null
else
  echo "Creating k3d cluster '${CLUSTER_NAME}'..."
  # Mark ownership before attempting creation: if the create script fails part
  # way through, the EXIT handler still tries to delete whatever was created.
  CLUSTER_CREATED=1
  HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \
    bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" create
fi

# ── cert-manager (optional) ──────────────────────────────────────────────────
if [ "${PKI_MODE}" = "cert-manager" ]; then
  echo "Installing cert-manager..."
  # Best-effort: a failure to add/refresh the repo is ignored here and will
  # surface in the install command below if the chart cannot be resolved.
  helm repo add jetstack https://charts.jetstack.io --force-update >/dev/null 2>&1 || true
  helm upgrade --install cert-manager jetstack/cert-manager \
    --namespace cert-manager --create-namespace \
    --set crds.enabled=true \
    --wait 2>&1
fi

# ── Build images ─────────────────────────────────────────────────────────────
# Use a fixed local tag so the image names are stable across runs and Helm
# can reference them without Skaffold's digest-based tags.
GATEWAY_IMAGE="openshell/gateway:helm-e2e"
SUPERVISOR_IMAGE="openshell/supervisor:helm-e2e"

# Pull the pre-built image for component $1 from REGISTRY at
# HELM_E2E_IMAGE_TAG and retag it locally as $2.
_pull_prebuilt_image() {
  local component="$1"
  local local_ref="$2"
  local remote_ref="${REGISTRY}/${component}:${HELM_E2E_IMAGE_TAG}"

  echo "Pulling pre-built ${component} image (${remote_ref})..."
  docker pull "${remote_ref}"
  docker tag "${remote_ref}" "${local_ref}"
}

# Build Dockerfile.images target $1 from source and load it into the local
# Docker image store as $2.
_build_local_image() {
  local component="$1"
  local local_ref="$2"

  echo "Building ${component} image..."
  docker buildx build \
    --build-arg BUILD_FROM_SOURCE=1 \
    --target "${component}" \
    --tag "${local_ref}" \
    --load \
    --file "${ROOT}/deploy/docker/Dockerfile.images" \
    "${ROOT}" 2>&1
}

# A non-empty HELM_E2E_IMAGE_TAG selects the pull path; otherwise build.
if [[ -n "${HELM_E2E_IMAGE_TAG:-}" ]]; then
  REGISTRY="${HELM_E2E_IMAGE_REGISTRY:-ghcr.io/nvidia/openshell}"
  _pull_prebuilt_image gateway "${GATEWAY_IMAGE}"
  _pull_prebuilt_image supervisor "${SUPERVISOR_IMAGE}"
else
  _build_local_image gateway "${GATEWAY_IMAGE}"
  _build_local_image supervisor "${SUPERVISOR_IMAGE}"
fi

# Load images into the cluster nodes.
if [[ "${IMAGE_LOADER}" == "k3d" ]]; then
  echo "Loading images into k3d cluster '${CLUSTER_NAME}'..."
  k3d image import "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" -c "${CLUSTER_NAME}" 2>&1
elif [[ "${IMAGE_LOADER}" == "kind" ]]; then
  echo "Loading images into kind cluster '${CLUSTER_NAME}'..."
  kind load docker-image "${GATEWAY_IMAGE}" "${SUPERVISOR_IMAGE}" --name "${CLUSTER_NAME}" 2>&1
elif [[ "${IMAGE_LOADER}" == "none" ]]; then
  echo "Skipping image load (HELM_E2E_IMAGE_LOADER=none); the cluster must already have ${GATEWAY_IMAGE} and ${SUPERVISOR_IMAGE}."
fi

# ── Deploy via Helm ───────────────────────────────────────────────────────────
# Values files: chart defaults, plus the cert-manager overlay in that PKI mode.
HELM_VALUES_FLAGS=(-f "${ROOT}/deploy/helm/openshell/values.yaml")
if [[ "${PKI_MODE}" == "cert-manager" ]]; then
  HELM_VALUES_FLAGS+=(-f "${ROOT}/deploy/helm/openshell/ci/values-cert-manager.yaml")
fi

# The image overrides point the chart at the fixed local tags loaded above and
# set pullPolicy=Never for both images.
helm_deploy_args=(
  upgrade --install openshell "${ROOT}/deploy/helm/openshell"
  --namespace openshell --create-namespace
  "${HELM_VALUES_FLAGS[@]}"
  --set "image.repository=openshell/gateway"
  --set "image.tag=helm-e2e"
  --set "image.pullPolicy=Never"
  --set "supervisor.image.repository=openshell/supervisor"
  --set "supervisor.image.tag=helm-e2e"
  --set "supervisor.image.pullPolicy=Never"
  --wait --timeout 180s
)

echo "Deploying OpenShell via Helm (PKI: ${PKI_MODE})..."
helm "${helm_deploy_args[@]}" 2>&1

# ── Wait for PKI ─────────────────────────────────────────────────────────────
if [[ "${PKI_MODE}" == "cert-manager" ]]; then
  echo "Waiting for cert-manager certificates to be ready..."
  kubectl wait --for=condition=Ready certificate/openshell-server certificate/openshell-client \
    -n openshell --timeout=120s
else
  echo "Waiting for pkiInitJob secrets..."
  # Poll every 3 seconds, for at most 60 seconds, until the client TLS secret
  # exists in the openshell namespace.
  pki_secret_found=0
  for (( elapsed = 0; elapsed < 60; elapsed += 3 )); do
    if kubectl get secret openshell-client-tls -n openshell >/dev/null 2>&1; then
      echo "PKI secrets ready after ${elapsed}s."
      pki_secret_found=1
      break
    fi
    sleep 3
  done
  if [[ "${pki_secret_found}" -eq 0 ]]; then
    echo "ERROR: pkiInitJob secrets not created within 60s" >&2
    exit 1
  fi
fi

# ── Port-forward ─────────────────────────────────────────────────────────────
# Prints a port number obtained by binding a socket to port 0 and reading back
# the port the OS assigned; the socket is closed again before returning.
pick_port() {
  python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()'
}
PORT="$(pick_port)"

echo "Port-forwarding openshell service → localhost:${PORT}..."
# Runs in the background; output is captured in the scratch directory and the
# PID is recorded so the EXIT handler can stop it.
kubectl port-forward -n openshell svc/openshell "${PORT}:8080" \
  >"${WORKDIR}/pf.log" 2>&1 &
PF_PID=$!
+ +# ── Register gateway with CLI ───────────────────────────────────────────────── +mkdir -p "${GATEWAY_CONFIG_DIR}/mtls" + +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.ca\.crt}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/ca.crt" +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.tls\.crt}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/tls.crt" +kubectl get secret openshell-client-tls -n openshell \ + -o jsonpath='{.data.tls\.key}' | base64 -d > "${GATEWAY_CONFIG_DIR}/mtls/tls.key" + +cat >"${GATEWAY_CONFIG_DIR}/metadata.json" <&1 +fi + +echo "Waiting for gateway to become healthy (port ${PORT})..." +elapsed=0 +timeout=120 +while [ "${elapsed}" -lt "${timeout}" ]; do + if ! kill -0 "${PF_PID}" 2>/dev/null; then + echo "ERROR: port-forward exited unexpectedly" >&2 + cat "${WORKDIR}/pf.log" || true + exit 1 + fi + if "${CLI_BIN}" status --gateway "${GATEWAY_NAME}" >/dev/null 2>&1; then + echo "Gateway healthy after ${elapsed}s." + break + fi + sleep 3 + elapsed=$((elapsed + 3)) +done +if [ "${elapsed}" -ge "${timeout}" ]; then + echo "ERROR: gateway did not become healthy within ${timeout}s" >&2 + cat "${WORKDIR}/pf.log" || true + exit 1 +fi + +# ── Run test suites ─────────────────────────────────────────────────────────── +run_rust() { + echo "--- Running Rust e2e ---" + cargo build -p openshell-cli --features openshell-core/dev-settings + cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- \ + --skip gateway_resume_scenarios \ + --skip docker_gpu_sandbox_runs_nvidia_smi \ + --skip sandbox_from_custom_dockerfile \ + --skip graphql_l7_enforces_allow_and_deny_rules_on_forward_and_connect_paths \ + --skip forward_proxy_allows_l7_permitted_request \ + --skip sandbox_reaches_host_openshell_internal_via_host_gateway_alias \ + --skip sandbox_inference_local_routes_to_host_openshell_internal \ + --nocapture +} + +run_python() { + echo "--- Running Python e2e ---" + mise run --no-deps python:proto + 
UV_NO_SYNC=1 PYTHONPATH=python uv run pytest \ + -o python_files='test_*.py' \ + -m 'not gpu' \ + -n "${E2E_PARALLEL:-5}" \ + e2e/python +} + +case "${SUITE}" in + rust) run_rust ;; + python) run_python ;; + all) run_rust; run_python ;; + *) + echo "ERROR: unknown HELM_E2E_SUITE '${SUITE}' (must be rust, python, or all)" >&2 + exit 2 + ;; +esac + +echo "=== helm-e2e: all suites passed ==="