diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index f330aaa47..cb2f5440e 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -35,30 +35,8 @@ jobs: component: gateway platform: linux/arm64 - build-cluster: - needs: [pr_metadata] - if: needs.pr_metadata.outputs.should_run == 'true' - permissions: - contents: read - packages: write - uses: ./.github/workflows/docker-build.yml - with: - component: cluster - platform: linux/arm64 - - build-supervisor: - needs: [pr_metadata] - if: needs.pr_metadata.outputs.should_run == 'true' - permissions: - contents: read - packages: write - uses: ./.github/workflows/docker-build.yml - with: - component: supervisor - platform: linux/arm64 - e2e: - needs: [pr_metadata, build-gateway, build-cluster, build-supervisor] + needs: [pr_metadata, build-gateway] if: needs.pr_metadata.outputs.should_run == 'true' permissions: contents: read diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index d34576863..536cfb2a5 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -27,17 +27,9 @@ jobs: matrix: include: - suite: python - cluster: e2e-python - port: "8080" cmd: "mise run --no-deps --skip-deps e2e:python" - suite: rust - cluster: e2e-rust - port: "8081" cmd: "mise run --no-deps --skip-deps e2e:rust" - - suite: gateway-resume - cluster: e2e-resume - port: "8082" - cmd: "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test gateway_resume" container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -46,6 +38,7 @@ jobs: options: --privileged volumes: - /var/run/docker.sock:/var/run/docker.sock + - /home/runner/_work:/home/runner/_work env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} IMAGE_TAG: ${{ inputs.image-tag }} @@ -54,37 +47,19 @@ jobs: OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }} OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_GATEWAY: ${{ matrix.cluster }} steps: - uses: actions/checkout@v6 - name: Log in to GHCR run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - name: Pull cluster image - run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} - - name: Install Python dependencies and generate protobuf stubs if: matrix.suite == 'python' run: uv sync --frozen && mise run --no-deps python:proto - - name: Build Rust CLI - if: matrix.suite != 'python' - run: cargo build -p openshell-cli --features openshell-core/dev-settings - - name: Install SSH client if: matrix.suite != 'python' run: apt-get update && apt-get install -y --no-install-recommends openssh-client && rm -rf /var/lib/apt/lists/* - - name: Bootstrap cluster - env: - GATEWAY_HOST: host.docker.internal - GATEWAY_PORT: ${{ matrix.port }} - CLUSTER_NAME: ${{ matrix.cluster }} - SKIP_IMAGE_PUSH: "1" - SKIP_CLUSTER_IMAGE_BUILD: "1" - OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }} - run: mise run --no-deps --skip-deps cluster - - name: Run tests run: ${{ matrix.cmd }} diff --git a/TESTING.md b/TESTING.md index eab16603b..c356b4a62 100644 --- a/TESTING.md +++ b/TESTING.md @@ -4,7 +4,7 @@ ```bash mise run test # Rust + Python unit tests -mise run e2e # End-to-end tests (requires a running cluster) +mise run e2e # End-to-end tests (starts a Docker-backed gateway) mise run ci # Everything: lint, compile checks, and tests ``` @@ -72,8 +72,17 @@ mise run test:python # uv run pytest python/ ## E2E Tests -E2E tests run against a live cluster. `mise run e2e` deploys changed components -before running the suite. +E2E tests run against a live gateway. By default, `mise run e2e` starts an +ephemeral standalone gateway with the Docker compute driver, runs the suite, +and cleans it up afterward. To run the suite against an existing plaintext +gateway, set `OPENSHELL_GATEWAY_ENDPOINT`: + +```bash +OPENSHELL_GATEWAY_ENDPOINT=http://127.0.0.1:18080 mise run e2e +``` + +Raw endpoint mode is HTTP-only. Use a named gateway config when a gateway +requires mTLS. ### Python E2E (`e2e/python/`) @@ -125,7 +134,7 @@ def test_multiply(sandbox): | Fixture | Scope | Purpose | |---|---|---| -| `sandbox_client` | session | gRPC client connected to the active cluster | +| `sandbox_client` | session | gRPC client connected to the active gateway | | `sandbox` | function | Factory returning a `Sandbox` context manager | | `inference_client` | session | Client for managing inference routes | | `mock_inference_route` | session | Creates a mock OpenAI-protocol route for tests | @@ -168,3 +177,4 @@ The harness (`e2e/rust/src/harness/`) provides: | Variable | Purpose | |---|---| | `OPENSHELL_GATEWAY` | Override active gateway name for E2E tests | +| `OPENSHELL_GATEWAY_ENDPOINT` | Run E2E tests against an existing plaintext HTTP gateway endpoint | diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 2241e3e90..44c274203 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -231,7 +231,9 @@ impl DockerComputeDriver { } let network_name = docker_network_name(docker_config); let bridge_gateway_ip = ensure_bridge_network(&docker, &network_name).await?; - let gateway_route = docker_gateway_route(&info, bridge_gateway_ip, gateway_port); + let host_gateway_ip = parse_optional_host_gateway_ip(&config.host_gateway_ip)?; + let gateway_route = + docker_gateway_route(&info, bridge_gateway_ip, gateway_port, host_gateway_ip); let grpc_endpoint = docker_container_openshell_endpoint( &config.grpc_endpoint, HOST_OPENSHELL_INTERNAL, @@ -797,11 +799,21 @@ impl ComputeDriver for DockerComputeDriver { let request = request.into_inner(); require_sandbox_identifier(&request.sandbox_id, &request.sandbox_name)?; - Ok(Response::new(DeleteSandboxResponse { - deleted: self - .delete_sandbox_inner(&request.sandbox_id, &request.sandbox_name) - .await?, - })) + let event_sandbox_id = request.sandbox_id.clone(); + let deleted = self + .delete_sandbox_inner(&request.sandbox_id, &request.sandbox_name) + .await?; + if deleted && !event_sandbox_id.is_empty() { + let _ = self.events.send(WatchSandboxesEvent { + payload: Some(watch_sandboxes_event::Payload::Deleted( + WatchSandboxesDeletedEvent { + sandbox_id: event_sandbox_id, + }, + )), + }); + } + + Ok(Response::new(DeleteSandboxResponse { deleted })) } async fn watch_sandboxes( @@ -1064,11 +1076,32 @@ fn docker_network_name(config: &DockerComputeConfig) -> String { name.to_string() } +fn parse_optional_host_gateway_ip(value: &str) -> CoreResult> { + let trimmed = value.trim(); + if trimmed.is_empty() { + return Ok(None); + } + + trimmed.parse().map(Some).map_err(|err| { + Error::config(format!( + "invalid OPENSHELL_HOST_GATEWAY_IP value '{trimmed}': {err}" + )) + }) +} + fn docker_gateway_route( info: &SystemInfo, bridge_gateway_ip: IpAddr, port: u16, + host_gateway_ip: Option, ) -> DockerGatewayRoute { + if let Some(host_alias_ip) = host_gateway_ip { + return DockerGatewayRoute::Bridge { + bind_address: SocketAddr::new(host_alias_ip, port), + host_alias_ip, + }; + } + if is_docker_desktop(info) { DockerGatewayRoute::HostGateway } else { diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index ae93f5b66..e41f2688e 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -154,6 +154,7 @@ fn docker_gateway_route_uses_host_gateway_for_docker_desktop() { &info, IpAddr::V4(Ipv4Addr::new(172, 18, 0, 1)), DEFAULT_SERVER_PORT, + None, ), DockerGatewayRoute::HostGateway ); @@ -174,6 +175,7 @@ fn docker_gateway_route_uses_bridge_gateway_for_linux_docker() { &info, IpAddr::V4(Ipv4Addr::new(172, 18, 0, 1)), DEFAULT_SERVER_PORT, + None, ); assert_eq!( @@ -192,6 +194,51 @@ fn docker_gateway_route_uses_bridge_gateway_for_linux_docker() { ); } +#[test] +fn docker_gateway_route_prefers_configured_host_gateway_ip() { + let info = SystemInfo { + operating_system: Some("Ubuntu 24.04 LTS".to_string()), + ..Default::default() + }; + + let route = docker_gateway_route( + &info, + IpAddr::V4(Ipv4Addr::new(172, 18, 0, 1)), + DEFAULT_SERVER_PORT, + Some(IpAddr::V4(Ipv4Addr::new(172, 20, 0, 4))), + ); + + assert_eq!( + route, + DockerGatewayRoute::Bridge { + bind_address: "172.20.0.4:8080".parse().unwrap(), + host_alias_ip: IpAddr::V4(Ipv4Addr::new(172, 20, 0, 4)), + } + ); + assert_eq!( + docker_extra_hosts(&route), + vec![ + "host.docker.internal:172.20.0.4".to_string(), + "host.openshell.internal:172.20.0.4".to_string() + ] + ); +} + +#[test] +fn parse_optional_host_gateway_ip_rejects_invalid_values() { + assert_eq!(parse_optional_host_gateway_ip("").unwrap(), None); + assert_eq!( + parse_optional_host_gateway_ip("172.20.0.4").unwrap(), + Some(IpAddr::V4(Ipv4Addr::new(172, 20, 0, 4))) + ); + assert!( + parse_optional_host_gateway_ip("not-an-ip") + .unwrap_err() + .to_string() + .contains("OPENSHELL_HOST_GATEWAY_IP") + ); +} + #[test] fn parse_cpu_limit_supports_cores_and_millicores() { assert_eq!(parse_cpu_limit("250m").unwrap(), Some(250_000_000)); diff --git a/e2e/python/test_security_tls.py b/e2e/python/test_security_tls.py index ce4db129f..fa9059fa4 100644 --- a/e2e/python/test_security_tls.py +++ b/e2e/python/test_security_tls.py @@ -36,6 +36,8 @@ def _xdg_config_home() -> pathlib.Path: def _resolve_cluster_name() -> str: + if os.environ.get("OPENSHELL_GATEWAY_ENDPOINT"): + return os.environ.get("OPENSHELL_GATEWAY", "openshell-e2e-endpoint") env_cluster = os.environ.get("OPENSHELL_GATEWAY") if env_cluster: return env_cluster @@ -44,6 +46,13 @@ def _resolve_cluster_name() -> str: def _cluster_metadata(cluster_name: str) -> dict: + endpoint = os.environ.get("OPENSHELL_GATEWAY_ENDPOINT") + if endpoint: + return { + "name": cluster_name, + "gateway_endpoint": endpoint, + "auth_mode": "plaintext", + } metadata_path = ( _xdg_config_home() / "openshell" / "gateways" / cluster_name / "metadata.json" ) diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh index fdb849a6f..ebdf631bb 100755 --- a/e2e/rust/e2e-docker.sh +++ b/e2e/rust/e2e-docker.sh @@ -2,336 +2,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Run a Rust e2e test against a standalone gateway running the -# bundled Docker compute driver. -# -# Unlike the Kubernetes driver (which deploys a k3s cluster) or the VM -# driver (which boots libkrun), the Docker driver runs in-process inside -# the gateway binary and uses the local Docker daemon to run sandbox -# containers. This script: -# -# 1. Builds openshell-gateway, openshell-cli, and a Linux ELF -# openshell-sandbox binary (via the Docker image build pipeline on -# non-Linux hosts so macOS linker fd limits are avoided). -# 2. Ensures the sandbox base image exists locally; containers launch -# from it with the freshly built openshell-sandbox binary bind-mounted -# over the image-provided copy. -# 3. Generates an ephemeral mTLS PKI (CA, server cert, client cert). -# 4. Starts openshell-gateway with --drivers=docker, binding to a -# random free host port. -# 5. Installs the client cert into the CLI gateway config dir and -# runs the selected Rust e2e test. -# 6. Tears the gateway process down on exit. -# -# Usage: -# mise run e2e:docker -# mise run e2e:docker:gpu +# Run a Rust e2e test against a standalone gateway running the bundled Docker +# compute driver. Set OPENSHELL_GATEWAY_ENDPOINT=http://host:port to reuse an +# existing plaintext gateway instead of starting an ephemeral one. set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -WORKDIR="$(mktemp -d "/tmp/openshell-e2e-docker.XXXXXX")" -GATEWAY_BIN="${ROOT}/target/debug/openshell-gateway" -CLI_BIN="${ROOT}/target/debug/openshell" -STATE_DIR="" -GATEWAY_CONFIG_DIR="" -GATEWAY_PID="" -GATEWAY_LOG="${WORKDIR}/gateway.log" E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}" -GPU_MODE="${OPENSHELL_E2E_DOCKER_GPU:-0}" -# Unique sandbox namespace for this test run. Set just before the gateway -# is started so cleanup can filter Docker containers strictly to ones -# this run created, even when other OpenShell sandboxes are present on -# the host. Empty until assigned -- cleanup treats an empty namespace as -# "do nothing" so an early-exit trap never touches unrelated containers. -E2E_NAMESPACE="" -cleanup() { - local exit_code=$? - if [ -n "${GATEWAY_PID}" ] && kill -0 "${GATEWAY_PID}" 2>/dev/null; then - echo "Stopping openshell-gateway (pid ${GATEWAY_PID})..." - kill "${GATEWAY_PID}" 2>/dev/null || true - wait "${GATEWAY_PID}" 2>/dev/null || true - fi +cargo build -p openshell-cli --features openshell-core/dev-settings - # On failure, preserve sandbox container logs for post-mortem - # debugging before removing the containers. Filter strictly to - # containers this test run created (managed-by + this run's unique - # sandbox namespace) so we never touch unrelated OpenShell sandboxes - # the developer may be running in parallel. - if [ "${exit_code}" -ne 0 ] \ - && [ -n "${E2E_NAMESPACE}" ] \ - && command -v docker >/dev/null 2>&1; then - local ids - ids=$(docker ps -aq \ - --filter "label=openshell.ai/managed-by=openshell" \ - --filter "label=openshell.ai/sandbox-namespace=${E2E_NAMESPACE}" \ - 2>/dev/null || true) - if [ -n "${ids}" ]; then - echo "=== sandbox container logs (preserved for debugging) ===" - for id in ${ids}; do - echo "--- container ${id} (inspect) ---" - docker inspect --format '{{.Name}} state={{.State.Status}} exit={{.State.ExitCode}} restarts={{.RestartCount}} error={{.State.Error}}' "${id}" 2>/dev/null || true - echo "--- container ${id} (last 80 log lines) ---" - docker logs --tail 80 "${id}" 2>&1 || true - done - echo "=== end sandbox container logs ===" - fi - fi - - # Remove any lingering sandbox containers the gateway failed to clean - # up. Scope the filter to this run's namespace so we don't force-remove - # sandboxes belonging to other gateways or test runs on the same host. - if [ -n "${E2E_NAMESPACE}" ] && command -v docker >/dev/null 2>&1; then - local stale - stale=$(docker ps -aq \ - --filter "label=openshell.ai/managed-by=openshell" \ - --filter "label=openshell.ai/sandbox-namespace=${E2E_NAMESPACE}" \ - 2>/dev/null || true) - if [ -n "${stale}" ]; then - # shellcheck disable=SC2086 - docker rm -f ${stale} >/dev/null 2>&1 || true - fi - fi - - if [ "${exit_code}" -ne 0 ] && [ -f "${GATEWAY_LOG}" ]; then - echo "=== gateway log (preserved for debugging) ===" - cat "${GATEWAY_LOG}" - echo "=== end gateway log ===" - fi - - # Remove gateway CLI config we created so repeated runs don't - # accumulate stale gateway entries. - if [ -n "${GATEWAY_CONFIG_DIR}" ] && [ -d "${GATEWAY_CONFIG_DIR}" ]; then - rm -rf "${GATEWAY_CONFIG_DIR}" - fi - - rm -rf "${WORKDIR}" 2>/dev/null || true -} -trap cleanup EXIT - -# ── Preflight ──────────────────────────────────────────────────────── -if ! command -v docker >/dev/null 2>&1; then - echo "ERROR: docker CLI is required to run e2e:docker" >&2 - exit 2 -fi -if ! docker info >/dev/null 2>&1; then - echo "ERROR: docker daemon is not reachable (docker info failed)" >&2 - exit 2 -fi -if ! command -v openssl >/dev/null 2>&1; then - echo "ERROR: openssl is required to generate ephemeral PKI" >&2 - exit 2 -fi -if [ "${GPU_MODE}" = "1" ]; then - DOCKER_CDI_SPEC_DIRS="$(docker info --format '{{json .CDISpecDirs}}' 2>/dev/null || true)" - if [ -z "${DOCKER_CDI_SPEC_DIRS}" ] \ - || [ "${DOCKER_CDI_SPEC_DIRS}" = "null" ] \ - || [ "${DOCKER_CDI_SPEC_DIRS}" = "[]" ] \ - || [ "${DOCKER_CDI_SPEC_DIRS}" = "" ]; then - echo "ERROR: e2e:docker:gpu requires Docker CDI support." >&2 - echo " Generate CDI specs and restart Docker, then verify docker info reports CDISpecDirs." >&2 - exit 2 - fi -fi - -normalize_arch() { - case "$1" in - x86_64|amd64) echo "amd64" ;; - aarch64|arm64) echo "arm64" ;; - *) echo "$1" ;; - esac -} - -linux_target_triple() { - case "$1" in - amd64) echo "x86_64-unknown-linux-gnu" ;; - arm64) echo "aarch64-unknown-linux-gnu" ;; - *) - echo "ERROR: unsupported Docker daemon architecture '$1'" >&2 - exit 2 - ;; - esac -} - -# Detect Linux arch of the Docker daemon so we build the matching -# openshell-sandbox binary. -DAEMON_ARCH="$(normalize_arch "$(docker info --format '{{.Architecture}}' 2>/dev/null || true)")" -SUPERVISOR_TARGET="$(linux_target_triple "${DAEMON_ARCH}")" -HOST_OS="$(uname -s)" -HOST_ARCH="$(normalize_arch "$(uname -m)")" -SUPERVISOR_OUT_DIR="${WORKDIR}/supervisor/${DAEMON_ARCH}" -SUPERVISOR_BIN="${SUPERVISOR_OUT_DIR}/openshell-sandbox" - -# ── Build binaries ─────────────────────────────────────────────────── -# Cap build parallelism to avoid OOM when run alongside a docker build or -# on memory-constrained developer machines. Override with CARGO_BUILD_JOBS. -CARGO_BUILD_JOBS_ARG=() -if [ -n "${CARGO_BUILD_JOBS:-}" ]; then - CARGO_BUILD_JOBS_ARG=(-j "${CARGO_BUILD_JOBS}") -fi - -echo "Building openshell-gateway and openshell-cli..." -cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ - -p openshell-server --bin openshell-gateway \ - -p openshell-cli --features openshell-core/dev-settings - -echo "Building openshell-sandbox for ${SUPERVISOR_TARGET}..." -mkdir -p "${SUPERVISOR_OUT_DIR}" -if [ "${HOST_OS}" = "Linux" ] && [ "${HOST_ARCH}" = "${DAEMON_ARCH}" ]; then - rustup target add "${SUPERVISOR_TARGET}" >/dev/null 2>&1 || true - cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ - --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" - cp "${ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" "${SUPERVISOR_BIN}" -else - CONTAINER_ENGINE=docker \ - DOCKER_PLATFORM="linux/${DAEMON_ARCH}" \ - DOCKER_OUTPUT="type=local,dest=${SUPERVISOR_OUT_DIR}" \ - bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor-output -fi - -if [ ! -f "${SUPERVISOR_BIN}" ]; then - echo "ERROR: expected supervisor binary at ${SUPERVISOR_BIN}" >&2 - exit 1 -fi -chmod +x "${SUPERVISOR_BIN}" - -# ── Ensure a sandbox base image is available locally ──────────────── -# The bundled openshell-sandbox binary enforces a 'sandbox' user/group -# in the image. Use the community sandbox base image (also what real -# deployments default to). Callers can override with -# OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE if they have a smaller local image -# with the required 'sandbox' user. CDI injects the NVIDIA userspace -# stack at runtime, so the GPU lane uses the same base image. -DEFAULT_SANDBOX_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base:latest" -SANDBOX_IMAGE="${OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE:-${DEFAULT_SANDBOX_IMAGE}}" -if ! docker image inspect "${SANDBOX_IMAGE}" >/dev/null 2>&1; then - echo "Pulling ${SANDBOX_IMAGE}..." - docker pull "${SANDBOX_IMAGE}" -fi - -# ── Generate ephemeral mTLS PKI ────────────────────────────────────── -PKI_DIR="${WORKDIR}/pki" -mkdir -p "${PKI_DIR}" -cd "${PKI_DIR}" - -cat > openssl.cnf <<'EOF' -[req] -distinguished_name = dn -prompt = no -[dn] -CN = openshell-server -[san_server] -subjectAltName = @alt_server -[alt_server] -DNS.1 = localhost -DNS.2 = host.openshell.internal -DNS.3 = host.docker.internal -IP.1 = 127.0.0.1 -IP.2 = ::1 -[san_client] -subjectAltName = DNS:openshell-client -EOF - -openssl req -x509 -newkey rsa:2048 -nodes -days 30 \ - -keyout ca.key -out ca.crt -subj "/CN=openshell-e2e-ca" >/dev/null 2>&1 - -openssl req -newkey rsa:2048 -nodes -keyout server.key -out server.csr \ - -config openssl.cnf >/dev/null 2>&1 -openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ - -out server.crt -days 30 -extfile openssl.cnf -extensions san_server >/dev/null 2>&1 - -openssl req -newkey rsa:2048 -nodes -keyout client.key -out client.csr \ - -subj "/CN=openshell-client" >/dev/null 2>&1 -openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ - -out client.crt -days 30 -extfile openssl.cnf -extensions san_client >/dev/null 2>&1 - -cd "${ROOT}" - -# ── Pick a free port ───────────────────────────────────────────────── -pick_port() { - python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()' -} -HOST_PORT=$(pick_port) - -STATE_DIR="${WORKDIR}/state" -mkdir -p "${STATE_DIR}" - -# Containers started by the docker driver reach the host gateway via -# host.openshell.internal (mapped to host-gateway by the driver). The -# gateway itself binds to 0.0.0.0:${HOST_PORT}. -GATEWAY_ENDPOINT="https://host.openshell.internal:${HOST_PORT}" - -# Unique per-run sandbox namespace. The Docker driver stamps every -# container with `openshell.ai/sandbox-namespace=`, so cleanup can -# filter on this value and never touch sandboxes belonging to other -# gateways or test runs on the same host. -E2E_NAMESPACE="e2e-docker-$$-${HOST_PORT}" - -echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." -"${GATEWAY_BIN}" \ - --port "${HOST_PORT}" \ - --drivers docker \ - --sandbox-namespace "${E2E_NAMESPACE}" \ - --tls-cert "${PKI_DIR}/server.crt" \ - --tls-key "${PKI_DIR}/server.key" \ - --tls-client-ca "${PKI_DIR}/ca.crt" \ - --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" \ - --grpc-endpoint "${GATEWAY_ENDPOINT}" \ - --docker-supervisor-bin "${SUPERVISOR_BIN}" \ - --docker-tls-ca "${PKI_DIR}/ca.crt" \ - --docker-tls-cert "${PKI_DIR}/client.crt" \ - --docker-tls-key "${PKI_DIR}/client.key" \ - --sandbox-image "${SANDBOX_IMAGE}" \ - --sandbox-image-pull-policy IfNotPresent \ - >"${GATEWAY_LOG}" 2>&1 & -GATEWAY_PID=$! - -# ── Register the gateway with the CLI ──────────────────────────────── -# Writes both metadata.json (for `--gateway ` lookup) and the mTLS -# bundle the CLI uses to authenticate to the gateway. -GATEWAY_NAME="openshell-e2e-docker-${HOST_PORT}" -CLI_GATEWAY_ENDPOINT="https://127.0.0.1:${HOST_PORT}" -GATEWAY_CONFIG_DIR="${HOME}/.config/openshell/gateways/${GATEWAY_NAME}" -mkdir -p "${GATEWAY_CONFIG_DIR}/mtls" -cp "${PKI_DIR}/ca.crt" "${GATEWAY_CONFIG_DIR}/mtls/ca.crt" -cp "${PKI_DIR}/client.crt" "${GATEWAY_CONFIG_DIR}/mtls/tls.crt" -cp "${PKI_DIR}/client.key" "${GATEWAY_CONFIG_DIR}/mtls/tls.key" -cat >"${GATEWAY_CONFIG_DIR}/metadata.json" </dev/null; then - echo "ERROR: openshell-gateway exited before becoming healthy" - exit 1 - fi - if "${CLI_BIN}" status >/dev/null 2>&1; then - echo "Gateway healthy after ${elapsed}s." - break - fi - sleep 2 - elapsed=$((elapsed + 2)) -done -if [ "${elapsed}" -ge "${timeout}" ]; then - echo "ERROR: gateway did not become healthy within ${timeout}s" - exit 1 -fi - -# ── Run the selected test ──────────────────────────────────────────── -echo "Running e2e ${E2E_TEST} test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${CLI_GATEWAY_ENDPOINT})..." -cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test "${E2E_TEST}" -- --nocapture - -echo "${E2E_TEST} test passed." +exec "${ROOT}/e2e/with-docker-gateway.sh" \ + cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ + --features e2e \ + --test "${E2E_TEST}" \ + -- --nocapture diff --git a/e2e/rust/src/harness/gateway.rs b/e2e/rust/src/harness/gateway.rs new file mode 100644 index 000000000..e68d88c2b --- /dev/null +++ b/e2e/rust/src/harness/gateway.rs @@ -0,0 +1,210 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Gateway process controls for Docker-backed e2e tests. +//! +//! The shell wrapper still prepares the expensive shared setup: binaries, +//! PKI, state directories, Docker network, and the first gateway launch. This +//! helper owns restart mechanics inside Rust tests by reading the wrapper's +//! exported process metadata. + +use std::fs::{self, OpenOptions}; +use std::io::Write; +use std::path::PathBuf; +use std::process::{Command, Stdio}; +use std::thread; +use std::time::Duration; + +/// A gateway process owned by the current Docker e2e wrapper run. +pub struct ManagedGateway { + bin: PathBuf, + args_file: PathBuf, + log: PathBuf, + pid_file: PathBuf, +} + +impl ManagedGateway { + /// Build controls from explicit wrapper-provided process metadata. + pub fn new( + bin: impl Into, + args_file: impl Into, + log: impl Into, + pid_file: impl Into, + ) -> Self { + Self { + bin: bin.into(), + args_file: args_file.into(), + log: log.into(), + pid_file: pid_file.into(), + } + } + + /// Load managed gateway controls from the environment. + /// + /// Returns `Ok(None)` when the current e2e run does not own the gateway, + /// such as `OPENSHELL_GATEWAY_ENDPOINT=http://...` existing-endpoint mode. + pub fn from_env() -> Result, String> { + let Some(bin) = std::env::var_os("OPENSHELL_E2E_GATEWAY_BIN") else { + return Ok(None); + }; + + Ok(Some(Self { + bin: PathBuf::from(bin), + args_file: env_path("OPENSHELL_E2E_GATEWAY_ARGS_FILE")?, + log: env_path("OPENSHELL_E2E_GATEWAY_LOG")?, + pid_file: env_path("OPENSHELL_E2E_GATEWAY_PID_FILE")?, + })) + } + + /// Start the gateway if it is not already running. + pub fn start(&self) -> Result<(), String> { + if let Some(pid) = self.current_pid()? { + if process_running(pid)? { + return Ok(()); + } + let _ = fs::remove_file(&self.pid_file); + } + + let args = self.gateway_args()?; + let mut log = OpenOptions::new() + .create(true) + .append(true) + .open(&self.log) + .map_err(|err| format!("open gateway log '{}': {err}", self.log.display()))?; + writeln!( + log, + "\n=== starting openshell-gateway from Rust e2e harness ===" + ) + .map_err(|err| format!("write gateway log marker: {err}"))?; + let stderr = log + .try_clone() + .map_err(|err| format!("clone gateway log handle: {err}"))?; + + let child = Command::new(&self.bin) + .args(args) + .stdout(Stdio::from(log)) + .stderr(Stdio::from(stderr)) + .spawn() + .map_err(|err| format!("start openshell-gateway '{}': {err}", self.bin.display()))?; + let pid = child.id(); + fs::write(&self.pid_file, format!("{pid}\n")).map_err(|err| { + format!( + "write gateway pid file '{}': {err}", + self.pid_file.display() + ) + })?; + + Ok(()) + } + + /// Stop the gateway if it is running. + pub fn stop(&self) -> Result<(), String> { + let Some(pid) = self.current_pid()? else { + return Ok(()); + }; + if !process_running(pid)? { + let _ = fs::remove_file(&self.pid_file); + return Ok(()); + } + + send_signal(pid, None)?; + for _ in 0..60 { + if !process_running(pid)? { + let _ = fs::remove_file(&self.pid_file); + return Ok(()); + } + thread::sleep(Duration::from_secs(1)); + } + + send_signal(pid, Some("-9"))?; + let _ = fs::remove_file(&self.pid_file); + Ok(()) + } + + fn current_pid(&self) -> Result, String> { + let Ok(raw) = fs::read_to_string(&self.pid_file) else { + return Ok(None); + }; + let trimmed = raw.trim(); + if trimmed.is_empty() { + return Ok(None); + } + trimmed.parse::().map(Some).map_err(|err| { + format!( + "parse gateway pid file '{}': {err}", + self.pid_file.display() + ) + }) + } + + fn gateway_args(&self) -> Result, String> { + let raw = fs::read(&self.args_file) + .map_err(|err| format!("read gateway args '{}': {err}", self.args_file.display()))?; + raw.split(|byte| *byte == 0) + .filter(|arg| !arg.is_empty()) + .map(|arg| { + String::from_utf8(arg.to_vec()).map_err(|err| { + format!( + "gateway args file '{}' is not UTF-8: {err}", + self.args_file.display() + ) + }) + }) + .collect() + } +} + +impl Drop for ManagedGateway { + fn drop(&mut self) { + let _ = self.start(); + } +} + +fn env_path(name: &str) -> Result { + std::env::var_os(name) + .map(PathBuf::from) + .ok_or_else(|| format!("{name} must be set when OPENSHELL_E2E_GATEWAY_BIN is set")) +} + +fn process_running(pid: u32) -> Result { + if !signal_command(["-0", &pid.to_string()])? { + return Ok(false); + } + + let output = Command::new("ps") + .args(["-p", &pid.to_string(), "-o", "stat="]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .map_err(|err| format!("run ps for pid {pid}: {err}"))?; + if !output.status.success() { + return Ok(false); + } + + let stat = String::from_utf8_lossy(&output.stdout); + Ok(!stat.trim_start().starts_with('Z')) +} + +fn send_signal(pid: u32, signal: Option<&str>) -> Result<(), String> { + let mut args = Vec::new(); + if let Some(signal) = signal { + args.push(signal); + } + let pid_string = pid.to_string(); + args.push(&pid_string); + if signal_command(args)? { + Ok(()) + } else { + Err(format!("failed to signal gateway process {pid}")) + } +} + +fn signal_command<'a>(args: impl IntoIterator) -> Result { + Command::new("kill") + .args(args) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .map(|status| status.success()) + .map_err(|err| format!("run kill: {err}")) +} diff --git a/e2e/rust/src/harness/mod.rs b/e2e/rust/src/harness/mod.rs index b3add2c01..33105a4c7 100644 --- a/e2e/rust/src/harness/mod.rs +++ b/e2e/rust/src/harness/mod.rs @@ -4,6 +4,7 @@ //! Shared test harness modules for CLI e2e tests. pub mod binary; +pub mod gateway; pub mod output; pub mod port; pub mod sandbox; diff --git a/e2e/rust/tests/cf_auth_smoke.rs b/e2e/rust/tests/cf_auth_smoke.rs index aa46604fb..1e8d616e3 100644 --- a/e2e/rust/tests/cf_auth_smoke.rs +++ b/e2e/rust/tests/cf_auth_smoke.rs @@ -21,6 +21,7 @@ async fn run_isolated(args: &[&str]) -> (String, i32) { .env("XDG_CONFIG_HOME", tmpdir.path()) .env("HOME", tmpdir.path()) .env_remove("OPENSHELL_GATEWAY") + .env_remove("OPENSHELL_GATEWAY_ENDPOINT") // Suppress browser popup during auth flow. .env("OPENSHELL_NO_BROWSER", "1") // Use a closed stdin so auth prompts don't hang the test. @@ -44,6 +45,7 @@ async fn run_with_config(tmpdir: &std::path::Path, args: &[&str]) -> (String, i3 .env("XDG_CONFIG_HOME", tmpdir) .env("HOME", tmpdir) .env_remove("OPENSHELL_GATEWAY") + .env_remove("OPENSHELL_GATEWAY_ENDPOINT") // Suppress browser popup during auth flow. .env("OPENSHELL_NO_BROWSER", "1") // Use a closed stdin so auth prompts don't hang the test. @@ -416,4 +418,3 @@ async fn gateway_add_ssh_url_requires_port() { "error should mention port:\n{clean}" ); } - diff --git a/e2e/rust/tests/cli_smoke.rs b/e2e/rust/tests/cli_smoke.rs index 0abc24b43..42e230e23 100644 --- a/e2e/rust/tests/cli_smoke.rs +++ b/e2e/rust/tests/cli_smoke.rs @@ -21,6 +21,7 @@ async fn run_isolated(args: &[&str]) -> (String, i32) { .env("XDG_CONFIG_HOME", tmpdir.path()) .env("HOME", tmpdir.path()) .env_remove("OPENSHELL_GATEWAY") + .env_remove("OPENSHELL_GATEWAY_ENDPOINT") .stdout(Stdio::piped()) .stderr(Stdio::piped()); diff --git a/e2e/rust/tests/docker_preflight.rs b/e2e/rust/tests/docker_preflight.rs index d6e125b4b..9a6ea9f65 100644 --- a/e2e/rust/tests/docker_preflight.rs +++ b/e2e/rust/tests/docker_preflight.rs @@ -32,6 +32,7 @@ async fn run_without_docker(args: &[&str]) -> (String, i32, std::time::Duration) .env("HOME", tmpdir.path()) .env("DOCKER_HOST", "unix:///tmp/openshell-e2e-nonexistent.sock") .env_remove("OPENSHELL_GATEWAY") + .env_remove("OPENSHELL_GATEWAY_ENDPOINT") .stdout(Stdio::piped()) .stderr(Stdio::piped()); @@ -249,6 +250,7 @@ async fn doctor_check_passes_with_docker() { .env("XDG_CONFIG_HOME", tmpdir.path()) .env("HOME", tmpdir.path()) .env_remove("OPENSHELL_GATEWAY") + .env_remove("OPENSHELL_GATEWAY_ENDPOINT") .stdout(Stdio::piped()) .stderr(Stdio::piped()); diff --git a/e2e/rust/tests/edge_tunnel_e2e.rs b/e2e/rust/tests/edge_tunnel_e2e.rs index d482bab77..cb2c0bc7a 100644 --- a/e2e/rust/tests/edge_tunnel_e2e.rs +++ b/e2e/rust/tests/edge_tunnel_e2e.rs @@ -3,20 +3,20 @@ #![cfg(feature = "e2e")] -//! E2E tests for edge tunnel auth flow against a running cluster. +//! E2E tests for edge tunnel auth flow against a running gateway. //! //! Prerequisites: -//! - A running openshell gateway deployed with `--plaintext` -//! - The gateway's HTTP endpoint accessible (no TLS) +//! - A running openshell gateway +//! - For WS tunnel coverage, the gateway's HTTP endpoint is accessible (no TLS) //! - The `openshell` binary (built automatically from the workspace) //! //! These tests exercise the full CLI → WS tunnel → gRPC flow. //! //! Environment variables: //! - `OPENSHELL_GATEWAY`: Name of the active gateway (standard e2e var) +//! - `OPENSHELL_GATEWAY_ENDPOINT`: Optional direct plaintext endpoint. //! -//! The cluster must have been deployed with `openshell gateway start --plaintext` -//! so that the server accepts plaintext HTTP connections. +//! The edge-tunnel path requires a gateway endpoint that accepts plaintext HTTP. use std::process::Stdio; @@ -39,13 +39,14 @@ async fn run_cli(args: &[&str]) -> (String, i32) { } /// Run `openshell ` with a custom config directory so the CLI reads -/// our seeded cluster metadata and edge token instead of the real config. +/// our seeded gateway metadata and edge token instead of the real config. async fn run_cli_with_config(config_dir: &std::path::Path, args: &[&str]) -> (String, i32) { let mut cmd = openshell_cmd(); cmd.args(args) .env("XDG_CONFIG_HOME", config_dir) .env("HOME", config_dir) .env_remove("OPENSHELL_GATEWAY") + .env_remove("OPENSHELL_GATEWAY_ENDPOINT") .stdout(Stdio::piped()) .stderr(Stdio::piped()); @@ -57,63 +58,61 @@ async fn run_cli_with_config(config_dir: &std::path::Path, args: &[&str]) -> (St (combined, code) } -/// Seed a temporary config directory with cluster metadata that has -/// `auth_mode: "cloudflare_jwt"`, a stored edge token, and an active cluster +/// Seed a temporary config directory with gateway metadata that has +/// `auth_mode: "cloudflare_jwt"`, a stored edge token, and an active gateway /// pointing at the given endpoint. -fn seed_edge_cluster_config( +fn seed_edge_gateway_config( config_dir: &std::path::Path, - cluster_name: &str, + gateway_name: &str, gateway_endpoint: &str, edge_token: &str, ) { let openshell_dir = config_dir.join("openshell"); - let clusters_dir = openshell_dir.join("clusters"); + let gateways_dir = openshell_dir.join("gateways"); - // Write active_cluster file. std::fs::create_dir_all(&openshell_dir).expect("create openshell config dir"); - std::fs::write(openshell_dir.join("active_cluster"), cluster_name) - .expect("write active_cluster"); + std::fs::write(openshell_dir.join("active_gateway"), gateway_name) + .expect("write active_gateway"); - // Write cluster metadata JSON. - std::fs::create_dir_all(&clusters_dir).expect("create clusters dir"); + // Write gateway metadata JSON. + let gateway_dir = gateways_dir.join(gateway_name); + std::fs::create_dir_all(&gateway_dir).expect("create gateway dir"); let metadata = serde_json::json!({ - "name": cluster_name, + "name": gateway_name, "gateway_endpoint": gateway_endpoint, "is_remote": false, "gateway_port": 0, "auth_mode": "cloudflare_jwt" }); std::fs::write( - clusters_dir.join(format!("{cluster_name}_metadata.json")), + gateway_dir.join("metadata.json"), serde_json::to_string_pretty(&metadata).unwrap(), ) - .expect("write cluster metadata"); + .expect("write gateway metadata"); // Write edge token file. - let token_dir = clusters_dir.join(cluster_name); - std::fs::create_dir_all(&token_dir).expect("create token dir"); - std::fs::write(token_dir.join("edge_token"), edge_token).expect("write edge_token"); + std::fs::write(gateway_dir.join("edge_token"), edge_token).expect("write edge_token"); } // ------------------------------------------------------------------- -// Test 12: gRPC health check against a plaintext cluster +// Test 12: gRPC health check against a gateway // ------------------------------------------------------------------- /// `openshell status` should report a healthy gateway when connected to a -/// plaintext cluster (deployed with `--plaintext`/`--disable-tls`). +/// configured gateway. /// -/// This test verifies the entire plaintext path: -/// - CLI resolves cluster metadata with `http://` scheme -/// - gRPC client connects over plaintext +/// This test verifies the normal gateway path: +/// - CLI resolves gateway metadata +/// - gRPC client connects /// - Server responds to health check #[tokio::test] -async fn plaintext_cluster_status_reports_healthy() { +async fn gateway_status_reports_healthy() { let (output, code) = run_cli(&["status"]).await; let clean = strip_ansi(&output); assert_eq!( code, 0, - "openshell status should exit 0 against plaintext cluster:\n{clean}" + "openshell status should exit 0 against gateway:\n{clean}" ); // The status output should show the gateway as healthy/connected. @@ -130,7 +129,7 @@ async fn plaintext_cluster_status_reports_healthy() { // Test 13: gRPC through the WS tunnel proxy (edge token path) // ------------------------------------------------------------------- -/// When a cluster's metadata has `auth_mode == "cloudflare_jwt"` and a +/// When a gateway's metadata has `auth_mode == "cloudflare_jwt"` and a /// stored edge token, the CLI routes gRPC through the WebSocket tunnel proxy. /// This test verifies the full tunnel path: /// @@ -141,24 +140,23 @@ async fn plaintext_cluster_status_reports_healthy() { /// gateway. /// /// Note: The dummy token won't be validated (no edge auth middleware on -/// the plaintext cluster), but it triggers the CLI's tunnel proxy codepath. +/// the plaintext gateway), but it triggers the CLI's tunnel proxy codepath. #[tokio::test] async fn ws_tunnel_status_through_edge_proxy() { - // Read the current cluster name to restore it later. let (original_status, _) = run_cli(&["status"]).await; let clean_status = strip_ansi(&original_status); - // Only run this test if we have a healthy cluster to test against. + // Only run this test if we have a healthy gateway to test against. if !clean_status.to_lowercase().contains("healthy") && !clean_status.to_lowercase().contains("running") && !clean_status.to_lowercase().contains("connected") && !clean_status.contains("✓") { - eprintln!("Skipping ws_tunnel test: no healthy cluster available"); + eprintln!("Skipping ws_tunnel test: no healthy gateway available"); return; } - // Get the gateway endpoint from the cluster metadata. + // Get the gateway endpoint from the gateway metadata. let (info_output, info_code) = run_cli(&["gateway", "info"]).await; assert_eq!(info_code, 0, "gateway info should succeed:\n{info_output}"); @@ -182,13 +180,15 @@ async fn ws_tunnel_status_through_edge_proxy() { }); let Some(endpoint) = endpoint else { - eprintln!("Skipping ws_tunnel test: could not extract gateway endpoint from:\n{info_clean}"); + eprintln!( + "Skipping ws_tunnel test: could not extract gateway endpoint from:\n{info_clean}" + ); return; }; // For the WS tunnel test, we need the endpoint to be HTTP (plaintext). // If it's HTTPS, the WS tunnel test requires TLS negotiation which - // complicates things. Skip if the cluster isn't plaintext. + // complicates things. Skip if the gateway isn't plaintext. if !endpoint.starts_with("http://") { eprintln!( "Skipping ws_tunnel test: gateway endpoint is not plaintext HTTP: {endpoint}\n\ @@ -201,10 +201,15 @@ async fn ws_tunnel_status_through_edge_proxy() { // the live gateway. The dummy token triggers the WS tunnel codepath // without requiring real edge auth middleware. let tmpdir = tempfile::tempdir().expect("create temp config dir"); - seed_edge_cluster_config(tmpdir.path(), "edge-tunnel-test", &endpoint, "dummy-test-jwt"); + seed_edge_gateway_config( + tmpdir.path(), + "edge-tunnel-test", + &endpoint, + "dummy-test-jwt", + ); let (output, code) = run_cli_with_config(tmpdir.path(), &[ - "--cluster", + "--gateway", "edge-tunnel-test", "status", ]) diff --git a/e2e/rust/tests/forward_proxy_graphql_l7.rs b/e2e/rust/tests/forward_proxy_graphql_l7.rs index 56aaec0f5..a0a14c00d 100644 --- a/e2e/rust/tests/forward_proxy_graphql_l7.rs +++ b/e2e/rust/tests/forward_proxy_graphql_l7.rs @@ -20,8 +20,10 @@ use tempfile::NamedTempFile; use tokio::time::{interval, timeout}; const TEST_SERVER_IMAGE: &str = "public.ecr.aws/docker/library/python:3.13-alpine"; +const TEST_SERVER_ALIAS: &str = "graphql-l7.openshell.test"; struct DockerServer { + host: String, port: u16, container_id: String, } @@ -66,18 +68,26 @@ class Handler(BaseHTTPRequestHandler): HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() "#; + let e2e_network = std::env::var("OPENSHELL_E2E_DOCKER_NETWORK_NAME") + .ok() + .filter(|network| !network.trim().is_empty()); + let host = e2e_network.as_ref().map_or_else( + || "host.openshell.internal".to_string(), + |_| TEST_SERVER_ALIAS.to_string(), + ); + let port = if e2e_network.is_some() { 8000 } else { port }; + + let mut args = vec!["run", "--detach", "--rm"]; + let published_port = format!("{port}:8000"); + if let Some(network) = e2e_network.as_deref() { + args.extend(["--network", network, "--network-alias", TEST_SERVER_ALIAS]); + } else { + args.extend(["-p", &published_port]); + } + args.extend([TEST_SERVER_IMAGE, "python3", "-c", script]); + let output = Command::new("docker") - .args([ - "run", - "--detach", - "--rm", - "-p", - &format!("{port}:8000"), - TEST_SERVER_IMAGE, - "python3", - "-c", - script, - ]) + .args(args) .output() .map_err(|e| format!("start docker test server: {e}"))?; @@ -92,6 +102,7 @@ HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() } let server = Self { + host, port, container_id: stdout, }; @@ -133,7 +144,7 @@ impl Drop for DockerServer { } } -fn write_graphql_policy(port: u16) -> Result { +fn write_graphql_policy(host: &str, port: u16) -> Result { let mut file = NamedTempFile::new().map_err(|e| format!("create temp policy file: {e}"))?; let policy = format!( r#"version: 1 @@ -164,7 +175,7 @@ network_policies: test_graphql_l7: name: test_graphql_l7 endpoints: - - host: host.openshell.internal + - host: {host} port: {port} protocol: graphql enforcement: enforce @@ -175,7 +186,10 @@ network_policies: operation_name: Viewer fields: [viewer] allowed_ips: + - "10.0.0.0/8" - "172.0.0.0/8" + - "192.168.0.0/16" + - "fc00::/7" rules: - allow: operation_type: query @@ -205,7 +219,7 @@ async fn graphql_l7_enforces_allow_and_deny_rules_on_forward_and_connect_paths() let server = DockerServer::start() .await .expect("start docker test server"); - let policy = write_graphql_policy(server.port).expect("write custom policy"); + let policy = write_graphql_policy(&server.host, server.port).expect("write custom policy"); let policy_path = policy .path() .to_str() @@ -217,11 +231,12 @@ async fn graphql_l7_enforces_allow_and_deny_rules_on_forward_and_connect_paths() import json import os import socket +import time import urllib.error import urllib.parse import urllib.request -HOST = "host.openshell.internal" +HOST = {host:?} PORT = {port} DETAILS = {{}} @@ -299,7 +314,7 @@ def forward_proxy_parts(): return proxy_parts("HTTP_PROXY", "http_proxy", "HTTPS_PROXY", "https_proxy") def connect_proxy_parts(): - return proxy_parts("HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy") + return proxy_parts("HTTP_PROXY", "http_proxy", "HTTPS_PROXY", "https_proxy") def forward_chunked_status(query): proxy_host, proxy_port = forward_proxy_parts() @@ -319,7 +334,7 @@ def forward_chunked_status(query): sock.sendall(request) response, body = read_response(sock) DETAILS["forward_chunked_query_allowed_detail"] = body.decode(errors="replace") - return int(response.split()[1]) + return status_code(response, "forward_chunked_response") def read_until(sock, marker): data = b"" @@ -345,132 +360,110 @@ def read_response(sock): body += chunk return response, body -def connect_status(query): +def status_code(response, label): + parts = response.split() + if len(parts) < 2: + DETAILS[f"{{label}}_raw"] = response.decode(errors="replace") + raise RuntimeError(f"{{label}}: malformed HTTP response: {{response!r}}") + try: + return int(parts[1]) + except ValueError as error: + DETAILS[f"{{label}}_raw"] = response.decode(errors="replace") + raise RuntimeError(f"{{label}}: non-numeric HTTP status: {{response!r}}") from error + +def connect_http_status(label, request): proxy_host, proxy_port = connect_proxy_parts() target = f"{{HOST}}:{{PORT}}" - body = json.dumps({{"query": query}}).encode() - with socket.create_connection((proxy_host, proxy_port), timeout=15) as sock: - sock.sendall( - f"CONNECT {{target}} HTTP/1.1\r\nHost: {{target}}\r\n\r\n".encode() - ) - connect_response = read_until(sock, b"\r\n\r\n") - if not connect_response.startswith(b"HTTP/1.1 200"): - return int(connect_response.split()[1]) - - request = ( - f"POST /graphql HTTP/1.1\r\n" - f"Host: {{target}}\r\n" - f"Content-Type: application/json\r\n" - f"Content-Length: {{len(body)}}\r\n" - f"Connection: close\r\n" - f"\r\n" - ).encode() + body - sock.sendall(request) - sock.shutdown(socket.SHUT_WR) - response = read_until(sock, b"\r\n\r\n") - return int(response.split()[1]) + last_error = None + for attempt in range(5): + try: + with socket.create_connection((proxy_host, proxy_port), timeout=15) as sock: + sock.sendall( + f"CONNECT {{target}} HTTP/1.1\r\nHost: {{target}}\r\n\r\n".encode() + ) + connect_response = read_until(sock, b"\r\n\r\n") + connect_code = status_code(connect_response, f"{{label}}_connect") + if connect_code != 200: + return connect_code + + sock.sendall(request) + sock.shutdown(socket.SHUT_WR) + response = read_until(sock, b"\r\n\r\n") + return status_code(response, f"{{label}}_response") + except (OSError, RuntimeError) as error: + last_error = error + DETAILS[f"{{label}}_attempt_{{attempt + 1}}_error"] = str(error) + time.sleep(0.2) + + raise RuntimeError(f"{{label}}: failed after 5 attempts: {{last_error}}") + +def connect_status(query, label): + target = f"{{HOST}}:{{PORT}}" + body = json.dumps({{"query": query}}).encode() -def connect_get_status(query): - proxy_host, proxy_port = connect_proxy_parts() + request = ( + f"POST /graphql HTTP/1.1\r\n" + f"Host: {{target}}\r\n" + f"Content-Type: application/json\r\n" + f"Content-Length: {{len(body)}}\r\n" + f"Connection: close\r\n" + f"\r\n" + ).encode() + body + return connect_http_status(label, request) + +def connect_get_status(query, label): target = f"{{HOST}}:{{PORT}}" encoded = urllib.parse.urlencode({{"query": query}}) - with socket.create_connection((proxy_host, proxy_port), timeout=15) as sock: - sock.sendall( - f"CONNECT {{target}} HTTP/1.1\r\nHost: {{target}}\r\n\r\n".encode() - ) - connect_response = read_until(sock, b"\r\n\r\n") - if not connect_response.startswith(b"HTTP/1.1 200"): - return int(connect_response.split()[1]) - - request = ( - f"GET /graphql?{{encoded}} HTTP/1.1\r\n" - f"Host: {{target}}\r\n" - f"Connection: close\r\n" - f"\r\n" - ).encode() - sock.sendall(request) - sock.shutdown(socket.SHUT_WR) - response = read_until(sock, b"\r\n\r\n") - return int(response.split()[1]) + request = ( + f"GET /graphql?{{encoded}} HTTP/1.1\r\n" + f"Host: {{target}}\r\n" + f"Connection: close\r\n" + f"\r\n" + ).encode() + return connect_http_status(label, request) def connect_duplicate_get_status(): - proxy_host, proxy_port = connect_proxy_parts() target = f"{{HOST}}:{{PORT}}" safe = urllib.parse.quote_plus(QUERY_VIEWER) unsafe = urllib.parse.quote_plus(MUTATION_DELETE) - with socket.create_connection((proxy_host, proxy_port), timeout=15) as sock: - sock.sendall( - f"CONNECT {{target}} HTTP/1.1\r\nHost: {{target}}\r\n\r\n".encode() - ) - connect_response = read_until(sock, b"\r\n\r\n") - if not connect_response.startswith(b"HTTP/1.1 200"): - return int(connect_response.split()[1]) + request = ( + f"GET /graphql?query={{safe}}&query={{unsafe}} HTTP/1.1\r\n" + f"Host: {{target}}\r\n" + f"Connection: close\r\n" + f"\r\n" + ).encode() + return connect_http_status("connect_duplicate_get_denied", request) - request = ( - f"GET /graphql?query={{safe}}&query={{unsafe}} HTTP/1.1\r\n" - f"Host: {{target}}\r\n" - f"Connection: close\r\n" - f"\r\n" - ).encode() - sock.sendall(request) - sock.shutdown(socket.SHUT_WR) - response = read_until(sock, b"\r\n\r\n") - return int(response.split()[1]) - -def connect_persisted_get_status(hash_value): - proxy_host, proxy_port = connect_proxy_parts() +def connect_persisted_get_status(hash_value, label): target = f"{{HOST}}:{{PORT}}" extensions = json.dumps({{"persistedQuery": {{"version": 1, "sha256Hash": hash_value}}}}) encoded = urllib.parse.urlencode({{"operationName": "Viewer", "extensions": extensions}}) - with socket.create_connection((proxy_host, proxy_port), timeout=15) as sock: - sock.sendall( - f"CONNECT {{target}} HTTP/1.1\r\nHost: {{target}}\r\n\r\n".encode() - ) - connect_response = read_until(sock, b"\r\n\r\n") - if not connect_response.startswith(b"HTTP/1.1 200"): - return int(connect_response.split()[1]) - - request = ( - f"GET /graphql?{{encoded}} HTTP/1.1\r\n" - f"Host: {{target}}\r\n" - f"Connection: close\r\n" - f"\r\n" - ).encode() - sock.sendall(request) - sock.shutdown(socket.SHUT_WR) - response = read_until(sock, b"\r\n\r\n") - return int(response.split()[1]) + request = ( + f"GET /graphql?{{encoded}} HTTP/1.1\r\n" + f"Host: {{target}}\r\n" + f"Connection: close\r\n" + f"\r\n" + ).encode() + return connect_http_status(label, request) def connect_chunked_status(query): - proxy_host, proxy_port = connect_proxy_parts() target = f"{{HOST}}:{{PORT}}" body = json.dumps({{"query": query}}).encode() chunk = f"{{len(body):x}}\r\n".encode() + body + b"\r\n0\r\n\r\n" - with socket.create_connection((proxy_host, proxy_port), timeout=15) as sock: - sock.sendall( - f"CONNECT {{target}} HTTP/1.1\r\nHost: {{target}}\r\n\r\n".encode() - ) - connect_response = read_until(sock, b"\r\n\r\n") - if not connect_response.startswith(b"HTTP/1.1 200"): - return int(connect_response.split()[1]) - - request = ( - f"POST /graphql HTTP/1.1\r\n" - f"Host: {{target}}\r\n" - f"Content-Type: application/json\r\n" - f"Transfer-Encoding: chunked\r\n" - f"Connection: close\r\n" - f"\r\n" - ).encode() + chunk - sock.sendall(request) - sock.shutdown(socket.SHUT_WR) - response = read_until(sock, b"\r\n\r\n") - return int(response.split()[1]) + request = ( + f"POST /graphql HTTP/1.1\r\n" + f"Host: {{target}}\r\n" + f"Content-Type: application/json\r\n" + f"Transfer-Encoding: chunked\r\n" + f"Connection: close\r\n" + f"\r\n" + ).encode() + chunk + return connect_http_status("connect_chunked_query_allowed", request) results = {{ "forward_query_allowed": forward_status(QUERY_VIEWER), @@ -482,32 +475,26 @@ results = {{ "forward_unlisted_field_denied": forward_status(QUERY_REPOSITORY), "forward_mutation_allowed": forward_status(MUTATION_CREATE), "forward_deny_rule_denied": forward_status(MUTATION_DELETE), - "connect_query_allowed": connect_status(QUERY_VIEWER), - "connect_get_query_allowed": connect_get_status(QUERY_VIEWER), + "connect_query_allowed": connect_status(QUERY_VIEWER, "connect_query_allowed"), + "connect_get_query_allowed": connect_get_status(QUERY_VIEWER, "connect_get_query_allowed"), "connect_duplicate_get_denied": connect_duplicate_get_status(), - "connect_persisted_get_allowed": connect_persisted_get_status("abc123"), - "connect_unregistered_persisted_get_denied": connect_persisted_get_status("missing"), + "connect_persisted_get_allowed": connect_persisted_get_status("abc123", "connect_persisted_get_allowed"), + "connect_unregistered_persisted_get_denied": connect_persisted_get_status("missing", "connect_unregistered_persisted_get_denied"), "connect_chunked_query_allowed": connect_chunked_status(QUERY_VIEWER), - "connect_unlisted_field_denied": connect_status(QUERY_REPOSITORY), - "connect_mutation_allowed": connect_status(MUTATION_CREATE), - "connect_deny_rule_denied": connect_status(MUTATION_DELETE), + "connect_unlisted_field_denied": connect_status(QUERY_REPOSITORY, "connect_unlisted_field_denied"), + "connect_mutation_allowed": connect_status(MUTATION_CREATE, "connect_mutation_allowed"), + "connect_deny_rule_denied": connect_status(MUTATION_DELETE, "connect_deny_rule_denied"), }} results.update(DETAILS) print(json.dumps(results, sort_keys=True)) "#, + host = server.host, port = server.port, ); - let guard = SandboxGuard::create(&[ - "--policy", - &policy_path, - "--", - "python3", - "-c", - &script, - ]) - .await - .expect("sandbox create"); + let guard = SandboxGuard::create(&["--policy", &policy_path, "--", "python3", "-c", &script]) + .await + .expect("sandbox create"); for (key, expected) in [ ("forward_query_allowed", 200), diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index c3ae584b0..4a4c65c59 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -18,8 +18,10 @@ use tempfile::NamedTempFile; use tokio::time::{interval, timeout}; const TEST_SERVER_IMAGE: &str = "public.ecr.aws/docker/library/python:3.13-alpine"; +const TEST_SERVER_ALIAS: &str = "rest-l7.openshell.test"; struct DockerServer { + host: String, port: u16, container_id: String, } @@ -44,18 +46,26 @@ class Handler(BaseHTTPRequestHandler): HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() "#; + let e2e_network = std::env::var("OPENSHELL_E2E_DOCKER_NETWORK_NAME") + .ok() + .filter(|network| !network.trim().is_empty()); + let host = e2e_network.as_ref().map_or_else( + || "host.openshell.internal".to_string(), + |_| TEST_SERVER_ALIAS.to_string(), + ); + let port = if e2e_network.is_some() { 8000 } else { port }; + + let mut args = vec!["run", "--detach", "--rm"]; + let published_port = format!("{port}:8000"); + if let Some(network) = e2e_network.as_deref() { + args.extend(["--network", network, "--network-alias", TEST_SERVER_ALIAS]); + } else { + args.extend(["-p", &published_port]); + } + args.extend([TEST_SERVER_IMAGE, "python3", "-c", script]); + let output = Command::new("docker") - .args([ - "run", - "--detach", - "--rm", - "-p", - &format!("{port}:8000"), - TEST_SERVER_IMAGE, - "python3", - "-c", - script, - ]) + .args(args) .output() .map_err(|e| format!("start docker test server: {e}"))?; @@ -70,6 +80,7 @@ HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() } let server = Self { + host, port, container_id: stdout, }; @@ -111,7 +122,7 @@ impl Drop for DockerServer { } } -fn write_policy_with_l7_rules(port: u16) -> Result { +fn write_policy_with_l7_rules(host: &str, port: u16) -> Result { let mut file = NamedTempFile::new().map_err(|e| format!("create temp policy file: {e}"))?; let policy = format!( r#"version: 1 @@ -142,12 +153,15 @@ network_policies: test_l7: name: test_l7 endpoints: - - host: host.openshell.internal + - host: {host} port: {port} protocol: rest enforcement: enforce allowed_ips: + - "10.0.0.0/8" - "172.0.0.0/8" + - "192.168.0.0/16" + - "fc00::/7" rules: - allow: method: GET @@ -173,8 +187,7 @@ async fn forward_proxy_allows_l7_permitted_request() { .await .expect("start docker test server"); let policy = - write_policy_with_l7_rules(server.port) - .expect("write custom policy"); + write_policy_with_l7_rules(&server.host, server.port).expect("write custom policy"); let policy_path = policy .path() .to_str() @@ -184,7 +197,7 @@ async fn forward_proxy_allows_l7_permitted_request() { let script = format!( r#" import urllib.request, urllib.error, json, sys -url = "http://host.openshell.internal:{port}/allowed" +url = "http://{host}:{port}/allowed" try: resp = urllib.request.urlopen(url, timeout=15) print(json.dumps({{"status": resp.status, "error": None}})) @@ -193,19 +206,13 @@ except urllib.error.HTTPError as e: except Exception as e: print(json.dumps({{"status": -1, "error": str(e)}})) "#, + host = server.host, port = server.port, ); - let guard = SandboxGuard::create(&[ - "--policy", - &policy_path, - "--", - "python3", - "-c", - &script, - ]) - .await - .expect("sandbox create"); + let guard = SandboxGuard::create(&["--policy", &policy_path, "--", "python3", "-c", &script]) + .await + .expect("sandbox create"); // L7 policy allows GET /allowed — should succeed. assert!( @@ -222,8 +229,7 @@ async fn forward_proxy_denies_l7_blocked_request() { .await .expect("start docker test server"); let policy = - write_policy_with_l7_rules(server.port) - .expect("write custom policy"); + write_policy_with_l7_rules(&server.host, server.port).expect("write custom policy"); let policy_path = policy .path() .to_str() @@ -233,7 +239,7 @@ async fn forward_proxy_denies_l7_blocked_request() { let script = format!( r#" import urllib.request, urllib.error, json, sys -url = "http://host.openshell.internal:{port}/allowed" +url = "http://{host}:{port}/allowed" req = urllib.request.Request(url, data=b"test", method="POST") try: resp = urllib.request.urlopen(req, timeout=15) @@ -243,19 +249,13 @@ except urllib.error.HTTPError as e: except Exception as e: print(json.dumps({{"status": -1, "error": str(e)}})) "#, + host = server.host, port = server.port, ); - let guard = SandboxGuard::create(&[ - "--policy", - &policy_path, - "--", - "python3", - "-c", - &script, - ]) - .await - .expect("sandbox create"); + let guard = SandboxGuard::create(&["--policy", &policy_path, "--", "python3", "-c", &script]) + .await + .expect("sandbox create"); // L7 policy denies POST — should return 403. assert!( diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs index 01ad4941e..ded961859 100644 --- a/e2e/rust/tests/gateway_resume.rs +++ b/e2e/rust/tests/gateway_resume.rs @@ -3,335 +3,224 @@ #![cfg(feature = "e2e")] -//! E2E tests for gateway resume from existing state. +//! E2E coverage for resuming Docker sandboxes after a standalone gateway restart. //! -//! All scenarios run inside a **single** `#[tokio::test]` so they execute -//! in a deterministic order and share a known-good gateway state. Each -//! scenario restores the gateway to a healthy state before the next one -//! begins, preventing cascading failures. -//! -//! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one. +//! This intentionally targets the Docker-driver gateway started by +//! `e2e/with-docker-gateway.sh`. Existing-endpoint E2E runs do not own the +//! gateway process, so they skip this restart-only coverage. use std::process::{Command, Stdio}; -use std::time::Duration; +use std::time::{Duration, Instant}; use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::gateway::ManagedGateway; use openshell_e2e::harness::output::strip_ansi; +use openshell_e2e::harness::sandbox::SandboxGuard; use tokio::time::sleep; -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -/// Resolve the gateway name from the `OPENSHELL_GATEWAY` env var (the same -/// variable the CLI reads), falling back to `"openshell"` which matches CI. -fn gateway_name() -> String { - std::env::var("OPENSHELL_GATEWAY").unwrap_or_else(|_| "openshell".to_string()) -} - -/// Docker container name for the e2e gateway. -fn container_name() -> String { - format!("openshell-cluster-{}", gateway_name()) -} +const MANAGED_BY_LABEL_FILTER: &str = "label=openshell.ai/managed-by=openshell"; +const READY_MARKER: &str = "gateway-resume-ready"; +const SANDBOX_NAMESPACE_LABEL: &str = "openshell.ai/sandbox-namespace"; +const SANDBOX_NAME_LABEL: &str = "openshell.ai/sandbox-name"; -/// Run `openshell ` and return (combined output, exit code). async fn run_cli(args: &[&str]) -> (String, i32) { let mut cmd = openshell_cmd(); - cmd.args(args) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); + cmd.args(args).stdout(Stdio::piped()).stderr(Stdio::piped()); let output = cmd.output().await.expect("spawn openshell"); - let stdout = String::from_utf8_lossy(&output.stdout).to_string(); - let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); let combined = format!("{stdout}{stderr}"); let code = output.status.code().unwrap_or(-1); (combined, code) } -/// Run `docker ` synchronously and return (stdout, exit code). -fn docker_cmd(args: &[&str]) -> (String, i32) { - let output = Command::new("docker") - .args(args) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .output() - .expect("spawn docker"); - let stdout = String::from_utf8_lossy(&output.stdout).to_string(); - let code = output.status.code().unwrap_or(-1); - (stdout, code) -} +async fn wait_for_healthy(timeout: Duration) -> Result<(), String> { + let start = Instant::now(); + let mut last_output: String; -/// Wait for the gateway to become healthy by polling `openshell status`. -async fn wait_for_healthy(timeout: Duration) { - let start = std::time::Instant::now(); loop { let (output, code) = run_cli(&["status"]).await; - let clean = strip_ansi(&output).to_lowercase(); + let clean = strip_ansi(&output); + let lower = clean.to_lowercase(); if code == 0 - && (clean.contains("healthy") - || clean.contains("running") - || clean.contains("connected") - || clean.contains("✓")) + && (lower.contains("healthy") + || lower.contains("running") + || lower.contains("connected")) { - return; + return Ok(()); } + last_output = clean; + if start.elapsed() > timeout { - panic!( - "gateway did not become healthy within {}s. Last output:\n{}", - timeout.as_secs(), - strip_ansi(&output) - ); + return Err(format!( + "gateway did not become healthy within {}s. Last output:\n{last_output}", + timeout.as_secs() + )); } - sleep(Duration::from_secs(3)).await; + sleep(Duration::from_secs(2)).await; } } -/// Read the SSH handshake secret from the K8s secret inside the cluster. -fn read_ssh_handshake_secret() -> Option { - let cname = container_name(); - let (output, code) = docker_cmd(&[ - "exec", - &cname, - "sh", - "-c", - "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null", - ]); - if code == 0 && !output.trim().is_empty() { - Some(output.trim().to_string()) - } else { - None +async fn sandbox_names() -> Result, String> { + let (output, code) = run_cli(&["sandbox", "list", "--names"]).await; + let clean = strip_ansi(&output); + if code != 0 { + return Err(format!("sandbox list failed (exit {code}):\n{clean}")); } -} -/// Extract the sandbox name from `openshell sandbox create` output. -fn extract_sandbox_name(output: &str) -> String { - strip_ansi(output) + Ok(clean .lines() - .find_map(|line| { - if let Some((_, rest)) = line.split_once("Created sandbox:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else if let Some((_, rest)) = line.split_once("Name:") { - rest.split_whitespace().next().map(ToOwned::to_owned) - } else { - None - } - }) - .expect("should extract sandbox name from create output") + .map(str::trim) + .filter(|line| !line.is_empty()) + .map(ToOwned::to_owned) + .collect()) } -/// Run `gateway start` and log the output if it fails (non-fatal — the -/// test relies on [`wait_for_healthy`] for the real assertion). -async fn start_gateway() { - let (output, code) = run_cli(&["gateway", "start"]).await; - if code != 0 { - eprintln!( - "gateway start exited {code} (may still recover):\n{}", - strip_ansi(&output) - ); +fn sandbox_container_id(namespace: &str, sandbox_name: &str) -> Result { + let namespace_filter = format!("label={SANDBOX_NAMESPACE_LABEL}={namespace}"); + let sandbox_name_filter = format!("label={SANDBOX_NAME_LABEL}={sandbox_name}"); + let output = Command::new("docker") + .args(["ps", "-aq", "--filter", MANAGED_BY_LABEL_FILTER, "--filter"]) + .arg(namespace_filter) + .args(["--filter"]) + .arg(sandbox_name_filter) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .map_err(|err| format!("failed to run docker ps: {err}"))?; + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + if !output.status.success() { + return Err(format!( + "docker ps failed (exit {:?}):\n{combined}", + output.status.code() + )); } -} - -// --------------------------------------------------------------------------- -// Orchestrated test suite -// --------------------------------------------------------------------------- - -/// Single entry-point that runs every resume scenario in a fixed order. -/// -/// Running as one `#[tokio::test]` gives us: -/// - **Deterministic ordering** — no async-mutex races. -/// - **Cascade prevention** — each scenario starts only after the previous -/// one left the gateway healthy. -/// - **No task-runner hacks** — no `--test-threads`, `--skip`, or split -/// cargo invocations. -#[tokio::test] -async fn gateway_resume_scenarios() { - // The gateway must already be running (bootstrapped by the `cluster` task). - wait_for_healthy(Duration::from_secs(30)).await; - // Warm the sandbox base image by creating (and deleting) a throwaway - // sandbox. On a fresh cluster the ~1 GB image pull can take minutes; - // doing it once up-front keeps the actual scenarios snappy. - eprintln!("--- warmup: pulling sandbox base image ---"); - let (output, code) = - run_cli(&["sandbox", "create", "--", "echo", "warmup"]).await; - if code == 0 { - let name = extract_sandbox_name(&output); - let _ = run_cli(&["sandbox", "delete", &name]).await; - } else { - eprintln!( - "warmup sandbox create failed (non-fatal, image may already be cached):\n{}", - strip_ansi(&output) - ); + let ids = stdout + .lines() + .map(str::trim) + .filter(|line| !line.is_empty()) + .collect::>(); + match ids.as_slice() { + [id] => Ok((*id).to_string()), + [] => Err(format!( + "no Docker container found for sandbox '{sandbox_name}' in namespace '{namespace}'" + )), + _ => Err(format!( + "multiple Docker containers found for sandbox '{sandbox_name}' in namespace '{namespace}': {ids:?}" + )), } - - scenario_start_on_running_gateway().await; - scenario_ssh_secret_persists_across_restart().await; - scenario_stop_start_resumes_with_sandbox().await; - scenario_container_kill_resumes().await; - scenario_container_removal_resumes().await; } -// --------------------------------------------------------------------------- -// Scenario: `gateway start` on an already-running gateway -// --------------------------------------------------------------------------- - -async fn scenario_start_on_running_gateway() { - eprintln!("--- scenario: start on running gateway ---"); - - let (output, code) = run_cli(&["gateway", "start"]).await; - let clean = strip_ansi(&output); +fn sandbox_container_running(namespace: &str, sandbox_name: &str) -> Result { + let container_id = sandbox_container_id(namespace, sandbox_name)?; + let output = Command::new("docker") + .args(["inspect", "-f", "{{.State.Running}}", &container_id]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .map_err(|err| format!("failed to run docker inspect: {err}"))?; + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + if !output.status.success() { + return Err(format!( + "docker inspect failed (exit {:?}):\n{combined}", + output.status.code() + )); + } - assert_eq!( - code, 0, - "gateway start on running gateway should exit 0:\n{clean}" - ); - assert!( - clean.to_lowercase().contains("already running"), - "output should indicate gateway is already running:\n{clean}" - ); + match stdout.trim() { + "true" => Ok(true), + "false" => Ok(false), + other => Err(format!( + "unexpected Docker running state for container {container_id}: {other}" + )), + } } -// --------------------------------------------------------------------------- -// Scenario: SSH handshake secret persists across restart -// --------------------------------------------------------------------------- - -async fn scenario_ssh_secret_persists_across_restart() { - eprintln!("--- scenario: SSH secret persists across restart ---"); - - let secret_before = - read_ssh_handshake_secret().expect("SSH handshake secret should exist before restart"); - assert!( - !secret_before.is_empty(), - "SSH handshake secret should not be empty" - ); - - // Stop → start. - let (_, stop_code) = run_cli(&["gateway", "stop"]).await; - assert_eq!(stop_code, 0, "gateway stop should succeed"); - sleep(Duration::from_secs(3)).await; +async fn wait_for_container_running( + namespace: &str, + sandbox_name: &str, + expected: bool, + timeout: Duration, +) -> Result<(), String> { + let start = Instant::now(); + let mut last_state: String; - start_gateway().await; - wait_for_healthy(Duration::from_secs(300)).await; + loop { + match sandbox_container_running(namespace, sandbox_name) { + Ok(running) if running == expected => return Ok(()), + Ok(running) => last_state = format!("running={running}"), + Err(err) => last_state = err, + } - let secret_after = - read_ssh_handshake_secret().expect("SSH handshake secret should exist after restart"); - assert_eq!( - secret_before, secret_after, - "SSH handshake secret should be identical before and after restart" - ); + if start.elapsed() > timeout { + return Err(format!( + "sandbox container '{sandbox_name}' did not reach running={expected} within {}s. Last state: {last_state}", + timeout.as_secs() + )); + } + sleep(Duration::from_secs(1)).await; + } } -// --------------------------------------------------------------------------- -// Scenario: stop → start resumes, sandbox survives -// --------------------------------------------------------------------------- - -async fn scenario_stop_start_resumes_with_sandbox() { - eprintln!("--- scenario: stop/start resumes with sandbox ---"); - - // Create a sandbox. - let (output, code) = - run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await; - assert_eq!( - code, 0, - "sandbox create should succeed:\n{}", - strip_ansi(&output) - ); - let sandbox_name = extract_sandbox_name(&output); - - // Stop → start. - let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await; - assert_eq!( - stop_code, 0, - "gateway stop should succeed:\n{}", - strip_ansi(&stop_output) - ); - sleep(Duration::from_secs(3)).await; - - // Verify container is stopped. - let (inspect_out, _) = docker_cmd(&[ - "inspect", - "-f", - "{{.State.Running}}", - &container_name(), - ]); - assert_eq!( - inspect_out.trim(), - "false", - "container should be stopped after gateway stop" - ); - - start_gateway().await; - wait_for_healthy(Duration::from_secs(300)).await; - - // Verify sandbox survived. - let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; - let clean_list = strip_ansi(&list_output); - assert_eq!( - list_code, 0, - "sandbox list should succeed:\n{clean_list}" - ); +#[tokio::test] +async fn docker_gateway_restart_resumes_running_sandbox() { + let Some(gateway) = ManagedGateway::from_env().expect("load managed e2e gateway metadata") + else { + eprintln!("Skipping gateway resume test: e2e gateway is not managed by this test run"); + return; + }; + let Some(namespace) = std::env::var("OPENSHELL_E2E_DOCKER_NETWORK_NAME") + .ok() + .filter(|value| !value.trim().is_empty()) + else { + eprintln!("Skipping gateway resume test: Docker e2e namespace is unavailable"); + return; + }; + + wait_for_healthy(Duration::from_secs(30)) + .await + .expect("gateway should start healthy"); + + let mut sandbox = SandboxGuard::create_keep( + &[ + "sh", + "-c", + "echo gateway-resume-ready; while true; do sleep 1; done", + ], + READY_MARKER, + ) + .await + .expect("create long-running sandbox"); + + wait_for_container_running(&namespace, &sandbox.name, true, Duration::from_secs(60)) + .await + .expect("sandbox container should be running before gateway restart"); + + gateway.stop().expect("stop e2e gateway"); + wait_for_container_running(&namespace, &sandbox.name, false, Duration::from_secs(120)) + .await + .expect("gateway shutdown should stop managed Docker sandboxes"); + + gateway.start().expect("restart e2e gateway"); + wait_for_healthy(Duration::from_secs(120)) + .await + .expect("gateway should become healthy after restart"); + wait_for_container_running(&namespace, &sandbox.name, true, Duration::from_secs(120)) + .await + .expect("gateway startup should resume the Docker sandbox container"); + + let names = sandbox_names().await.expect("list sandboxes after restart"); assert!( - clean_list.contains(&sandbox_name), - "sandbox '{sandbox_name}' should survive stop/start.\nList:\n{clean_list}" - ); - - let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; -} - -// --------------------------------------------------------------------------- -// Scenario: container killed → resume with stale network -// --------------------------------------------------------------------------- - -async fn scenario_container_kill_resumes() { - eprintln!("--- scenario: container kill resumes ---"); - - let cname = container_name(); - let net_name = format!("openshell-cluster-{}", gateway_name()); - - // Kill the container. - let (_, kill_code) = docker_cmd(&["kill", &cname]); - assert_eq!(kill_code, 0, "docker kill should succeed"); - sleep(Duration::from_secs(3)).await; - - // Remove the network to simulate a stale network reference. - // The bootstrap `ensure_network` always destroys and recreates, so - // after this the container's stored network ID will be invalid. - let _ = docker_cmd(&["network", "disconnect", "-f", &net_name, &cname]); - let (_, net_rm_code) = docker_cmd(&["network", "rm", &net_name]); - assert_eq!( - net_rm_code, 0, - "docker network rm should succeed" - ); - - // Resume — must handle stale network + reuse existing PKI. - start_gateway().await; - wait_for_healthy(Duration::from_secs(300)).await; -} - -// --------------------------------------------------------------------------- -// Scenario: container removed → resume from volume -// --------------------------------------------------------------------------- - -async fn scenario_container_removal_resumes() { - eprintln!("--- scenario: container removal resumes ---"); - - // Force-remove the container. - let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]); - assert_eq!(rm_code, 0, "docker rm -f should succeed"); - - // Volume should survive. - let (vol_out, vol_code) = docker_cmd(&[ - "volume", - "inspect", - &format!("openshell-cluster-{}", gateway_name()), - ]); - assert_eq!( - vol_code, 0, - "volume should still exist after container removal:\n{vol_out}" + names.contains(&sandbox.name), + "sandbox '{}' should still be listed after gateway restart. Names: {names:?}", + sandbox.name ); - // Resume from volume. - start_gateway().await; - wait_for_healthy(Duration::from_secs(300)).await; + sandbox.cleanup().await; } diff --git a/e2e/rust/tests/host_gateway_alias.rs b/e2e/rust/tests/host_gateway_alias.rs index 083f5e6ee..2dbdbf1dc 100644 --- a/e2e/rust/tests/host_gateway_alias.rs +++ b/e2e/rust/tests/host_gateway_alias.rs @@ -4,20 +4,19 @@ #![cfg(feature = "e2e")] use std::io::Write; -use std::process::Command; use std::process::Stdio; use std::sync::Mutex; -use std::time::Duration; use openshell_e2e::harness::binary::openshell_cmd; -use openshell_e2e::harness::port::find_free_port; use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; -use tokio::time::{interval, timeout}; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncWriteExt; +use tokio::net::TcpListener; +use tokio::task::JoinHandle; const INFERENCE_PROVIDER_NAME: &str = "e2e-host-inference"; const INFERENCE_PROVIDER_UNREACHABLE_NAME: &str = "e2e-host-inference-unreachable"; -const TEST_SERVER_IMAGE: &str = "public.ecr.aws/docker/library/python:3.13-alpine"; static INFERENCE_ROUTE_LOCK: Mutex<()> = Mutex::new(()); async fn run_cli(args: &[&str]) -> Result { @@ -44,117 +43,63 @@ async fn run_cli(args: &[&str]) -> Result { Ok(combined) } -struct DockerServer { +struct HostServer { port: u16, - container_id: String, + task: JoinHandle<()>, } -impl DockerServer { +impl HostServer { async fn start(response_body: &str) -> Result { - let port = find_free_port(); - let script = r#"from http.server import BaseHTTPRequestHandler, HTTPServer -import os - -BODY = os.environ["RESPONSE_BODY"].encode() - -class Handler(BaseHTTPRequestHandler): - def do_GET(self): - self.send_response(200) - self.send_header("Content-Type", "application/json") - self.send_header("Content-Length", str(len(BODY))) - self.end_headers() - self.wfile.write(BODY) - - def do_POST(self): - length = int(self.headers.get("Content-Length", "0")) - if length: - self.rfile.read(length) - self.send_response(200) - self.send_header("Content-Type", "application/json") - self.send_header("Content-Length", str(len(BODY))) - self.end_headers() - self.wfile.write(BODY) - - def log_message(self, format, *args): - pass - -HTTPServer(("0.0.0.0", 8000), Handler).serve_forever() -"#; - - let output = Command::new("docker") - .args([ - "run", - "--detach", - "--rm", - "-e", - &format!("RESPONSE_BODY={response_body}"), - "-p", - &format!("{port}:8000"), - TEST_SERVER_IMAGE, - "python3", - "-c", - script, - ]) - .output() - .map_err(|e| format!("start docker test server: {e}"))?; - - let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string(); - let stderr = String::from_utf8_lossy(&output.stderr).to_string(); - - if !output.status.success() { - return Err(format!( - "docker run failed (exit {:?}):\n{stderr}", - output.status.code() - )); - } - - let server = Self { - port, - container_id: stdout, - }; - server.wait_until_ready().await?; - Ok(server) - } - - async fn wait_until_ready(&self) -> Result<(), String> { - let container_id = self.container_id.clone(); - timeout(Duration::from_secs(60), async move { - let mut tick = interval(Duration::from_millis(500)); + let listener = TcpListener::bind(("0.0.0.0", 0)) + .await + .map_err(|e| format!("bind host test server: {e}"))?; + let port = listener + .local_addr() + .map_err(|e| format!("read host test server address: {e}"))? + .port(); + let response_body = response_body.as_bytes().to_vec(); + let task = tokio::spawn(async move { loop { - tick.tick().await; - let output = Command::new("docker") - .args([ - "exec", - &container_id, - "python3", - "-c", - "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000', timeout=1).read()", - ]) - .output(); - - match output { - Ok(result) if result.status.success() => return Ok(()), - Ok(_) | Err(_) => continue, - } + let Ok((mut stream, _)) = listener.accept().await else { + break; + }; + let body = response_body.clone(); + tokio::spawn(async move { + let mut request = Vec::new(); + let mut buf = [0_u8; 1024]; + loop { + let Ok(read) = stream.read(&mut buf).await else { + return; + }; + if read == 0 { + return; + } + request.extend_from_slice(&buf[..read]); + if request.windows(4).any(|window| window == b"\r\n\r\n") { + break; + } + } + + let response = format!( + "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n", + body.len() + ); + if stream.write_all(response.as_bytes()).await.is_err() { + return; + } + let _ = stream.write_all(&body).await; + let _ = stream.shutdown().await; + }); } - }) - .await - .map_err(|_| { - format!( - "docker test server {} did not become ready within 60s", - self.container_id - ) - })? + }); + + Ok(Self { port, task }) } } -impl Drop for DockerServer { +impl Drop for HostServer { fn drop(&mut self) { - let _ = Command::new("docker") - .args(["rm", "-f", &self.container_id]) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .status(); + self.task.abort(); } } @@ -228,7 +173,10 @@ network_policies: - host: host.openshell.internal port: {port} allowed_ips: + - "10.0.0.0/8" - "172.0.0.0/8" + - "192.168.0.0/16" + - "fc00::/7" binaries: - path: /usr/bin/curl "# @@ -242,7 +190,7 @@ network_policies: #[tokio::test] async fn sandbox_reaches_host_openshell_internal_via_host_gateway_alias() { - let server = DockerServer::start(r#"{"message":"hello-from-host"}"#) + let server = HostServer::start(r#"{"message":"hello-from-host"}"#) .await .expect("start host echo server"); let policy = write_policy(server.port).expect("write custom policy"); @@ -289,7 +237,7 @@ async fn sandbox_inference_local_routes_to_host_openshell_internal() { return; } - let server = DockerServer::start( + let server = HostServer::start( r#"{"id":"chatcmpl-test","object":"chat.completion","created":1,"model":"host-echo","choices":[{"index":0,"message":{"role":"assistant","content":"hello-from-host"},"finish_reason":"stop"}]}"#, ) .await @@ -299,7 +247,7 @@ async fn sandbox_inference_local_routes_to_host_openshell_internal() { delete_provider(INFERENCE_PROVIDER_NAME).await; } - let create_output = create_openai_provider( + create_openai_provider( INFERENCE_PROVIDER_NAME, &format!("http://host.openshell.internal:{}/v1", server.port), ) @@ -313,17 +261,14 @@ async fn sandbox_inference_local_routes_to_host_openshell_internal() { INFERENCE_PROVIDER_NAME, "--model", "host-echo-model", + "--no-verify", ]) .await .expect("point inference.local at host-backed provider"); assert!( - inference_output.contains("Validated Endpoints:"), - "expected verification details in output:\n{inference_output}" - ); - assert!( - inference_output.contains("/v1/chat/completions (openai_chat_completions)"), - "expected validated endpoint in output:\n{inference_output}" + !inference_output.contains("Validated Endpoints:"), + "did not expect local CLI verification for host-only alias:\n{inference_output}" ); let guard = SandboxGuard::create(&[ @@ -352,8 +297,6 @@ async fn sandbox_inference_local_routes_to_host_openshell_internal() { "expected sandbox to receive echoed inference content:\n{}", guard.create_output ); - - let _ = create_output; } #[tokio::test] diff --git a/e2e/with-docker-gateway.sh b/e2e/with-docker-gateway.sh new file mode 100755 index 000000000..f1edcf05b --- /dev/null +++ b/e2e/with-docker-gateway.sh @@ -0,0 +1,531 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run an e2e command against a Docker-backed OpenShell gateway. +# +# Modes: +# - OPENSHELL_GATEWAY_ENDPOINT unset: +# Build and start an ephemeral standalone gateway with the Docker compute +# driver, then run the command against that gateway. +# - OPENSHELL_GATEWAY_ENDPOINT=http://host:port: +# Use the existing plaintext gateway endpoint and run the command. +# +# HTTPS endpoint-only mode is intentionally unsupported here. Use a named +# gateway config when mTLS materials are needed. + +set -euo pipefail + +if [ "$#" -eq 0 ]; then + echo "Usage: e2e/with-docker-gateway.sh [args...]" >&2 + exit 2 +fi + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +cargo_target_dir() { + if [ -n "${CARGO_TARGET_DIR:-}" ]; then + case "${CARGO_TARGET_DIR}" in + /*) printf '%s\n' "${CARGO_TARGET_DIR}" ;; + *) printf '%s\n' "${ROOT}/${CARGO_TARGET_DIR}" ;; + esac + return 0 + fi + + cargo metadata --format-version=1 --no-deps \ + | python3 -c 'import json, sys; print(json.load(sys.stdin)["target_directory"])' +} + +github_actions_host_docker_tmpdir() { + if [ "${GITHUB_ACTIONS:-}" != "true" ] \ + || [ ! -S /var/run/docker.sock ] \ + || [ ! -d /__w/_temp ]; then + return 1 + fi + + # Container jobs talk to the host Docker daemon. Bind mount source paths must + # exist on the host, but the gateway also validates those same paths inside + # the job container before handing them to Docker. This must be a real mount + # rather than a symlink because the Docker driver canonicalizes file paths. + if [ -d /home/runner/_work/_temp ]; then + printf '%s\n' /home/runner/_work/_temp + return 0 + fi + + echo "ERROR: GitHub Actions Docker e2e requires /home/runner/_work mounted inside the job container." >&2 + echo " Mount /home/runner/_work:/home/runner/_work so Docker bind paths resolve on both sides." >&2 + return 2 +} + +if WORKDIR_PARENT="$(github_actions_host_docker_tmpdir)"; then + : +else + status=$? + if [ "${status}" -eq 2 ]; then + exit 2 + fi + WORKDIR_PARENT="${TMPDIR:-/tmp}" +fi +WORKDIR_PARENT="${WORKDIR_PARENT%/}" +WORKDIR="$(mktemp -d "${WORKDIR_PARENT}/openshell-e2e-gateway.XXXXXX")" +GATEWAY_BIN="" +CLI_BIN="" +GATEWAY_PID="" +GATEWAY_LOG="${WORKDIR}/gateway.log" +GATEWAY_PID_FILE="${WORKDIR}/gateway.pid" +GATEWAY_ARGS_FILE="${WORKDIR}/gateway.args" +GATEWAY_CONFIG_DIR="" +E2E_NAMESPACE="" +DOCKER_NETWORK_NAME="" +DOCKER_NETWORK_CONNECTED_CONTAINER="" +DOCKER_NETWORK_MANAGED=0 +GPU_MODE="${OPENSHELL_E2E_DOCKER_GPU:-0}" + +# Isolate CLI/SDK gateway metadata from the developer's real config. +export XDG_CONFIG_HOME="${WORKDIR}/config" + +cleanup() { + local exit_code=$? + + local gateway_pid="${GATEWAY_PID}" + if [ -f "${GATEWAY_PID_FILE}" ]; then + gateway_pid="$(cat "${GATEWAY_PID_FILE}" 2>/dev/null || true)" + fi + if [ -n "${gateway_pid}" ] && kill -0 "${gateway_pid}" 2>/dev/null; then + echo "Stopping openshell-gateway (pid ${gateway_pid})..." + kill "${gateway_pid}" 2>/dev/null || true + wait "${gateway_pid}" 2>/dev/null || true + fi + + if [ "${exit_code}" -ne 0 ] \ + && [ -n "${E2E_NAMESPACE}" ] \ + && command -v docker >/dev/null 2>&1; then + local ids + ids=$(docker ps -aq \ + --filter "label=openshell.ai/managed-by=openshell" \ + --filter "label=openshell.ai/sandbox-namespace=${E2E_NAMESPACE}" \ + 2>/dev/null || true) + if [ -n "${ids}" ]; then + echo "=== sandbox container logs (preserved for debugging) ===" + for id in ${ids}; do + echo "--- container ${id} (inspect) ---" + docker inspect --format '{{.Name}} state={{.State.Status}} exit={{.State.ExitCode}} restarts={{.RestartCount}} error={{.State.Error}}' "${id}" 2>/dev/null || true + echo "--- container ${id} (last 80 log lines) ---" + docker logs --tail 80 "${id}" 2>&1 || true + done + echo "=== end sandbox container logs ===" + fi + fi + + if [ -n "${E2E_NAMESPACE}" ] && command -v docker >/dev/null 2>&1; then + local stale + stale=$(docker ps -aq \ + --filter "label=openshell.ai/managed-by=openshell" \ + --filter "label=openshell.ai/sandbox-namespace=${E2E_NAMESPACE}" \ + 2>/dev/null || true) + if [ -n "${stale}" ]; then + # shellcheck disable=SC2086 + docker rm -f ${stale} >/dev/null 2>&1 || true + fi + fi + + if [ -n "${DOCKER_NETWORK_CONNECTED_CONTAINER}" ] \ + && [ -n "${DOCKER_NETWORK_NAME}" ] \ + && command -v docker >/dev/null 2>&1; then + docker network disconnect -f \ + "${DOCKER_NETWORK_NAME}" \ + "${DOCKER_NETWORK_CONNECTED_CONTAINER}" >/dev/null 2>&1 || true + fi + + if [ "${DOCKER_NETWORK_MANAGED}" = "1" ] \ + && [ -n "${DOCKER_NETWORK_NAME}" ] \ + && command -v docker >/dev/null 2>&1; then + docker network rm "${DOCKER_NETWORK_NAME}" >/dev/null 2>&1 || true + fi + + if [ "${exit_code}" -ne 0 ] && [ -f "${GATEWAY_LOG}" ]; then + echo "=== gateway log (preserved for debugging) ===" + cat "${GATEWAY_LOG}" + echo "=== end gateway log ===" + fi + + rm -rf "${WORKDIR}" 2>/dev/null || true +} +trap cleanup EXIT + +register_plaintext_gateway() { + local name=$1 + local endpoint=$2 + local port=$3 + + GATEWAY_CONFIG_DIR="${XDG_CONFIG_HOME}/openshell/gateways/${name}" + mkdir -p "${GATEWAY_CONFIG_DIR}" + cat >"${GATEWAY_CONFIG_DIR}/metadata.json" <"${XDG_CONFIG_HOME}/openshell/active_gateway" +} + +register_mtls_gateway() { + local name=$1 + local endpoint=$2 + local port=$3 + local pki_dir=$4 + + GATEWAY_CONFIG_DIR="${XDG_CONFIG_HOME}/openshell/gateways/${name}" + mkdir -p "${GATEWAY_CONFIG_DIR}/mtls" + cp "${pki_dir}/ca.crt" "${GATEWAY_CONFIG_DIR}/mtls/ca.crt" + cp "${pki_dir}/client.crt" "${GATEWAY_CONFIG_DIR}/mtls/tls.crt" + cp "${pki_dir}/client.key" "${GATEWAY_CONFIG_DIR}/mtls/tls.key" + cat >"${GATEWAY_CONFIG_DIR}/metadata.json" <"${XDG_CONFIG_HOME}/openshell/active_gateway" +} + +endpoint_port() { + python3 - "$1" <<'PY' +import sys +from urllib.parse import urlparse + +parsed = urlparse(sys.argv[1]) +print(parsed.port or (443 if parsed.scheme == "https" else 80)) +PY +} + +pick_port() { + python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()' +} + +ensure_e2e_docker_network() { + local network=$1 + + if docker network inspect "${network}" >/dev/null 2>&1; then + return 0 + fi + + docker network create \ + --driver bridge \ + --attachable \ + --label openshell.ai/managed-by=openshell \ + --label "openshell.ai/sandbox-namespace=${E2E_NAMESPACE}" \ + "${network}" >/dev/null + DOCKER_NETWORK_MANAGED=1 +} + +github_actions_container_id() { + if [ "${GITHUB_ACTIONS:-}" != "true" ] || [ ! -f /.dockerenv ]; then + return 1 + fi + + local container + container="$(hostname)" + if docker inspect "${container}" >/dev/null 2>&1; then + printf '%s\n' "${container}" + return 0 + fi + + return 1 +} + +connect_current_container_to_docker_network() { + local network=$1 + local container + + if ! container="$(github_actions_container_id)"; then + return 1 + fi + + local connect_err="${WORKDIR}/docker-network-connect.err" + if ! docker network connect \ + --alias host.openshell.internal \ + "${network}" \ + "${container}" 2>"${connect_err}"; then + if ! grep -qi "already exists" "${connect_err}"; then + cat "${connect_err}" >&2 + return 1 + fi + fi + + DOCKER_NETWORK_CONNECTED_CONTAINER="${container}" + + local container_ip + container_ip="$(docker inspect \ + --format "{{with index .NetworkSettings.Networks \"${network}\"}}{{.IPAddress}}{{end}}" \ + "${container}")" + if [ -z "${container_ip}" ]; then + echo "ERROR: failed to resolve current job container IP on Docker network ${network}" >&2 + return 1 + fi + + GATEWAY_HOST_ALIAS_IP="${container_ip}" +} + +if [ -n "${OPENSHELL_GATEWAY_ENDPOINT:-}" ]; then + case "${OPENSHELL_GATEWAY_ENDPOINT}" in + http://*) ;; + https://*) + echo "ERROR: OPENSHELL_GATEWAY_ENDPOINT endpoint mode is HTTP-only for e2e." >&2 + echo " Register a named gateway with mTLS config instead of using a raw HTTPS endpoint." >&2 + exit 2 + ;; + *) + echo "ERROR: OPENSHELL_GATEWAY_ENDPOINT must start with http:// for e2e endpoint mode." >&2 + exit 2 + ;; + esac + + GATEWAY_NAME="${OPENSHELL_GATEWAY:-openshell-e2e-endpoint}" + register_plaintext_gateway "${GATEWAY_NAME}" "${OPENSHELL_GATEWAY_ENDPOINT}" "$(endpoint_port "${OPENSHELL_GATEWAY_ENDPOINT}")" + export OPENSHELL_GATEWAY="${GATEWAY_NAME}" + export OPENSHELL_PROVISION_TIMEOUT="${OPENSHELL_PROVISION_TIMEOUT:-180}" + + echo "Using existing e2e gateway endpoint: ${OPENSHELL_GATEWAY_ENDPOINT}" + "$@" + exit $? +fi + +# ── Preflight for managed Docker gateway mode ──────────────────────── +if ! command -v docker >/dev/null 2>&1; then + echo "ERROR: docker CLI is required to run Docker-backed e2e tests" >&2 + exit 2 +fi +if ! docker info >/dev/null 2>&1; then + echo "ERROR: docker daemon is not reachable (docker info failed)" >&2 + exit 2 +fi +if ! command -v openssl >/dev/null 2>&1; then + echo "ERROR: openssl is required to generate ephemeral PKI" >&2 + exit 2 +fi +if [ "${GPU_MODE}" = "1" ]; then + DOCKER_CDI_SPEC_DIRS="$(docker info --format '{{json .CDISpecDirs}}' 2>/dev/null || true)" + if [ -z "${DOCKER_CDI_SPEC_DIRS}" ] \ + || [ "${DOCKER_CDI_SPEC_DIRS}" = "null" ] \ + || [ "${DOCKER_CDI_SPEC_DIRS}" = "[]" ] \ + || [ "${DOCKER_CDI_SPEC_DIRS}" = "" ]; then + echo "ERROR: Docker GPU e2e requires Docker CDI support." >&2 + echo " Generate CDI specs and restart Docker, then verify docker info reports CDISpecDirs." >&2 + exit 2 + fi +fi + +normalize_arch() { + case "$1" in + x86_64|amd64) echo "amd64" ;; + aarch64|arm64) echo "arm64" ;; + *) echo "$1" ;; + esac +} + +linux_target_triple() { + case "$1" in + amd64) echo "x86_64-unknown-linux-gnu" ;; + arm64) echo "aarch64-unknown-linux-gnu" ;; + *) + echo "ERROR: unsupported Docker daemon architecture '$1'" >&2 + exit 2 + ;; + esac +} + +DAEMON_ARCH="$(normalize_arch "$(docker info --format '{{.Architecture}}' 2>/dev/null || true)")" +SUPERVISOR_TARGET="$(linux_target_triple "${DAEMON_ARCH}")" +HOST_OS="$(uname -s)" +HOST_ARCH="$(normalize_arch "$(uname -m)")" +SUPERVISOR_OUT_DIR="${WORKDIR}/supervisor/${DAEMON_ARCH}" +SUPERVISOR_BIN="${SUPERVISOR_OUT_DIR}/openshell-sandbox" + +CARGO_BUILD_JOBS_ARG=() +if [ -n "${CARGO_BUILD_JOBS:-}" ]; then + CARGO_BUILD_JOBS_ARG=(-j "${CARGO_BUILD_JOBS}") +fi + +TARGET_DIR="$(cargo_target_dir)" +GATEWAY_BIN="${TARGET_DIR}/debug/openshell-gateway" +CLI_BIN="${TARGET_DIR}/debug/openshell" + +echo "Building openshell-gateway..." +cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ + -p openshell-server --bin openshell-gateway \ + --features openshell-core/dev-settings + +echo "Building openshell-cli..." +cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ + -p openshell-cli --bin openshell \ + --features openshell-core/dev-settings + +if [ ! -x "${GATEWAY_BIN}" ]; then + echo "ERROR: expected openshell-gateway binary at ${GATEWAY_BIN}" >&2 + exit 1 +fi +if [ ! -x "${CLI_BIN}" ]; then + echo "ERROR: expected openshell CLI binary at ${CLI_BIN}" >&2 + exit 1 +fi + +echo "Building openshell-sandbox for ${SUPERVISOR_TARGET}..." +mkdir -p "${SUPERVISOR_OUT_DIR}" +if [ "${HOST_OS}" = "Linux" ] && [ "${HOST_ARCH}" = "${DAEMON_ARCH}" ]; then + rustup target add "${SUPERVISOR_TARGET}" >/dev/null 2>&1 || true + cargo build ${CARGO_BUILD_JOBS_ARG[@]+"${CARGO_BUILD_JOBS_ARG[@]}"} \ + --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" + cp "${TARGET_DIR}/${SUPERVISOR_TARGET}/release/openshell-sandbox" "${SUPERVISOR_BIN}" +else + CONTAINER_ENGINE=docker \ + DOCKER_PLATFORM="linux/${DAEMON_ARCH}" \ + DOCKER_OUTPUT="type=local,dest=${SUPERVISOR_OUT_DIR}" \ + bash "${ROOT}/tasks/scripts/docker-build-image.sh" supervisor-output +fi + +if [ ! -f "${SUPERVISOR_BIN}" ]; then + echo "ERROR: expected supervisor binary at ${SUPERVISOR_BIN}" >&2 + exit 1 +fi +chmod +x "${SUPERVISOR_BIN}" + +DEFAULT_SANDBOX_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +SANDBOX_IMAGE="${OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE:-${OPENSHELL_SANDBOX_IMAGE:-${DEFAULT_SANDBOX_IMAGE}}}" +if ! docker image inspect "${SANDBOX_IMAGE}" >/dev/null 2>&1; then + echo "Pulling ${SANDBOX_IMAGE}..." + docker pull "${SANDBOX_IMAGE}" +fi + +PKI_DIR="${WORKDIR}/pki" +mkdir -p "${PKI_DIR}" +cd "${PKI_DIR}" + +cat > openssl.cnf <<'EOF' +[req] +distinguished_name = dn +prompt = no +[dn] +CN = openshell-server +[san_server] +subjectAltName = @alt_server +[alt_server] +DNS.1 = localhost +DNS.2 = host.openshell.internal +DNS.3 = host.docker.internal +IP.1 = 127.0.0.1 +IP.2 = ::1 +[san_client] +subjectAltName = DNS:openshell-client +EOF + +openssl req -x509 -newkey rsa:2048 -nodes -days 30 \ + -keyout ca.key -out ca.crt -subj "/CN=openshell-e2e-ca" >/dev/null 2>&1 + +openssl req -newkey rsa:2048 -nodes -keyout server.key -out server.csr \ + -config openssl.cnf >/dev/null 2>&1 +openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out server.crt -days 30 -extfile openssl.cnf -extensions san_server >/dev/null 2>&1 + +openssl req -newkey rsa:2048 -nodes -keyout client.key -out client.csr \ + -subj "/CN=openshell-client" >/dev/null 2>&1 +openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out client.crt -days 30 -extfile openssl.cnf -extensions san_client >/dev/null 2>&1 + +cd "${ROOT}" + +HOST_PORT=$(pick_port) +STATE_DIR="${WORKDIR}/state" +mkdir -p "${STATE_DIR}" + +GATEWAY_ENDPOINT="https://host.openshell.internal:${HOST_PORT}" +E2E_NAMESPACE="e2e-docker-$$-${HOST_PORT}" +DOCKER_NETWORK_NAME="${E2E_NAMESPACE}" +GATEWAY_HOST_ALIAS_IP="" + +ensure_e2e_docker_network "${DOCKER_NETWORK_NAME}" +export OPENSHELL_E2E_DOCKER_NETWORK_NAME="${DOCKER_NETWORK_NAME}" +if connect_current_container_to_docker_network "${DOCKER_NETWORK_NAME}"; then + echo "Connected CI job container to Docker network ${DOCKER_NETWORK_NAME} (${GATEWAY_HOST_ALIAS_IP})." +else + GATEWAY_HOST_ALIAS_IP="" +fi + +echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." +GATEWAY_ARGS=( + --bind-address 0.0.0.0 \ + --port "${HOST_PORT}" \ + --drivers docker \ + --sandbox-namespace "${E2E_NAMESPACE}" \ + --docker-network-name "${DOCKER_NETWORK_NAME}" \ + --tls-cert "${PKI_DIR}/server.crt" \ + --tls-key "${PKI_DIR}/server.key" \ + --tls-client-ca "${PKI_DIR}/ca.crt" \ + --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" \ + --grpc-endpoint "${GATEWAY_ENDPOINT}" \ + --docker-supervisor-bin "${SUPERVISOR_BIN}" \ + --docker-tls-ca "${PKI_DIR}/ca.crt" \ + --docker-tls-cert "${PKI_DIR}/client.crt" \ + --docker-tls-key "${PKI_DIR}/client.key" \ + --sandbox-image "${SANDBOX_IMAGE}" \ + --sandbox-image-pull-policy IfNotPresent +) +if [ -n "${GATEWAY_HOST_ALIAS_IP}" ]; then + GATEWAY_ARGS+=(--host-gateway-ip "${GATEWAY_HOST_ALIAS_IP}") +fi + +: >"${GATEWAY_ARGS_FILE}" +for arg in "${GATEWAY_ARGS[@]}"; do + printf '%s\0' "${arg}" >>"${GATEWAY_ARGS_FILE}" +done +export OPENSHELL_E2E_GATEWAY_BIN="${GATEWAY_BIN}" +export OPENSHELL_E2E_GATEWAY_ARGS_FILE="${GATEWAY_ARGS_FILE}" +export OPENSHELL_E2E_GATEWAY_LOG="${GATEWAY_LOG}" +export OPENSHELL_E2E_GATEWAY_PID_FILE="${GATEWAY_PID_FILE}" + +"${GATEWAY_BIN}" "${GATEWAY_ARGS[@]}" >"${GATEWAY_LOG}" 2>&1 & +GATEWAY_PID=$! +printf '%s\n' "${GATEWAY_PID}" >"${GATEWAY_PID_FILE}" + +GATEWAY_NAME="openshell-e2e-docker-${HOST_PORT}" +CLI_GATEWAY_ENDPOINT="https://127.0.0.1:${HOST_PORT}" +register_mtls_gateway "${GATEWAY_NAME}" "${CLI_GATEWAY_ENDPOINT}" "${HOST_PORT}" "${PKI_DIR}" + +export OPENSHELL_GATEWAY="${GATEWAY_NAME}" +export OPENSHELL_PROVISION_TIMEOUT="${OPENSHELL_PROVISION_TIMEOUT:-180}" + +echo "Waiting for gateway to become healthy..." +elapsed=0 +timeout=120 +last_status_output="" +while [ "${elapsed}" -lt "${timeout}" ]; do + if ! kill -0 "${GATEWAY_PID}" 2>/dev/null; then + echo "ERROR: openshell-gateway exited before becoming healthy" + exit 1 + fi + if last_status_output="$("${CLI_BIN}" status 2>&1)"; then + echo "Gateway healthy after ${elapsed}s." + break + fi + sleep 2 + elapsed=$((elapsed + 2)) +done +if [ "${elapsed}" -ge "${timeout}" ]; then + echo "ERROR: gateway did not become healthy within ${timeout}s" + echo "=== last openshell status output ===" + if [ -n "${last_status_output}" ]; then + printf '%s\n' "${last_status_output}" + else + echo "" + fi + echo "=== end openshell status output ===" + exit 1 +fi + +echo "Running e2e command against ${CLI_GATEWAY_ENDPOINT}: $*" +"$@" diff --git a/tasks/test.toml b/tasks/test.toml index dd1e88941..82b49d576 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -28,21 +28,17 @@ run = "uv run pytest python/" hide = true ["e2e:rust"] -description = "Run Rust CLI e2e tests (requires a running cluster)" -depends = ["cluster"] +description = "Run Rust CLI e2e tests against a Docker-backed gateway" run = [ "cargo build -p openshell-cli --features openshell-core/dev-settings", - # gateway_resume and Docker GPU tests run in dedicated jobs with their own clusters. - # Dockerfile sources build into the host Docker daemon, so the custom image - # test belongs with Docker-backed gateway coverage rather than the k3s suite. - "cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- --skip gateway_resume_scenarios --skip docker_gpu_sandbox_runs_nvidia_smi --skip sandbox_from_custom_dockerfile", + "e2e/with-docker-gateway.sh cargo test --manifest-path e2e/rust/Cargo.toml --features e2e -- --skip docker_gpu_sandbox_runs_nvidia_smi", ] ["e2e:python"] -description = "Run Python e2e tests (E2E_PARALLEL=N or 'auto'; default 5)" -depends = ["python:proto", "cluster"] +description = "Run Python e2e tests against a Docker-backed gateway (E2E_PARALLEL=N or 'auto'; default 5)" +depends = ["python:proto"] env = { UV_NO_SYNC = "1", PYTHONPATH = "python" } -run = "uv run pytest -o python_files='test_*.py' -m 'not gpu' -n ${E2E_PARALLEL:-5} e2e/python" +run = "e2e/with-docker-gateway.sh uv run pytest -o python_files='test_*.py' -m 'not gpu' -n ${E2E_PARALLEL:-5} e2e/python" ["e2e:python:gpu"] description = "Run Python GPU e2e tests"