diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fc84dc777..cb425e8b7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -263,6 +263,18 @@ jobs: run: deno task build working-directory: frontend + - name: Install Rust + uses: dsherret/rust-toolchain-file@v1 + + - name: Add wasm target + run: rustup target add wasm32-unknown-unknown + + - name: Build API Cloudflare Worker + run: | + cargo install worker-build --locked + worker-build --release + working-directory: workers-rs + - name: terraform plan run: | touch terraform/staging.secret.tfvars @@ -335,6 +347,18 @@ jobs: run: deno task build working-directory: frontend + - name: Install Rust + uses: dsherret/rust-toolchain-file@v1 + + - name: Add wasm target + run: rustup target add wasm32-unknown-unknown + + - name: Build API Cloudflare Worker + run: | + cargo install worker-build --locked + worker-build --release + working-directory: workers-rs + - name: terraform plan run: | touch terraform/prod.secret.tfvars diff --git a/api/src/config.rs b/api/src/config.rs index 6c30a6655..fd124f6c7 100644 --- a/api/src/config.rs +++ b/api/src/config.rs @@ -218,6 +218,16 @@ pub struct Config { #[clap(long = "database_pool_size", default_value = "3")] /// The size of the database connection pool. pub database_pool_size: u32, + + #[clap(long = "db_client_cert", env = "DB_CLIENT_CERT")] + /// PEM client certificate presented when connecting to the database over + /// TLS. Required once the DB enforces `TRUSTED_CLIENT_CERTIFICATE_REQUIRED`; + /// both the cert and the key must be set together to take effect. + pub db_client_cert: Option, + + #[clap(long = "db_client_key", env = "DB_CLIENT_KEY")] + /// PEM private key matching `db_client_cert`. 
+ pub db_client_key: Option, } impl std::fmt::Debug for Config { @@ -247,6 +257,11 @@ impl std::fmt::Debug for Config { ) .field("email_from", &self.email_from) .field("email_from_name", &self.email_from_name) + .field( + "db_client_cert", + &self.db_client_cert.as_ref().map(|_| "***"), + ) + .field("db_client_key", &self.db_client_key.as_ref().map(|_| "***")) .finish() } } diff --git a/api/src/db/database.rs b/api/src/db/database.rs index b025aa853..0a586732d 100644 --- a/api/src/db/database.rs +++ b/api/src/db/database.rs @@ -14,7 +14,10 @@ use sqlx::FromRow; use sqlx::Result; use sqlx::Row; use sqlx::migrate; +use sqlx::postgres::PgConnectOptions; use sqlx::postgres::PgPoolOptions; +use sqlx::postgres::PgSslMode; +use std::str::FromStr; use tracing::instrument; use uuid::Uuid; @@ -54,16 +57,42 @@ pub struct Database { pool: sqlx::PgPool, } +/// Client-certificate TLS material for connecting to the database. Supplied +/// when the database requires a client certificate (`ssl_mode = +/// TRUSTED_CLIENT_CERTIFICATE_REQUIRED`); the same cert is also presented by +/// the Hyperdrive-backed `api` Worker so both reach Cloud SQL over mTLS. +pub struct DbTls { + pub client_cert: String, + pub client_key: String, +} + impl Database { pub async fn connect( database_url: &str, pool_size: u32, acquire_timeout: std::time::Duration, + tls: Option, ) -> anyhow::Result { + let mut opts = PgConnectOptions::from_str(database_url)?; + if let Some(tls) = tls { + // Present our client cert (the DB requires one) and encrypt, but don't + // verify the server cert. We use `Require`, not `VerifyCa`: Cloud Run + // connects to Cloud SQL by private IP, yet the server cert is only valid + // for the instance's `*.sql.goog` DNS name. `VerifyCa` is meant to skip + // that hostname check, but sqlx 0.8's `NoHostnameTlsVerifier` only + // swallows rustls's legacy `NotValidForName` error, not 0.23's + // `NotValidForNameContext`, so verification fails and the connection is + // refused. 
The client certificate (mTLS) is the access boundary and the + // link stays inside the VPC. + opts = opts + .ssl_mode(PgSslMode::Require) + .ssl_client_cert_from_pem(tls.client_cert.into_bytes()) + .ssl_client_key_from_pem(tls.client_key.into_bytes()); + } let pool = PgPoolOptions::new() .max_connections(pool_size) .acquire_timeout(acquire_timeout) - .connect(database_url) + .connect_with(opts) .await?; if std::env::var("DATABASE_DISABLE_MIGRATIONS").is_err() { migrate!("./migrations") diff --git a/api/src/db/ephemeral_database.rs b/api/src/db/ephemeral_database.rs index 19d39788c..24744018a 100644 --- a/api/src/db/ephemeral_database.rs +++ b/api/src/db/ephemeral_database.rs @@ -54,9 +54,10 @@ impl EphemeralDatabase { pg_execute(format!("CREATE DATABASE \"{database_name}\"")); - let database = Database::connect(&database_url, 1, Duration::from_secs(5)) - .await - .unwrap(); + let database = + Database::connect(&database_url, 1, Duration::from_secs(5), None) + .await + .unwrap(); Self { database: Some(database), diff --git a/api/src/main.rs b/api/src/main.rs index ca1d586cf..050a077cc 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -180,10 +180,19 @@ async fn main() { }; setup_tracing("api", export_target, config.deployment_environment).await; + let db_tls = match (config.db_client_cert, config.db_client_key) { + (Some(client_cert), Some(client_key)) => Some(crate::db::DbTls { + client_cert, + client_key, + }), + _ => None, + }; + let database = Database::connect( &config.database_url, config.database_pool_size, Duration::from_secs(15), + db_tls, ) .await .unwrap(); diff --git a/lb/local.ts b/lb/local.ts index e6b8535a0..5ad33bc8c 100755 --- a/lb/local.ts +++ b/lb/local.ts @@ -196,9 +196,28 @@ const frontendShim: Fetcher = { // deno-lint-ignore no-explicit-any } as any; +// Local-dev shim for the API service binding: forwards to the API server's +// HTTP URL on localhost. 
In prod this binding points at the `api` Worker; for +// local dev we forward straight to the compute server so the harness keeps +// working without running the Worker separately. +const apiShim: Fetcher = { + fetch(req: Request | string | URL, init?: RequestInit) { + if (typeof req === "string" || req instanceof URL) { + return fetch( + new URL(new URL(req).pathname + new URL(req).search, REGISTRY_API_URL), + init, + ); + } + const url = new URL(req.url); + const target = new URL(url.pathname + url.search, REGISTRY_API_URL); + return fetch(new Request(target, req)); + }, + // deno-lint-ignore no-explicit-any +} as any; + function handler(req: Request): Promise { return main.fetch(req, { - REGISTRY_API_URL, + API: apiShim, FRONTEND: frontendShim, MODULES_BUCKET: new R2BucketShim(MODULES_BUCKET), NPM_BUCKET: new R2BucketShim(NPM_BUCKET), diff --git a/lb/main.ts b/lb/main.ts index bf0fe966e..8b24a4bd3 100644 --- a/lb/main.ts +++ b/lb/main.ts @@ -73,7 +73,7 @@ export async function handleAPIRequest( const response = await proxyToBackend( request, - env.REGISTRY_API_URL, + env.API, rewritePath ? (path) => `/api${path}` : undefined, ); diff --git a/lb/types.ts b/lb/types.ts index 71c468848..3f87262f5 100644 --- a/lb/types.ts +++ b/lb/types.ts @@ -13,7 +13,11 @@ declare global { export type PartialBucket = Pick; export interface WorkerEnv { - REGISTRY_API_URL: string; + // The API server is a sibling Cloudflare Worker (workers-rs), bound via a + // service binding rather than an HTTP URL so traffic stays inside Cloudflare. + // It serves the lightweight CRUD/DB/auth surface and itself proxies the + // compute-only paths to the Cloud Run compute service. + API: Fetcher; // The frontend is a sibling Cloudflare Worker, wired up via a service // binding rather than an HTTP URL so traffic stays inside Cloudflare. 
diff --git a/terraform/cloud_run_api.tf b/terraform/cloud_run_api.tf index 97aaadfb0..194cc8208 100644 --- a/terraform/cloud_run_api.tf +++ b/terraform/cloud_run_api.tf @@ -58,6 +58,15 @@ locals { "CLOUDFLARE_ACCOUNT_ID" = var.cloudflare_account_id "CLOUDFLARE_ZONE_ID" = var.cloudflare_zone_id "CLOUDFLARE_ANALYTICS_DATASET" = local.worker_download_analytics_dataset + + # Client certificate for the DB connection. The DB requires a client cert + # (ssl_mode = TRUSTED_CLIENT_CERTIFICATE_REQUIRED, see db.tf), so both Cloud + # Run services present it over the private VPC IP; the same cert is handed to + # the Hyperdrive config fronting the `api` Worker. The connection uses + # sslmode=require (encrypt + client auth, no server verification — we connect + # by IP), so no server CA is needed here. Plain env, like DATABASE_URL. + "DB_CLIENT_CERT" = google_sql_ssl_cert.api.cert + "DB_CLIENT_KEY" = google_sql_ssl_cert.api.private_key }) } diff --git a/terraform/cloudflare_api.tf b/terraform/cloudflare_api.tf new file mode 100644 index 000000000..04ff63873 --- /dev/null +++ b/terraform/cloudflare_api.tf @@ -0,0 +1,127 @@ +// Copyright 2024 the JSR authors. All rights reserved. MIT license. + +// The `api` Cloudflare Worker (workers-rs, wasm32) that fronts `api.jsr.io`. +// It serves the lightweight CRUD/DB/auth surface directly — reaching the +// existing Postgres through Cloudflare Hyperdrive (no sqlx) — and proxies the +// compute-only paths (publish, docs, source, diff, graph, /tasks/*) to the +// Cloud Run compute service. The `lb` Worker service-binds this Worker for the +// `api.jsr.io` backend (see lb.tf), exactly as it already does the frontend. +// +// Deployed as the same worker/version/deployment triple as the frontend (see +// cloudflare_frontend.tf): an immutable version holding the built wasm bundle, +// and a deployment pinning 100% of traffic to it. 
+ +# Hyperdrive carries the client certificate the Worker presents to Cloud SQL, +# uploaded as an account mTLS certificate. Same cert/key as google_sql_ssl_cert +# .api, which both Cloud Run services also present (see db.tf, cloud_run_api.tf). +resource "cloudflare_mtls_certificate" "api_db_client" { + account_id = var.cloudflare_account_id + name = "${var.gcp_project}-jsr-api-db-client" + certificates = google_sql_ssl_cert.api.cert + private_key = google_sql_ssl_cert.api.private_key + ca = false +} + +# Cloud SQL's server CA, uploaded so Hyperdrive can verify the origin (verify-ca: +# we connect by IP, so the hostname isn't checked, but the CA is). +resource "cloudflare_mtls_certificate" "api_db_ca" { + account_id = var.cloudflare_account_id + name = "${var.gcp_project}-jsr-api-db-ca" + certificates = google_sql_ssl_cert.api.server_ca_cert + ca = true +} + +# Hyperdrive connection to the existing Postgres over the public IP, with the +# client certificate (mTLS) as the access boundary — the DB requires it +# (ssl_mode = TRUSTED_CLIENT_CERTIFICATE_REQUIRED, see db.tf). 
+resource "cloudflare_hyperdrive_config" "api" { + account_id = var.cloudflare_account_id + name = "${var.gcp_project}-jsr-api" + + origin = { + scheme = "postgres" + database = google_sql_database.database.name + host = google_sql_database_instance.main_pg15.public_ip_address + port = 5432 + user = google_sql_user.api.name + password = google_sql_user.api.password + } + + mtls = { + sslmode = "verify-ca" + ca_certificate_id = cloudflare_mtls_certificate.api_db_ca.id + mtls_certificate_id = cloudflare_mtls_certificate.api_db_client.id + } +} + +resource "cloudflare_worker" "jsr_api" { + account_id = var.cloudflare_account_id + name = "${var.gcp_project}-jsr-api" + + observability = { + enabled = true + logs = { + enabled = true + invocation_logs = false + head_sampling_rate = 0.01 + persist = false + destinations = [var.cloudflare_otlp_logs_destination] + } + traces = { + enabled = true + head_sampling_rate = 0.01 + persist = false + destinations = [var.cloudflare_otlp_traces_destination] + } + } +} + +resource "cloudflare_worker_version" "jsr_api" { + account_id = var.cloudflare_account_id + worker_id = cloudflare_worker.jsr_api.id + main_module = "index.js" + compatibility_date = "2026-05-19" + compatibility_flags = ["nodejs_compat"] + + # `worker-build --release` (run in CI before terraform) emits the bundle into + # workers-rs/build: the esbuild entrypoint `index.js` and the wasm it imports + # as `./index_bg.wasm` (build/worker/shim.mjs is only a back-compat re-export + # of ../index.js, used by `wrangler dev`). 
+ modules = [ + { + name = "index.js" + content_file = "${path.module}/../workers-rs/build/index.js" + content_type = "application/javascript+module" + }, + { + name = "index_bg.wasm" + content_file = "${path.module}/../workers-rs/build/index_bg.wasm" + content_type = "application/wasm" + }, + ] + + bindings = [ + { + type = "hyperdrive" + name = "HYPERDRIVE" + id = cloudflare_hyperdrive_config.api.id + }, { + # The Cloud Run compute service the Worker proxies compute-only paths to. + # Public Cloud Run URL (the same value lb used for REGISTRY_API_URL before + # the cutover); reached over `fetch` (see workers-rs proxy_to_compute). + type = "plain_text" + name = "COMPUTE_API_URL" + text = google_cloud_run_v2_service.registry_api.uri + } + ] +} + +resource "cloudflare_workers_deployment" "jsr_api" { + account_id = var.cloudflare_account_id + script_name = cloudflare_worker.jsr_api.name + strategy = "percentage" + versions = [{ + percentage = 100 + version_id = cloudflare_worker_version.jsr_api.id + }] +} diff --git a/terraform/db.tf b/terraform/db.tf index 09017b33e..af494bdf7 100644 --- a/terraform/db.tf +++ b/terraform/db.tf @@ -22,7 +22,20 @@ resource "google_sql_database_instance" "main_pg15" { ip_configuration { ipv4_enabled = true private_network = google_compute_network.main.self_link - ssl_mode = "ENCRYPTED_ONLY" + + # Cloudflare Hyperdrive (fronting the `api` Worker) reaches Cloud SQL over + # the public IP — Hyperdrive's egress isn't a pinnable range, so it can't + # be an allowlist entry. The public IP is left open and a required client + # certificate (mTLS) is the access boundary instead of a network ACL: the + # client key is secret, so only cert holders connect. Cloud Run still + # reaches the DB over the private VPC and presents the same cert (see + # google_sql_ssl_cert.api + cloud_run_api.tf). 
+ ssl_mode = "TRUSTED_CLIENT_CERTIFICATE_REQUIRED" + + authorized_networks { + name = "hyperdrive-public-egress" + value = "0.0.0.0/0" + } } backup_configuration { @@ -49,6 +62,16 @@ resource "google_sql_database" "database" { instance = google_sql_database_instance.main_pg15.name } +# Client certificate the API presents when connecting to Cloud SQL over TLS. +# Delivered to both Cloud Run services as env (see cloud_run_api.tf) and to the +# Hyperdrive config that fronts the `api` Worker (see cloudflare_api.tf). It is +# mandatory: ssl_mode above is TRUSTED_CLIENT_CERTIFICATE_REQUIRED, so any +# connection that does not present it is rejected. +resource "google_sql_ssl_cert" "api" { + common_name = "api-client" + instance = google_sql_database_instance.main_pg15.name +} + resource "google_sql_user" "api" { name = "api" instance = google_sql_database_instance.main_pg15.name diff --git a/terraform/lb.tf b/terraform/lb.tf index 1f8e0e0f9..bf0b3b443 100644 --- a/terraform/lb.tf +++ b/terraform/lb.tf @@ -53,12 +53,15 @@ resource "cloudflare_workers_script" "jsr_lb" { name = "NPM_DOMAIN" text = local.npm_domain }, { - # Cloud Run service URLs aren't secret; keep as plain_text so the - # current value is visible from the worker's bindings page (needed - # to verify the LB is pointing at a live Cloud Run revision). - type = "plain_text" - name = "REGISTRY_API_URL" - text = google_cloud_run_v2_service.registry_api.uri + # Service binding to the `api` Worker (workers-rs), which fronts + # api.jsr.io: it serves the lightweight CRUD/DB/auth surface and itself + # proxies the compute-only paths to Cloud Run. Replaces the former + # REGISTRY_API_URL plain_text backend — the LB no longer talks to Cloud Run + # directly. Like FRONTEND, the depends_on below waits for the api Worker's + # version to be promoted so the LB never references an un-promoted version. 
+ type = "service" + name = "API" + service = "${var.gcp_project}-jsr-api" }, { # Service binding to the frontend Worker. Terraform uploads new # versions via `cloudflare_worker_version.jsr_frontend` and @@ -85,7 +88,10 @@ resource "cloudflare_workers_script" "jsr_lb" { } ] - depends_on = [cloudflare_workers_deployment.jsr_frontend] + depends_on = [ + cloudflare_workers_deployment.jsr_frontend, + cloudflare_workers_deployment.jsr_api, + ] lifecycle { create_before_destroy = true diff --git a/workers-rs/README.md b/workers-rs/README.md index 066bfc4d0..ccf3df15d 100644 --- a/workers-rs/README.md +++ b/workers-rs/README.md @@ -22,9 +22,11 @@ the migration sequence completes, this Worker will: - `GET /api/stats` — front-page newest/updated/featured package lists. - `GET /api/metrics` — registry-wide package/version/user counts. -Everything else still returns `501 Not Implemented`; the remaining endpoint -groups land one PR at a time per the design doc's sequence. The Worker is not -yet fronting prod traffic. +Everything else is reverse-proxied to the Cloud Run compute service by the +axum fallback (`COMPUTE_API_URL`); the remaining endpoint groups are migrated to +run in-Worker one PR at a time per the design doc's sequence. The Worker fronts +`api.jsr.io` (the `lb` Worker service-binds it as the `API` backend), reaching +Postgres through Hyperdrive over mTLS. The `/api/stats` and `/api/metrics` handlers reach Postgres through Hyperdrive (`tokio-postgres`, no `sqlx`) and serialize the **same** `jsr_types::api` wire