diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index 7fa81312d1..fc5965c52b 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -754,6 +754,8 @@ def _split_identifier_for_json(self, identifier: str | Identifier) -> dict[str, return {"namespace": identifier_tuple[:-1], "name": identifier_tuple[-1]} def _init_sigv4(self, session: Session) -> None: + import base64 + import hashlib from urllib import parse import boto3 @@ -762,6 +764,22 @@ def _init_sigv4(self, session: Session) -> None: from requests import PreparedRequest from requests.adapters import HTTPAdapter + class _IcebergSigV4Auth(SigV4Auth): + def canonical_request(self, request: AWSRequest) -> str: + # Override forces hex payload hash in the canonical request even when + # x-amz-content-sha256 header is base64 (see body-hash block below). + # Mirrors botocore <=1.42.x SigV4Auth.canonical_request layout: + # https://github.com/boto/botocore/blob/1.42.85/botocore/auth.py#L622-L637 + cr = [request.method.upper()] + path = self._normalize_url_path(parse.urlsplit(request.url).path) + cr.append(path) + cr.append(self.canonical_query_string(request)) + headers_to_sign = self.headers_to_sign(request) + cr.append(self.canonical_headers(headers_to_sign) + "\n") + cr.append(self.signed_headers(headers_to_sign)) + cr.append(self.payload(request)) + return "\n".join(cr) + class SigV4Adapter(HTTPAdapter): def __init__(self, **properties: str): self._properties = properties @@ -788,17 +806,36 @@ def add_headers(self, request: PreparedRequest, **kwargs: Any) -> None: # pylin # remove the connection header as it will be updated after signing if "connection" in request.headers: del request.headers["connection"] - # For empty bodies, explicitly set the content hash header to the SHA256 of an empty string - if not request.body: - request.headers["x-amz-content-sha256"] = EMPTY_BODY_SHA256 + + # Match Iceberg Java's AWS SDK v2 flexible-checksum signing: + # x-amz-content-sha256 header is base64 for non-empty bodies, hex for empty. + # The SigV4 canonical request still uses hex (enforced in _IcebergSigV4Auth above). + # Ref: https://github.com/apache/iceberg/blob/main/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java + if request.body: + if isinstance(request.body, str): + body_bytes = request.body.encode("utf-8") + elif isinstance(request.body, (bytes, bytearray)): + body_bytes = request.body + else: + raise TypeError( + f"Unsupported request body type for SigV4 signing: " + f"{type(request.body).__name__}; expected str or bytes." + ) + content_sha256_header = base64.b64encode(hashlib.sha256(body_bytes).digest()).decode() + else: + content_sha256_header = EMPTY_BODY_SHA256 + + signing_headers = dict(request.headers) + signing_headers["x-amz-content-sha256"] = content_sha256_header aws_request = AWSRequest( - method=request.method, url=url, params=params, data=request.body, headers=dict(request.headers) + method=request.method, url=url, params=params, data=request.body, headers=signing_headers ) - SigV4Auth(credentials, service, region).add_auth(aws_request) - original_header = request.headers - signed_headers = aws_request.headers + _IcebergSigV4Auth(credentials, service, region).add_auth(aws_request) + + original_header = dict(request.headers) + signed_headers = dict(aws_request.headers) relocated_headers = {} # relocate headers if there is a conflict with signed headers diff --git a/pyproject.toml b/pyproject.toml index 96118f8451..4293e057a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,7 +92,7 @@ sql-postgres = [ ] sql-sqlite = ["sqlalchemy>=2.0.18,<3"] gcsfs = ["gcsfs>=2023.1.0"] -rest-sigv4 = ["boto3>=1.24.59"] +rest-sigv4 = ["boto3>=1.24.59", "botocore<2"] hf = ["huggingface-hub>=0.24.0"] pyiceberg-core = ["pyiceberg-core>=0.5.1,<0.10.0"] datafusion = ["datafusion>=52,<53"] diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 2adfe9f06e..0e5404029a 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -18,6 +18,7 @@ from __future__ import annotations import base64 +import hashlib import os from collections.abc import Callable from typing import Any, cast @@ -514,9 +515,16 @@ def test_sigv4_sign_request_without_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256") + auth_header = prepared.headers["Authorization"] + assert auth_header.startswith("AWS4-HMAC-SHA256 Credential=") assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 + # Verify the signature format: Credential, SignedHeaders, Signature + assert "Credential=" in auth_header + assert "SignedHeaders=" in auth_header + assert "Signature=" in auth_header + # x-amz-content-sha256 should be in signed headers + assert "x-amz-content-sha256" in auth_header def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: @@ -545,9 +553,188 @@ def test_sigv4_sign_request_with_body(rest_mock: Mocker) -> None: assert isinstance(adapter, HTTPAdapter) adapter.add_headers(prepared) - assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256") + auth_header = prepared.headers["Authorization"] + assert auth_header.startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in auth_header + # Conflicting Authorization header is relocated assert prepared.headers["Original-Authorization"] == f"Bearer {existing_token}" - assert prepared.headers.get("x-amz-content-sha256") != EMPTY_BODY_SHA256 + # Non-empty body should have base64-encoded SHA256 + content_sha256 = prepared.headers["x-amz-content-sha256"] + assert prepared.body is not None + body_bytes = prepared.body.encode("utf-8") if isinstance(prepared.body, str) else prepared.body + expected_sha256 = base64.b64encode(hashlib.sha256(body_bytes).digest()).decode() + assert content_sha256 == expected_sha256 + # x-amz-content-sha256 should be in signed headers + assert "x-amz-content-sha256" in auth_header + + +def test_sigv4_content_sha256_with_bytes_body(rest_mock: Mocker) -> None: + existing_token = "existing_token" + + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "token": existing_token, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + body_content = b'{"namespace": "test_namespace"}' + prepared = catalog._session.prepare_request( + Request( + "POST", + f"{TEST_URI}v1/namespaces", + data=body_content, + ) + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + adapter.add_headers(prepared) + + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert "SignedHeaders=" in prepared.headers["Authorization"] + content_sha256 = prepared.headers["x-amz-content-sha256"] + expected_sha256 = base64.b64encode(hashlib.sha256(body_content).digest()).decode() + assert content_sha256 == expected_sha256 + + +def test_sigv4_conflicting_sigv4_headers(rest_mock: Mocker) -> None: + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + prepared = catalog._session.prepare_request(Request("GET", f"{TEST_URI}v1/config")) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + # Inject conflicting SigV4 headers before signing + prepared.headers["x-amz-content-sha256"] = "fake" + prepared.headers["X-Amz-Date"] = "fake" + + adapter.add_headers(prepared) + + # Matching Java SDK: conflicting headers are relocated with "Original-" prefix + assert prepared.headers.get("Original-x-amz-content-sha256") == "fake" + assert prepared.headers.get("Original-X-Amz-Date") == "fake" + # SigV4 headers are set correctly after signing + assert prepared.headers["Authorization"].startswith("AWS4-HMAC-SHA256 Credential=") + assert prepared.headers["x-amz-content-sha256"] == EMPTY_BODY_SHA256 + assert "X-Amz-Date" in prepared.headers + + +def test_sigv4_canonical_request_uses_hex_payload(rest_mock: Mocker) -> None: + """Verify that the canonical request uses hex-encoded payload hash, not the base64 header value.""" + from unittest.mock import patch + + from botocore.auth import SigV4Auth + + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "token": "token", + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-west-2", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + + body_content = b'{"namespace": "test"}' + prepared = catalog._session.prepare_request( + Request( + "POST", + f"{TEST_URI}v1/namespaces", + data=body_content, + ) + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + # Capture the canonical request string during signing + captured_canonical = [] + original_add_auth = SigV4Auth.add_auth + + def capturing_add_auth(self: Any, request: Any) -> None: + captured_canonical.append(self.canonical_request(request)) + original_add_auth(self, request) + + with patch.object(SigV4Auth, "add_auth", capturing_add_auth): + adapter.add_headers(prepared) + + assert len(captured_canonical) == 1 + canonical_lines = captured_canonical[0].split("\n") + # Last line of canonical request is the payload hash + payload_hash = canonical_lines[-1] + # Must be hex-encoded (64 hex chars), not base64 + assert len(payload_hash) == 64 + assert payload_hash == hashlib.sha256(body_content).hexdigest() + # Meanwhile the header is base64-encoded + assert prepared.headers["x-amz-content-sha256"] == base64.b64encode(hashlib.sha256(body_content).digest()).decode() + + +def test_sigv4_content_sha256_matches_iceberg_java_reference(rest_mock: Mocker) -> None: + """Pin byte-for-byte equivalence with Iceberg Java TestRESTSigV4AuthSession (L121, L177).""" + java_reference_body = b'{"namespace":["ns"],"properties":{}}' + java_reference_base64 = "yc5oAKPWjHY4sW8XQq0l/3aNrrXJKBycVFNnDEGMfww=" + java_reference_empty_hex = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" + + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-east-1", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + # Non-empty body: must match Java's base64 reference value exactly + prepared_with_body = catalog._session.prepare_request(Request("POST", f"{TEST_URI}v1/namespaces", data=java_reference_body)) + adapter.add_headers(prepared_with_body) + assert prepared_with_body.headers["x-amz-content-sha256"] == java_reference_base64 + + # Empty body: must match Java's hex reference value exactly + prepared_empty = catalog._session.prepare_request(Request("GET", f"{TEST_URI}v1/config")) + adapter.add_headers(prepared_empty) + assert prepared_empty.headers["x-amz-content-sha256"] == java_reference_empty_hex + + +def test_sigv4_unsupported_body_type_raises(rest_mock: Mocker) -> None: + """Unsupported body types (e.g. file-like) raise a clear error rather than crashing in hashlib.""" + catalog = RestCatalog( + "rest", + **{ + "uri": TEST_URI, + "rest.sigv4-enabled": "true", + "rest.signing-region": "us-east-1", + "client.access-key-id": "id", + "client.secret-access-key": "secret", + }, + ) + adapter = catalog._session.adapters[catalog.uri] + assert isinstance(adapter, HTTPAdapter) + + prepared = catalog._session.prepare_request(Request("POST", f"{TEST_URI}v1/namespaces")) + # Inject an unsupported body type (a list — not str/bytes) + prepared.body = ["not", "a", "valid", "body"] # type: ignore[assignment] + + with pytest.raises(TypeError, match="Unsupported request body type for SigV4 signing"): + adapter.add_headers(prepared) def test_sigv4_adapter_default_retry_config(rest_mock: Mocker) -> None: diff --git a/uv.lock b/uv.lock index abf58a41a5..fcccbc5e9a 100644 --- a/uv.lock +++ b/uv.lock @@ -4688,6 +4688,7 @@ ray = [ ] rest-sigv4 = [ { name = "boto3" }, + { name = "botocore" }, ] s3fs = [ { name = "s3fs" }, @@ -4754,6 +4755,7 @@ requires-dist = [ { name = "boto3", marker = "extra == 'dynamodb'", specifier = ">=1.24.59" }, { name = "boto3", marker = "extra == 'glue'", specifier = ">=1.24.59" }, { name = "boto3", marker = "extra == 'rest-sigv4'", specifier = ">=1.24.59" }, + { name = "botocore", marker = "extra == 'rest-sigv4'", specifier = "<2" }, { name = "cachetools", specifier = ">=5.5,<8.0" }, { name = "click", specifier = ">=7.1.1,<9.0.0" }, { name = "daft", marker = "extra == 'daft'", specifier = ">=0.7.10" },