From 93d77d38fd892161de1b34167426b966bc4ae2d4 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Sun, 19 Apr 2026 01:07:45 -0700 Subject: [PATCH 01/24] Implementation Co-Authored-By: Claude Opus 4.6 (1M context) --- pyiceberg/catalog/__init__.py | 83 +++++ pyiceberg/catalog/noop.py | 23 ++ pyiceberg/catalog/rest/__init__.py | 81 ++++- pyiceberg/partitioning.py | 67 ++++ pyiceberg/schema.py | 52 +++ pyiceberg/table/__init__.py | 122 +++++++ tests/catalog/test_rest.py | 479 ++++++++++++++++++++++++- tests/integration/test_rest_catalog.py | 478 ++++++++++++++++++++++++ tests/table/test_partitioning.py | 54 ++- tests/test_schema.py | 88 +++++ 10 files changed, 1523 insertions(+), 4 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 5797e1f050..be782576dd 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -47,6 +47,7 @@ DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, CommitTableResponse, CreateTableTransaction, + ReplaceTableTransaction, StagedTable, Table, TableProperties, @@ -442,6 +443,66 @@ def create_table_if_not_exists( except TableAlreadyExistsError: return self.load_table(identifier) + @abstractmethod + def replace_table( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> Table: + """Atomically replace a table's schema, spec, sort order, location, and properties. + + The table UUID and history (snapshots, schemas, specs, sort orders) are preserved. + The current snapshot is cleared (main branch ref is removed). + + Args: + identifier (str | Identifier): Table identifier. + schema (Schema): New table schema. + location (str | None): New table location. Defaults to the existing location. + partition_spec (PartitionSpec): New partition spec. + sort_order (SortOrder): New sort order. + properties (Properties): New table properties (merged with existing). + + Returns: + Table: the replaced table instance. + + Raises: + NoSuchTableError: If the table does not exist. + """ + + @abstractmethod + def replace_table_transaction( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> ReplaceTableTransaction: + """Create a ReplaceTableTransaction. + + The transaction can be used to stage additional changes (schema evolution, + partition evolution, etc.) before committing. + + Args: + identifier (str | Identifier): Table identifier. + schema (Schema): New table schema. + location (str | None): New table location. Defaults to the existing location. + partition_spec (PartitionSpec): New partition spec. + sort_order (SortOrder): New sort order. + properties (Properties): New table properties (merged with existing). + + Returns: + ReplaceTableTransaction: A transaction for the replace operation. + + Raises: + NoSuchTableError: If the table does not exist. + """ + @abstractmethod def load_table(self, identifier: str | Identifier) -> Table: """Load the table's metadata and returns the table instance. @@ -888,6 +949,28 @@ def create_table_transaction( self._create_staged_table(identifier, schema, location, partition_spec, sort_order, properties) ) + def replace_table( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> Table: + raise NotImplementedError("replace_table is not yet supported for this catalog type") + + def replace_table_transaction( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> ReplaceTableTransaction: + raise NotImplementedError("replace_table_transaction is not yet supported for this catalog type") + def table_exists(self, identifier: str | Identifier) -> bool: try: self.load_table(identifier) diff --git a/pyiceberg/catalog/noop.py b/pyiceberg/catalog/noop.py index c5399ad62e..b07e2cc824 100644 --- a/pyiceberg/catalog/noop.py +++ b/pyiceberg/catalog/noop.py @@ -26,6 +26,7 @@ from pyiceberg.table import ( CommitTableResponse, CreateTableTransaction, + ReplaceTableTransaction, Table, ) from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder @@ -64,6 +65,28 @@ def create_table_transaction( ) -> CreateTableTransaction: raise NotImplementedError + def replace_table( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> Table: + raise NotImplementedError + + def replace_table_transaction( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> ReplaceTableTransaction: + raise NotImplementedError + def load_table(self, identifier: str | Identifier) -> Table: raise NotImplementedError diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index d06fd3885b..94c3a92f6b 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -67,13 +67,19 @@ FileIO, load_file_io, ) -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec, assign_fresh_partition_spec_ids -from pyiceberg.schema import Schema, assign_fresh_schema_ids +from pyiceberg.partitioning import ( + UNPARTITIONED_PARTITION_SPEC, + PartitionSpec, + assign_fresh_partition_spec_ids, + assign_fresh_partition_spec_ids_for_replace, +) +from pyiceberg.schema import Schema, assign_fresh_schema_ids, assign_fresh_schema_ids_for_replace from pyiceberg.table import ( CommitTableRequest, CommitTableResponse, CreateTableTransaction, FileScanTask, + ReplaceTableTransaction, StagedTable, Table, TableIdentifier, @@ -937,6 +943,77 @@ def create_table_transaction( staged_table = self._response_to_staged_table(self.identifier_to_tuple(identifier), table_response) return CreateTableTransaction(staged_table) + def replace_table( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> Table: + txn = self.replace_table_transaction( + identifier=identifier, + schema=schema, + location=location, + partition_spec=partition_spec, + sort_order=sort_order, + properties=properties, + ) + return txn.commit_transaction() + + @retry(**_RETRY_ARGS) + def replace_table_transaction( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> ReplaceTableTransaction: + existing_table = self.load_table(identifier) + existing_metadata = existing_table.metadata + + iceberg_schema = self._convert_schema_if_needed( + schema, + int(properties.get(TableProperties.FORMAT_VERSION, existing_metadata.format_version)), # type: ignore + ) + + # Assign fresh schema IDs, reusing IDs from the existing schema by field name + fresh_schema, _ = assign_fresh_schema_ids_for_replace( + iceberg_schema, existing_metadata.schema(), existing_metadata.last_column_id + ) + + # Assign fresh partition spec IDs, reusing IDs from existing specs + fresh_partition_spec, _ = assign_fresh_partition_spec_ids_for_replace( + partition_spec, iceberg_schema, fresh_schema, existing_metadata.partition_specs, existing_metadata.last_partition_id + ) + + # Assign fresh sort order IDs + fresh_sort_order = assign_fresh_sort_order_ids(sort_order, iceberg_schema, fresh_schema) + + # Use existing location if not specified + resolved_location = location.rstrip("/") if location else existing_metadata.location + + # Create a StagedTable from the existing table + staged_table = StagedTable( + identifier=existing_table.name(), + metadata=existing_metadata, + metadata_location=existing_table.metadata_location, + io=existing_table.io, + catalog=self, + ) + + return ReplaceTableTransaction( + table=staged_table, + new_schema=fresh_schema, + new_spec=fresh_partition_spec, + new_sort_order=fresh_sort_order, + new_location=resolved_location, + new_properties=properties, + ) + @retry(**_RETRY_ARGS) def create_view( self, diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 3de185d886..3861e62c30 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -335,6 +335,73 @@ def assign_fresh_partition_spec_ids(spec: PartitionSpec, old_schema: Schema, fre return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID) +def assign_fresh_partition_spec_ids_for_replace( + spec: PartitionSpec, + old_schema: Schema, + fresh_schema: Schema, + existing_specs: list[PartitionSpec], + last_partition_id: int | None, +) -> tuple[PartitionSpec, int]: + """Assign partition field IDs for a replace operation, reusing IDs from existing specs. + + For each partition field, if a field with the same (source_id, transform) pair exists in + any of the existing specs, its partition field ID is reused; otherwise a fresh ID is + allocated starting from last_partition_id + 1. + + Args: + spec: The new partition spec to assign IDs to. + old_schema: The schema that the new spec's source_ids reference. + fresh_schema: The schema with freshly assigned field IDs. + existing_specs: All partition specs from the existing table metadata. + last_partition_id: The current table's last_partition_id. + + Returns: + A tuple of (fresh_spec, new_last_partition_id). + """ + effective_last_partition_id = last_partition_id if last_partition_id is not None else PARTITION_FIELD_ID_START - 1 + + # Build (source_id, transform) → partition_field_id mapping from all existing specs + # Use max() for dedup when the same (source_id, transform) appears in multiple specs + transform_to_field_id: dict[tuple[int, str], int] = {} + for existing_spec in existing_specs: + for field in existing_spec.fields: + key = (field.source_id, str(field.transform)) + if key not in transform_to_field_id or field.field_id > transform_to_field_id[key]: + transform_to_field_id[key] = field.field_id + + next_id = effective_last_partition_id + partition_fields = [] + for field in spec.fields: + original_column_name = old_schema.find_column_name(field.source_id) + if original_column_name is None: + raise ValueError(f"Could not find in old schema: {field}") + fresh_field = fresh_schema.find_field(original_column_name) + if fresh_field is None: + raise ValueError(f"Could not find field in fresh schema: {original_column_name}") + + validate_partition_name(field.name, field.transform, fresh_field.field_id, fresh_schema, set()) + + key = (fresh_field.field_id, str(field.transform)) + if key in transform_to_field_id: + partition_field_id = transform_to_field_id[key] + else: + next_id += 1 + partition_field_id = next_id + transform_to_field_id[key] = partition_field_id + + partition_fields.append( + PartitionField( + name=field.name, + source_id=fresh_field.field_id, + field_id=partition_field_id, + transform=field.transform, + ) + ) + + new_last_partition_id = max(next_id, effective_last_partition_id) + return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID), new_last_partition_id + + T = TypeVar("T") diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index fd60eb8f94..9d60787978 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1380,6 +1380,58 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType: return primitive +class _SetFreshIDsForReplace(_SetFreshIDs): + """Assign fresh IDs for a replace operation, reusing IDs from the base schema by field name. + + For each field in the new schema, if a field with the same full name exists in the + base schema, its ID is reused; otherwise a fresh ID is allocated starting from + last_column_id + 1. + """ + + def __init__(self, old_id_to_base_id: dict[int, int], starting_id: int) -> None: + self.old_id_to_new_id: dict[int, int] = {} + self._old_id_to_base_id = old_id_to_base_id + counter = itertools.count(starting_id + 1) + self.next_id_func = lambda: next(counter) + + def _get_and_increment(self, current_id: int) -> int: + if current_id in self._old_id_to_base_id: + new_id = self._old_id_to_base_id[current_id] + else: + new_id = self.next_id_func() + self.old_id_to_new_id[current_id] = new_id + return new_id + + +def assign_fresh_schema_ids_for_replace(schema: Schema, base_schema: Schema, last_column_id: int) -> tuple[Schema, int]: + """Assign fresh IDs to a schema for a replace operation, reusing IDs from the base schema. + + For each field in the new schema, if a field with the same full path name exists + in the base schema, its ID is reused. New fields get IDs starting from + last_column_id + 1. + + Args: + schema: The new schema to assign IDs to. + base_schema: The existing table's current schema (IDs are reused from here by name). + last_column_id: The current table's last_column_id (new IDs start above this). + + Returns: + A tuple of (fresh_schema, new_last_column_id). + """ + base_name_to_id = index_by_name(base_schema) + new_id_to_name = index_name_by_id(schema) + + old_id_to_base_id: dict[int, int] = {} + for old_id, name in new_id_to_name.items(): + if name in base_name_to_id: + old_id_to_base_id[old_id] = base_name_to_id[name] + + visitor = _SetFreshIDsForReplace(old_id_to_base_id, last_column_id) + fresh_schema = pre_order_visit(schema, visitor) + new_last_column_id = max(fresh_schema.highest_field_id, last_column_id) + return fresh_schema, new_last_column_id + + # Implementation copied from Apache Iceberg repo. def make_compatible_name(name: str) -> str: """Make a field name compatible with Avro specification. diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index bb8765b651..ec2f54ad57 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -62,6 +62,7 @@ AssertTableUUID, AssignUUIDUpdate, RemovePropertiesUpdate, + RemoveSnapshotRefUpdate, SetCurrentSchemaUpdate, SetDefaultSortOrderUpdate, SetDefaultSpecUpdate, @@ -1009,6 +1010,127 @@ def commit_transaction(self) -> Table: return self._table +class ReplaceTableTransaction(Transaction): + """A transaction that replaces an existing table's schema, spec, sort order, location, and properties. + + The existing table UUID, snapshots, snapshot log, metadata log, and history are preserved. + The "main" branch ref is removed (current-snapshot-id set to -1), and new + schema/spec/sort-order/location/properties are applied. + """ + + def _initial_changes( + self, + table_metadata: TableMetadata, + new_schema: Schema, + new_spec: PartitionSpec, + new_sort_order: SortOrder, + new_location: str, + new_properties: Properties, + ) -> None: + """Set the initial changes that transform the existing table into the replacement.""" + # Remove the main branch ref to clear the current snapshot + self._updates += (RemoveSnapshotRefUpdate(ref_name=MAIN_BRANCH),) + + # Reuse an existing schema if structurally identical (ignoring schema_id). + existing_schema_id = self._find_matching_schema_id(table_metadata, new_schema) + if existing_schema_id is not None: + if existing_schema_id != table_metadata.current_schema_id: + self._updates += (SetCurrentSchemaUpdate(schema_id=existing_schema_id),) + else: + self._updates += ( + AddSchemaUpdate(schema_=new_schema), + SetCurrentSchemaUpdate(schema_id=-1), + ) + + # Only emit AddPartitionSpecUpdate + SetDefaultSpecUpdate(-1) when the spec + # is new. If an identical spec already exists, use its concrete ID. + effective_spec = UNPARTITIONED_PARTITION_SPEC if new_spec.is_unpartitioned() else new_spec + existing_spec_id = self._find_matching_spec_id(table_metadata, effective_spec) + if existing_spec_id is not None: + if existing_spec_id != table_metadata.default_spec_id: + self._updates += (SetDefaultSpecUpdate(spec_id=existing_spec_id),) + else: + self._updates += ( + AddPartitionSpecUpdate(spec=effective_spec), + SetDefaultSpecUpdate(spec_id=-1), + ) + + # Set the new sort order (same logic as spec). + effective_sort_order = UNSORTED_SORT_ORDER if new_sort_order.is_unsorted else new_sort_order + existing_order_id = self._find_matching_sort_order_id(table_metadata, effective_sort_order) + if existing_order_id is not None: + if existing_order_id != table_metadata.default_sort_order_id: + self._updates += (SetDefaultSortOrderUpdate(sort_order_id=existing_order_id),) + else: + self._updates += ( + AddSortOrderUpdate(sort_order=effective_sort_order), + SetDefaultSortOrderUpdate(sort_order_id=-1), + ) + + # Set location if changed + if new_location != table_metadata.location: + self._updates += (SetLocationUpdate(location=new_location),) + + # Merge properties (SetPropertiesUpdate merges onto existing properties) + if new_properties: + self._updates += (SetPropertiesUpdate(updates=new_properties),) + + @staticmethod + def _find_matching_schema_id(table_metadata: TableMetadata, schema: Schema) -> int | None: + """Find an existing schema structurally equal to the given one, returning its schema_id or None.""" + for existing in table_metadata.schemas: + if existing == schema: + return existing.schema_id + return None + + @staticmethod + def _find_matching_spec_id(table_metadata: TableMetadata, spec: PartitionSpec) -> int | None: + """Find an existing partition spec with the same fields, returning its spec_id or None.""" + for existing in table_metadata.partition_specs: + if existing.fields == spec.fields: + return existing.spec_id + return None + + @staticmethod + def _find_matching_sort_order_id(table_metadata: TableMetadata, sort_order: SortOrder) -> int | None: + """Find an existing sort order with the same fields, returning its order_id or None.""" + for existing in table_metadata.sort_orders: + if existing.fields == sort_order.fields: + return existing.order_id + return None + + def __init__( + self, + table: StagedTable, + new_schema: Schema, + new_spec: PartitionSpec, + new_sort_order: SortOrder, + new_location: str, + new_properties: Properties, + ) -> None: + super().__init__(table, autocommit=False) + self._initial_changes(table.metadata, new_schema, new_spec, new_sort_order, new_location, new_properties) + + def commit_transaction(self) -> Table: + """Commit the replace changes to the catalog. + + Uses AssertTableUUID as the only requirement. + + Returns: + The table with the updates applied. + """ + if len(self._updates) > 0: + self._table._do_commit( # pylint: disable=W0212 + updates=self._updates, + requirements=(AssertTableUUID(uuid=self._table.metadata.table_uuid),), + ) + + self._updates = () + self._requirements = () + + return self._table + + class Namespace(IcebergRootModel[list[str]]): """Reference to one or more levels of a namespace.""" diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index aa9a467381..a92748c6bc 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -64,7 +64,7 @@ from pyiceberg.table.sorting import SortField, SortOrder from pyiceberg.transforms import IdentityTransform, TruncateTransform from pyiceberg.typedef import RecursiveDict -from pyiceberg.types import StringType +from pyiceberg.types import BooleanType, IntegerType, NestedField, StringType from pyiceberg.utils.config import Config from pyiceberg.view import View from pyiceberg.view.metadata import ViewMetadata, ViewVersion @@ -2654,3 +2654,480 @@ def test_load_table_without_storage_credentials( ) assert actual.metadata.model_dump() == expected.metadata.model_dump() assert actual == expected + + +def test_replace_table_transaction_200( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replace_table_transaction loads the existing table, then commits with AssertTableUUID.""" + expected_table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = expected_table_uuid + + # Mock load_table (GET) to return existing table with snapshot + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + # Mock commit (POST) for the replace + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + # Replace with a new schema (3 fields: id stays, data stays, new_col is new) + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + NestedField(field_id=3, name="new_col", field_type=BooleanType(), required=False), + ) + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=new_schema, + ) + txn.commit_transaction() + + actual_request = rest_mock.last_request.json() + + # Verify requirements: only AssertTableUUID + assert actual_request["requirements"] == [ + {"type": "assert-table-uuid", "uuid": expected_table_uuid}, + ] + + # Verify updates sequence. Since the existing table already has the same + # unpartitioned spec and unsorted sort order, those updates are skipped. + updates = actual_request["updates"] + actions = [u["action"] for u in updates] + assert actions == [ + "remove-snapshot-ref", + "add-schema", + "set-current-schema", + ] + + # Verify remove-snapshot-ref targets "main" + assert updates[0] == {"action": "remove-snapshot-ref", "ref-name": "main"} + + # Verify schema has reused field IDs (id=1, data=2 reused from existing schema) + schema_fields = updates[1]["schema"]["fields"] + assert schema_fields[0]["id"] == 1 + assert schema_fields[0]["name"] == "id" + assert schema_fields[1]["id"] == 2 + assert schema_fields[1]["name"] == "data" + # new_col gets a fresh ID above last_column_id (which is 2), so it gets 3 + assert schema_fields[2]["id"] == 3 + assert schema_fields[2]["name"] == "new_col" + + # set-current-schema uses -1 (meaning last added) + assert updates[2] == {"action": "set-current-schema", "schema-id": -1} + + +def test_replace_table_transaction_preserves_uuid( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replace preserves the table UUID.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ), + ) + + # The staged table should have the same UUID as the existing table + assert str(txn.table_metadata.table_uuid) == table_uuid + + result = txn.commit_transaction() + # After commit, the table should still have the same UUID + assert str(result.metadata.table_uuid) == table_uuid + + +def test_replace_table_transaction_with_new_location( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replace_table_transaction with a new location includes SetLocationUpdate.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + ), + location="s3://new-warehouse/database/table", + ) + txn.commit_transaction() + + updates = rest_mock.last_request.json()["updates"] + actions = [u["action"] for u in updates] + + # Should include set-location since the location changed + assert "set-location" in actions + set_location_update = next(u for u in updates if u["action"] == "set-location") + assert set_location_update["location"] == "s3://new-warehouse/database/table" + + +def test_replace_table_transaction_with_properties( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replace merges properties via SetPropertiesUpdate.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + ), + properties={"new-prop": "new-value"}, + ) + txn.commit_transaction() + + updates = rest_mock.last_request.json()["updates"] + actions = [u["action"] for u in updates] + + assert "set-properties" in actions + set_props = next(u for u in updates if u["action"] == "set-properties") + # SetPropertiesUpdate sends the user properties; the server merges onto existing + assert set_props["updates"] == {"new-prop": "new-value"} + + +def test_replace_table_transaction_with_partition_spec( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replace_table_transaction with a new partition spec includes AddPartitionSpecUpdate.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ), + partition_spec=PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=3), name="id_trunc"), spec_id=1 + ), + ) + txn.commit_transaction() + + updates = rest_mock.last_request.json()["updates"] + add_spec = next(u for u in updates if u["action"] == "add-spec") + spec_fields = add_spec["spec"]["fields"] + assert len(spec_fields) == 1 + assert spec_fields[0]["source-id"] == 1 # id field + assert spec_fields[0]["transform"] == "truncate[3]" + assert spec_fields[0]["name"] == "id_trunc" + + # set-default-spec should also be present, pointing to the newly added spec + actions = [u["action"] for u in updates] + assert "set-default-spec" in actions + set_default_spec = next(u for u in updates if u["action"] == "set-default-spec") + assert set_default_spec["spec-id"] == -1 + + +def test_replace_table_404( + rest_mock: Mocker, +) -> None: + """Test that replace_table raises NoSuchTableError when the table doesn't exist.""" + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/nonexistent", + json={ + "error": { + "message": "Table does not exist: fokko.nonexistent", + "type": "NoSuchTableException", + "code": 404, + } + }, + status_code=404, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + with pytest.raises(NoSuchTableError): + catalog.replace_table( + identifier=("fokko", "nonexistent"), + schema=Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + ), + ) + + +def test_replace_table_200( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replace_table commits immediately and returns the table.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + result = catalog.replace_table( + identifier=("fokko", "fokko2"), + schema=Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + ), + ) + + assert isinstance(result, Table) + # The commit should have used assert-table-uuid + actual_request = rest_mock.last_request.json() + assert actual_request["requirements"] == [ + {"type": "assert-table-uuid", "uuid": table_uuid}, + ] + + +def test_replace_table_transaction_same_location_no_set_location( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that when location is not changed, SetLocationUpdate is NOT included.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + # Replace with no location specified - should use existing location + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + ), + ) + txn.commit_transaction() + + updates = rest_mock.last_request.json()["updates"] + actions = [u["action"] for u in updates] + # set-location should NOT be present since location didn't change + assert "set-location" not in actions + + +def test_replace_table_transaction_same_schema_skips_add_schema( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replacing with the same schema skips add-schema and set-current-schema.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + # Use the exact same schema as the existing table (id: int, data: string) + same_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=same_schema, + ) + txn.commit_transaction() + + updates = rest_mock.last_request.json()["updates"] + actions = [u["action"] for u in updates] + + # Since the schema is unchanged, add-schema and set-current-schema should be skipped + assert "add-schema" not in actions + assert "set-current-schema" not in actions + + # The only update should be remove-snapshot-ref + assert actions == ["remove-snapshot-ref"] + + +def test_replace_table_transaction_different_schema_adds_schema( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replacing with a genuinely new schema includes add-schema and set-current-schema.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + # A new schema with a different field (new_col instead of data) + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=3, name="new_col", field_type=BooleanType(), required=False), + ) + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=new_schema, + ) + txn.commit_transaction() + + updates = rest_mock.last_request.json()["updates"] + actions = [u["action"] for u in updates] + + # Since the schema is different, add-schema and set-current-schema must be present + assert "add-schema" in actions + assert "set-current-schema" in actions + + # set-current-schema should reference -1 (the last added schema) + set_schema = next(u for u in updates if u["action"] == "set-current-schema") + assert set_schema["schema-id"] == -1 + + +def test_replace_table_transaction_with_sort_order( + rest_mock: Mocker, + example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], +) -> None: + """Test that replacing with a custom sort order includes add-sort-order and set-default-sort-order.""" + table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] + example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid + + rest_mock.get( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_with_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + rest_mock.post( + f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", + json=example_table_metadata_no_snapshot_v1_rest_json, + status_code=200, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + + txn = catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ), + sort_order=SortOrder(SortField(source_id=1, transform=IdentityTransform())), + ) + txn.commit_transaction() + + updates = rest_mock.last_request.json()["updates"] + actions = [u["action"] for u in updates] + + # Should include add-sort-order and set-default-sort-order + assert "add-sort-order" in actions + assert "set-default-sort-order" in actions diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py index 18aa943175..637fedcaf6 100644 --- a/tests/integration/test_rest_catalog.py +++ b/tests/integration/test_rest_catalog.py @@ -16,10 +16,16 @@ # under the License. # pylint:disable=redefined-outer-name +import pyarrow as pa import pytest from pytest_lazy_fixtures import lf +from pyiceberg.catalog import Catalog from pyiceberg.catalog.rest import RestCatalog +from pyiceberg.exceptions import NoSuchTableError +from pyiceberg.io.pyarrow import _dataframe_to_data_files +from pyiceberg.schema import Schema +from pyiceberg.types import BooleanType, IntegerType, LongType, NestedField, StringType TEST_NAMESPACE_IDENTIFIER = "TEST NS" @@ -62,3 +68,475 @@ def test_create_namespace_if_already_existing(catalog: RestCatalog) -> None: catalog.create_namespace_if_not_exists(TEST_NAMESPACE_IDENTIFIER) assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER) + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_replace_table_transaction(catalog: Catalog, format_version: int) -> None: + identifier = f"default.test_replace_table_txn_{catalog.name}_{format_version}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + # Create a table with initial schema and write some data + original_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + original = catalog.create_table(identifier, schema=original_schema, properties={"format-version": str(format_version)}) + original_uuid = original.metadata.table_uuid + + pa_table = pa.Table.from_pydict( + {"id": [1, 2, 3], "data": ["a", "b", "c"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + + with original.transaction() as txn: + with txn.update_snapshot().fast_append() as snapshot_update: + for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table, io=original.io): + snapshot_update.append_data_file(data_file) + + original.refresh() + current_snapshot = original.current_snapshot() + assert current_snapshot is not None + original_snapshot_id = current_snapshot.snapshot_id + + # Replace with a new schema + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="name", field_type=StringType(), required=False), + NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), + ) + + with catalog.replace_table_transaction( + identifier, schema=new_schema, properties={"format-version": str(format_version)} + ) as txn: + pass # just replace the schema, no data + + table = catalog.load_table(identifier) + + # UUID must be preserved + assert table.metadata.table_uuid == original_uuid + + # Current snapshot should be cleared (main ref removed) + assert table.current_snapshot() is None + + # Old snapshots should still exist in the metadata + assert len(table.metadata.snapshots) >= 1 + assert any(s.snapshot_id == original_snapshot_id for s in table.metadata.snapshots) + + # New schema should be current, with field IDs reused for "id" (should still be 1) + current_schema = table.schema() + id_field = current_schema.find_field("id") + assert id_field.field_id == 1 # reused from original schema + + name_field = current_schema.find_field("name") + assert name_field is not None + assert name_field.field_id >= 3 # "name" is new, must not reuse "data"'s ID (2) + + active_field = current_schema.find_field("active") + assert active_field is not None + assert active_field.field_id >= 4 # "active" is new + + # last_column_id must account for the new fields + assert table.metadata.last_column_id >= 4 + + # Old schemas should still exist — exactly 2 (original + replacement) + assert len(table.metadata.schemas) == 2 + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table(catalog: Catalog) -> None: + identifier = f"default.test_replace_table_{catalog.name}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + original_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + original = catalog.create_table(identifier, schema=original_schema) + original_uuid = original.metadata.table_uuid + + new_schema = Schema( + NestedField(field_id=1, name="x", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="y", field_type=IntegerType(), required=False), + ) + + result = catalog.replace_table(identifier, schema=new_schema) + + # UUID preserved + assert result.metadata.table_uuid == original_uuid + # New schema applied — "x" and "y" are entirely new names, so they get fresh IDs >= 3 + x_field = result.schema().find_field("x") + y_field = result.schema().find_field("y") + assert x_field is not None + assert y_field is not None + assert x_field.field_id >= 3 + assert y_field.field_id >= 4 + assert result.metadata.last_column_id >= 4 + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_not_found(catalog: Catalog) -> None: + with pytest.raises(NoSuchTableError): + catalog.replace_table( + "default.does_not_exist_for_replace", + schema=Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)), + ) + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_same_schema_no_duplication(catalog: Catalog) -> None: + """Replacing a table with the exact same schema should succeed without adding duplicates. + + The code detects that the schema already exists and skips AddSchemaUpdate, + matching Java's reuseOrCreateNewSchemaId behavior. + """ + identifier = f"default.test_replace_same_schema_{catalog.name}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + schema_a = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + catalog.create_table(identifier, schema=schema_a) + + # Replace with the SAME schema — should succeed without adding a duplicate + catalog.replace_table(identifier, schema=schema_a) + + table = catalog.load_table(identifier) + # Should still have exactly 1 schema (no duplicate added) + assert len(table.metadata.schemas) == 1 + # Current schema ID should remain 0 (the original) + assert table.metadata.current_schema_id == 0 + # Schema should be unchanged + assert len(table.schema().fields) == 2 + assert table.schema().find_field("id") is not None + assert table.schema().find_field("data") is not None + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_back_to_previous_schema(catalog: Catalog) -> None: + """Replacing A -> B -> A where A and B have disjoint fields. + + Since field IDs are reused from the current schema only (matching Java), + "data" gets a new field ID when replacing back from B (which doesn't have "data"). + The resulting schema is structurally different from the original, so a 3rd schema + is created. This matches Java's behavior. + """ + identifier = f"default.test_replace_back_to_prev_schema_{catalog.name}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + schema_a = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + catalog.create_table(identifier, schema=schema_a) + + # Step 2: Replace with schema B (disjoint fields: "name" and "active" instead of "data") + schema_b = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="name", field_type=StringType(), required=False), + NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), + ) + catalog.replace_table(identifier, schema=schema_b) + + table_after_b = catalog.load_table(identifier) + assert len(table_after_b.metadata.schemas) == 2 + + # Step 3: Replace BACK to schema A + catalog.replace_table(identifier, schema=schema_a) + + table = catalog.load_table(identifier) + # "data" gets a new field ID (not reused from historical schema A), so a 3rd schema is created + assert len(table.metadata.schemas) == 3 + # last_column_id must be monotonically non-decreasing + assert table.metadata.last_column_id >= table_after_b.metadata.last_column_id + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_last_column_id_monotonic(catalog: Catalog) -> None: + """last_column_id must never decrease, even when replacing with fewer columns.""" + identifier = f"default.test_replace_last_col_id_{catalog.name}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + # Create table with 5 columns + schema_5col = Schema( + NestedField(field_id=1, name="a", field_type=LongType(), required=False), + NestedField(field_id=2, name="b", field_type=StringType(), required=False), + NestedField(field_id=3, name="c", field_type=StringType(), required=False), + NestedField(field_id=4, name="d", field_type=StringType(), required=False), + NestedField(field_id=5, name="e", field_type=StringType(), required=False), + ) + catalog.create_table(identifier, schema=schema_5col) + + table = catalog.load_table(identifier) + initial_last_col_id = table.metadata.last_column_id + assert initial_last_col_id >= 5, f"Initial last_column_id should be >= 5, got {initial_last_col_id}" + + # Replace with only 2 columns (subset) + schema_2col = Schema( + NestedField(field_id=1, name="a", field_type=LongType(), required=False), + NestedField(field_id=2, name="b", field_type=StringType(), required=False), + ) + catalog.replace_table(identifier, schema=schema_2col) + + table = catalog.load_table(identifier) + after_shrink_last_col_id = table.metadata.last_column_id + # last_column_id must NOT decrease + assert after_shrink_last_col_id >= initial_last_col_id, ( + f"last_column_id decreased from {initial_last_col_id} to {after_shrink_last_col_id} " + f"after replacing with fewer columns. It must be monotonically non-decreasing." + ) + + # Replace with 3 columns (2 existing + 1 new) + schema_3col = Schema( + NestedField(field_id=1, name="a", field_type=LongType(), required=False), + NestedField(field_id=2, name="b", field_type=StringType(), required=False), + NestedField(field_id=3, name="f", field_type=BooleanType(), required=False), # new column + ) + catalog.replace_table(identifier, schema=schema_3col) + + table = catalog.load_table(identifier) + after_grow_last_col_id = table.metadata.last_column_id + # New column should get an ID > previous last_column_id, so last_column_id should grow + assert after_grow_last_col_id >= initial_last_col_id + 1, ( + f"last_column_id should be >= {initial_last_col_id + 1} after adding a new column, got {after_grow_last_col_id}" + ) + + # Verify the new column "f" got an ID > initial_last_col_id (not reusing a dropped column's ID) + f_field = table.schema().find_field("f") + assert f_field.field_id > initial_last_col_id, ( + f"New field 'f' got field_id={f_field.field_id}, but it should be > {initial_last_col_id} to maintain monotonicity" + ) + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_metadata_log_grows(catalog: Catalog) -> None: + """After replacing a table, the metadata_log should contain the pre-replace metadata.""" + identifier = f"default.test_replace_metadata_log_{catalog.name}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + schema_a = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + catalog.create_table(identifier, schema=schema_a) + + table_before = catalog.load_table(identifier) + metadata_log_before = len(table_before.metadata.metadata_log) + + # Replace with a different schema + schema_b = Schema( + NestedField(field_id=1, name="x", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="y", field_type=IntegerType(), required=False), + ) + catalog.replace_table(identifier, schema=schema_b) + + table_after = catalog.load_table(identifier) + metadata_log_after = len(table_after.metadata.metadata_log) + + # The metadata_log should have grown by at least 1 entry + assert metadata_log_after > metadata_log_before, ( + f"metadata_log did not grow after replace_table. " + f"Before: {metadata_log_before} entries, After: {metadata_log_after} entries. " + f"The pre-replace metadata location should have been appended." + ) + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_snapshot_preserved_after_replace(catalog: Catalog) -> None: + """Snapshots are preserved but current snapshot is cleared after replace.""" + identifier = f"default.test_replace_snapshot_preserved_{catalog.name}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + table = catalog.create_table(identifier, schema=schema) + + # Write data to create a snapshot + pa_table = pa.Table.from_pydict( + {"id": [1, 2], "data": ["a", "b"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + with table.transaction() as txn: + with txn.update_snapshot().fast_append() as snapshot_update: + for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table, io=table.io): + snapshot_update.append_data_file(data_file) + + table.refresh() + current_snapshot = table.current_snapshot() + assert current_snapshot is not None + original_snapshot_id = current_snapshot.snapshot_id + original_snapshot_log_len = len(table.metadata.snapshot_log) + + # Replace with new schema + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="value", field_type=StringType(), required=False), + ) + catalog.replace_table(identifier, schema=new_schema) + + replaced = catalog.load_table(identifier) + + # Current snapshot cleared (main ref removed) + assert replaced.current_snapshot() is None + + # Old snapshot still in metadata + assert any(s.snapshot_id == original_snapshot_id for s in replaced.metadata.snapshots) + + # Snapshot log preserved + assert len(replaced.metadata.snapshot_log) >= original_snapshot_log_len + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_with_partition_spec(catalog: Catalog) -> None: + """Replace table with a new partition spec preserves old spec in metadata.""" + from pyiceberg.partitioning import PartitionField, PartitionSpec + from pyiceberg.transforms import IdentityTransform + + identifier = f"default.test_replace_with_spec_{catalog.name}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + catalog.create_table(identifier, schema=schema) + + # Replace with a partition spec on "id" + new_spec = PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), + spec_id=0, + ) + catalog.replace_table(identifier, schema=schema, partition_spec=new_spec) + + table = catalog.load_table(identifier) + + # New spec is the default + current_spec = table.metadata.spec() + assert len(current_spec.fields) == 1 + assert current_spec.fields[0].name == "id" + + # Old unpartitioned spec still in metadata + assert len(table.metadata.partition_specs) >= 2 + + # last_partition_id should be correct + assert table.metadata.last_partition_id is not None and table.metadata.last_partition_id >= 1000 + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_sequential_replaces(catalog: Catalog) -> None: + """Multiple sequential replaces: schemas grow, last_column_id is monotonic, metadata_log grows.""" + identifier = f"default.test_replace_sequential_{catalog.name}" + try: + catalog.create_namespace("default") + except Exception: + pass + try: + catalog.drop_table(identifier) + except NoSuchTableError: + pass + + schema_a = Schema( + NestedField(field_id=1, name="a", field_type=LongType(), required=False), + NestedField(field_id=2, name="b", field_type=StringType(), required=False), + ) + catalog.create_table(identifier, schema=schema_a) + + table = catalog.load_table(identifier) + prev_last_col_id = table.metadata.last_column_id + prev_metadata_log_len = len(table.metadata.metadata_log) + + # Replace 1: A -> B (completely different fields) + schema_b = Schema( + NestedField(field_id=1, name="x", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="y", field_type=IntegerType(), required=False), + NestedField(field_id=3, name="z", field_type=BooleanType(), required=False), + ) + catalog.replace_table(identifier, schema=schema_b) + + table = catalog.load_table(identifier) + assert table.metadata.last_column_id >= prev_last_col_id + assert len(table.metadata.schemas) == 2 + assert len(table.metadata.metadata_log) > prev_metadata_log_len + prev_last_col_id = table.metadata.last_column_id + prev_metadata_log_len = len(table.metadata.metadata_log) + + # Replace 2: B -> C (again different) + schema_c = Schema( + NestedField(field_id=1, name="p", field_type=StringType(), required=False), + NestedField(field_id=2, name="q", field_type=LongType(), required=False), + ) + catalog.replace_table(identifier, schema=schema_c) + + table = catalog.load_table(identifier) + assert table.metadata.last_column_id >= prev_last_col_id + assert len(table.metadata.schemas) == 3 # A, B, C all have different fields + assert len(table.metadata.metadata_log) > prev_metadata_log_len diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index a27046ef30..e7a4c5fc00 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -22,7 +22,12 @@ import pytest from pyiceberg.exceptions import ValidationError -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec +from pyiceberg.partitioning import ( + UNPARTITIONED_PARTITION_SPEC, + PartitionField, + PartitionSpec, + assign_fresh_partition_spec_ids_for_replace, +) from pyiceberg.schema import Schema from pyiceberg.transforms import ( BucketTransform, @@ -298,3 +303,50 @@ def test_incompatible_transform_source_type() -> None: spec.check_compatible(schema) assert "Invalid source field foo with type int for transform: year" in str(exc.value) + + +def test_assign_fresh_partition_spec_ids_for_replace_reuses_ids() -> None: + """Test that partition field IDs are reused from existing specs.""" + old_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + fresh_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + existing_specs = [ + PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), + spec_id=0, + ) + ] + spec = PartitionSpec( + PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id"), + spec_id=0, + ) + fresh_spec, last_pid = assign_fresh_partition_spec_ids_for_replace(spec, old_schema, fresh_schema, existing_specs, 1000) + assert fresh_spec.fields[0].field_id == 1000 # reused from existing spec + assert last_pid == 1000 + + +def test_assign_fresh_partition_spec_ids_for_replace_new_field() -> None: + """Test that new partition fields get IDs above last_partition_id.""" + old_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + fresh_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + existing_specs = [ + PartitionSpec(spec_id=0) # unpartitioned + ] + spec = PartitionSpec( + PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id"), + spec_id=0, + ) + fresh_spec, last_pid = assign_fresh_partition_spec_ids_for_replace(spec, old_schema, fresh_schema, existing_specs, 999) + assert fresh_spec.fields[0].field_id == 1000 # new, above last_partition_id=999 + assert last_pid == 1000 diff --git a/tests/test_schema.py b/tests/test_schema.py index 93ddc16202..3937354a5d 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -26,6 +26,7 @@ Accessor, Schema, _check_schema_compatible, + assign_fresh_schema_ids_for_replace, build_position_accessors, index_by_id, index_by_name, @@ -1815,3 +1816,90 @@ def test_check_schema_compatible_optional_map_field_present() -> None: ) # Should not raise - schemas match _check_schema_compatible(requested_schema, provided_schema) + + +def test_assign_fresh_schema_ids_for_replace_reuses_ids() -> None: + """Test that field IDs are reused from the base schema by name.""" + base_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + new_schema = Schema( + NestedField(field_id=10, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=20, name="data", field_type=StringType(), required=False), + ) + fresh, last_col_id = assign_fresh_schema_ids_for_replace(new_schema, base_schema, 2) + assert fresh.fields[0].field_id == 1 # reused from base + assert fresh.fields[1].field_id == 2 # reused from base + assert last_col_id == 2 # no new columns added + + +def test_assign_fresh_schema_ids_for_replace_assigns_new_ids_for_new_fields() -> None: + """Test that new fields get IDs above last_column_id.""" + base_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + new_schema = Schema( + NestedField(field_id=10, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=20, name="data", field_type=StringType(), required=False), + NestedField(field_id=30, name="new_col", field_type=BooleanType(), required=False), + ) + fresh, last_col_id = assign_fresh_schema_ids_for_replace(new_schema, base_schema, 2) + assert fresh.fields[0].field_id == 1 # reused + assert fresh.fields[1].field_id == 2 # reused + assert fresh.fields[2].field_id == 3 # new, starts after last_column_id=2 + assert last_col_id == 3 + + +def test_assign_fresh_schema_ids_for_replace_with_nested_struct() -> None: + """Test that nested struct field IDs are reused by full path name.""" + base_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField( + field_id=2, + name="location", + field_type=StructType( + NestedField(field_id=3, name="lat", field_type=FloatType(), required=False), + NestedField(field_id=4, name="lon", field_type=FloatType(), required=False), + ), + required=False, + ), + ) + new_schema = Schema( + NestedField(field_id=10, name="id", field_type=IntegerType(), required=False), + NestedField( + field_id=20, + name="location", + field_type=StructType( + NestedField(field_id=30, name="lat", field_type=FloatType(), required=False), + NestedField(field_id=40, name="lon", field_type=FloatType(), required=False), + NestedField(field_id=50, name="alt", field_type=FloatType(), required=False), + ), + required=False, + ), + ) + fresh, last_col_id = assign_fresh_schema_ids_for_replace(new_schema, base_schema, 4) + assert fresh.fields[0].field_id == 1 # id reused + assert fresh.fields[1].field_id == 2 # location reused + loc_fields = fresh.fields[1].field_type.fields + assert loc_fields[0].field_id == 3 # location.lat reused + assert loc_fields[1].field_id == 4 # location.lon reused + assert loc_fields[2].field_id == 5 # location.alt is new + assert last_col_id == 5 + + +def test_assign_fresh_schema_ids_for_replace_completely_new_schema() -> None: + """Test that a completely new schema gets IDs starting from last_column_id + 1.""" + base_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + new_schema = Schema( + NestedField(field_id=10, name="x", field_type=IntegerType(), required=False), + NestedField(field_id=20, name="y", field_type=IntegerType(), required=False), + ) + fresh, last_col_id = assign_fresh_schema_ids_for_replace(new_schema, base_schema, 2) + assert fresh.fields[0].field_id == 3 # starts after last_column_id=2 + assert fresh.fields[1].field_id == 4 + assert last_col_id == 4 From e649f9595d89761daafdb9c2141467bae2280c69 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 18:37:52 +0100 Subject: [PATCH 02/24] Extend replace_table to all catalogs, harden semantics - Move replace_table_transaction implementations to MetastoreCatalog and RestCatalog; keep stub + shared _replace_staged_table helper on Catalog so out-of-tree subclasses don't break. - Emit UpgradeFormatVersionUpdate when properties bump format-version, matching Java's TableMetadata.buildReplacement. - Always emit SetCurrentSchema / SetDefaultPartitionSpec / SetDefaultSortOrder, mirroring RESTSessionCatalog.replaceTransaction. - Handle v1 partition specs by carrying forward removed fields as VoidTransform (v1 specs are append-only). - Reject view collisions before replacing. - Assign fresh schema_id / spec_id / order_id on Add* updates so AddSchemaUpdate / AddPartitionSpecUpdate / AddSortOrderUpdate produce uniquely-keyed entries. - Tests: behaviour now covered in tests/catalog/test_catalog_behaviors.py parametrized across InMemoryCatalog + SqlCatalog. test_rest.py keeps only REST wire-specific cases (payload shape, 404, view collision, immediate commit). One end-to-end smoke test remains in tests/integration/test_rest_catalog.py. - Add docs section in mkdocs/docs/api.md. --- mkdocs/docs/api.md | 37 ++ pyiceberg/catalog/__init__.py | 111 +++++- pyiceberg/catalog/noop.py | 25 -- pyiceberg/catalog/rest/__init__.py | 50 +-- pyiceberg/partitioning.py | 100 ++++- pyiceberg/table/__init__.py | 50 ++- tests/catalog/test_catalog_behaviors.py | 151 +++++++- tests/catalog/test_rest.py | 490 ++++-------------------- tests/integration/test_rest_catalog.py | 460 +--------------------- tests/test_schema.py | 61 ++- 10 files changed, 539 insertions(+), 996 deletions(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 29d09e2604..5d1429a283 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -185,6 +185,43 @@ with catalog.create_table_transaction(identifier="docs_example.bids", schema=sch txn.set_properties(test_a="test_aa", test_b="test_b", test_c="test_c") ``` +## Replace a table + +Atomically replace an existing table's schema, partition spec, sort order, location, and properties. The table UUID and history (snapshots, schemas, specs, sort orders, metadata log) are preserved; the current snapshot is cleared (the `main` branch ref is removed). This is the analog of Spark/Trino's `CREATE OR REPLACE TABLE` for the table-metadata side, and supports RTAS-style workflows when combined with subsequent writes. + +```python +from pyiceberg.schema import Schema +from pyiceberg.types import NestedField, LongType, StringType, BooleanType + +new_schema = Schema( + NestedField(field_id=1, name="datetime", field_type=LongType(), required=False), + NestedField(field_id=2, name="symbol", field_type=StringType(), required=False), + NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), +) +catalog.replace_table( + identifier="docs_example.bids", + schema=new_schema, +) +``` + +Field IDs from columns whose names appear in the previous schema are reused, so existing data files remain readable when the new schema is a compatible superset. New columns get fresh IDs above `last-column-id`. + +Use `replace_table_transaction` to stage additional changes (writes, property updates, schema evolution) before committing — the equivalent of `CREATE OR REPLACE TABLE AS SELECT`: + +```python +with catalog.replace_table_transaction(identifier="docs_example.bids", schema=new_schema) as txn: + with txn.update_snapshot().fast_append() as snap: + for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=df, io=txn._table.io): + snap.append_data_file(data_file) + txn.set_properties(write_replaced_at="2026-04-19T00:00:00Z") +``` + +To upgrade the table's format version as part of the replace, pass `format-version` in `properties`: + +```python +catalog.replace_table(identifier="docs_example.bids", schema=new_schema, properties={"format-version": "2"}) +``` + ## Register a table To register a table using existing metadata: diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 59532d605c..8a560190e5 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -42,8 +42,12 @@ ) from pyiceberg.io import FileIO, load_file_io from pyiceberg.manifest import ManifestFile -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec -from pyiceberg.schema import Schema +from pyiceberg.partitioning import ( + UNPARTITIONED_PARTITION_SPEC, + PartitionSpec, + assign_fresh_partition_spec_ids_for_replace, +) +from pyiceberg.schema import Schema, assign_fresh_schema_ids_for_replace from pyiceberg.serializers import ToOutputFile from pyiceberg.table import ( DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, @@ -56,7 +60,7 @@ ) from pyiceberg.table.locations import load_location_provider from pyiceberg.table.metadata import TableMetadata, TableMetadataV1, new_table_metadata -from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder, assign_fresh_sort_order_ids from pyiceberg.table.update import ( TableRequirement, TableUpdate, @@ -324,6 +328,20 @@ def delete_data_files(io: FileIO, manifests_to_delete: list[ManifestFile]) -> No deleted_files[path] = True +def _raise_if_view_exists(catalog: Catalog, identifier: str | Identifier) -> None: + """Raise TableAlreadyExistsError if a view exists at the same identifier. + + Mirrors Java's `RESTSessionCatalog.replaceTransaction()` precondition. Catalogs that + don't support views raise `NotImplementedError` from `view_exists` — treat as "no view". + """ + try: + view_collision = catalog.view_exists(identifier) + except NotImplementedError: + view_collision = False + if view_collision: + raise TableAlreadyExistsError(f"View with same name already exists: {identifier}") + + def _import_catalog(name: str, catalog_impl: str, properties: Properties) -> Catalog | None: try: path_parts = catalog_impl.split(".") @@ -445,7 +463,6 @@ def create_table_if_not_exists( except TableAlreadyExistsError: return self.load_table(identifier) - @abstractmethod def replace_table( self, identifier: str | Identifier, @@ -473,9 +490,12 @@ def replace_table( Raises: NoSuchTableError: If the table does not exist. + TableAlreadyExistsError: If a view exists at the same identifier. """ + return self.replace_table_transaction( + identifier, schema, location, partition_spec, sort_order, properties + ).commit_transaction() - @abstractmethod def replace_table_transaction( self, identifier: str | Identifier, @@ -503,7 +523,63 @@ def replace_table_transaction( Raises: NoSuchTableError: If the table does not exist. + TableAlreadyExistsError: If a view exists at the same identifier. """ + raise NotImplementedError("replace_table_transaction is not supported for this catalog type") + + def _replace_staged_table( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None, + partition_spec: PartitionSpec, + sort_order: SortOrder, + properties: Properties, + ) -> tuple[StagedTable, Schema, PartitionSpec, SortOrder, str]: + """Load the existing table and build fresh schema/spec/sort-order for replacement. + + Mirrors the bookkeeping in `TableMetadata.buildReplacement` (iceberg-java): + - reuses existing field IDs by name (current schema) + - reuses partition field IDs by `(source, transform)` across all specs (v2+), + or carries forward the current spec with `VoidTransform`s (v1) + - reassigns sort field IDs against the fresh schema + - resolves `location` to the existing table's location when omitted + + Returns: + A tuple `(staged_table, fresh_schema, fresh_partition_spec, fresh_sort_order, resolved_location)`. + """ + existing_table = self.load_table(identifier) + existing_metadata = existing_table.metadata + + resolved_format_version = int(properties.get(TableProperties.FORMAT_VERSION, existing_metadata.format_version)) # type: ignore + iceberg_schema = self._convert_schema_if_needed(schema, resolved_format_version) + + fresh_schema, _ = assign_fresh_schema_ids_for_replace( + iceberg_schema, existing_metadata.schema(), existing_metadata.last_column_id + ) + + fresh_partition_spec, _ = assign_fresh_partition_spec_ids_for_replace( + partition_spec, + iceberg_schema, + fresh_schema, + existing_metadata.partition_specs, + existing_metadata.last_partition_id, + format_version=existing_metadata.format_version, + current_spec=existing_metadata.spec(), + ) + + fresh_sort_order = assign_fresh_sort_order_ids(sort_order, iceberg_schema, fresh_schema) + + resolved_location = location.rstrip("/") if location else existing_metadata.location + + staged_table = StagedTable( + identifier=existing_table.name(), + metadata=existing_metadata, + metadata_location=existing_table.metadata_location, + io=existing_table.io, + catalog=self, + ) + return staged_table, fresh_schema, fresh_partition_spec, fresh_sort_order, resolved_location @abstractmethod def load_table(self, identifier: str | Identifier) -> Table: @@ -985,18 +1061,6 @@ def create_table_transaction( self._create_staged_table(identifier, schema, location, partition_spec, sort_order, properties) ) - @override - def replace_table( - self, - identifier: str | Identifier, - schema: Schema | pa.Schema, - location: str | None = None, - partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, - sort_order: SortOrder = UNSORTED_SORT_ORDER, - properties: Properties = EMPTY_DICT, - ) -> Table: - raise NotImplementedError("replace_table is not yet supported for this catalog type") - @override def replace_table_transaction( self, @@ -1007,7 +1071,18 @@ def replace_table_transaction( sort_order: SortOrder = UNSORTED_SORT_ORDER, properties: Properties = EMPTY_DICT, ) -> ReplaceTableTransaction: - raise NotImplementedError("replace_table_transaction is not yet supported for this catalog type") + _raise_if_view_exists(self, identifier) + staged_table, fresh_schema, fresh_spec, fresh_sort_order, resolved_location = self._replace_staged_table( + identifier, schema, location, partition_spec, sort_order, properties + ) + return ReplaceTableTransaction( + table=staged_table, + new_schema=fresh_schema, + new_spec=fresh_spec, + new_sort_order=fresh_sort_order, + new_location=resolved_location, + new_properties=properties, + ) @override def table_exists(self, identifier: str | Identifier) -> bool: diff --git a/pyiceberg/catalog/noop.py b/pyiceberg/catalog/noop.py index 55b484aa15..aeb3c72843 100644 --- a/pyiceberg/catalog/noop.py +++ b/pyiceberg/catalog/noop.py @@ -28,7 +28,6 @@ from pyiceberg.table import ( CommitTableResponse, CreateTableTransaction, - ReplaceTableTransaction, Table, ) from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder @@ -69,30 +68,6 @@ def create_table_transaction( ) -> CreateTableTransaction: raise NotImplementedError - @override - def replace_table( - self, - identifier: str | Identifier, - schema: Schema | pa.Schema, - location: str | None = None, - partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, - sort_order: SortOrder = UNSORTED_SORT_ORDER, - properties: Properties = EMPTY_DICT, - ) -> Table: - raise NotImplementedError - - @override - def replace_table_transaction( - self, - identifier: str | Identifier, - schema: Schema | pa.Schema, - location: str | None = None, - partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, - sort_order: SortOrder = UNSORTED_SORT_ORDER, - properties: Properties = EMPTY_DICT, - ) -> ReplaceTableTransaction: - raise NotImplementedError - @override def load_table(self, identifier: str | Identifier) -> Table: raise NotImplementedError diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index d349efa524..0a9e48de32 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -30,7 +30,15 @@ from typing_extensions import override from pyiceberg import __version__ -from pyiceberg.catalog import BOTOCORE_SESSION, TOKEN, URI, WAREHOUSE_LOCATION, Catalog, PropertiesUpdateSummary +from pyiceberg.catalog import ( + BOTOCORE_SESSION, + TOKEN, + URI, + WAREHOUSE_LOCATION, + Catalog, + PropertiesUpdateSummary, + _raise_if_view_exists, +) from pyiceberg.catalog.rest.auth import AUTH_MANAGER, AuthManager, AuthManagerAdapter, AuthManagerFactory, LegacyOAuth2AuthManager from pyiceberg.catalog.rest.response import _handle_non_200_response from pyiceberg.catalog.rest.scan_planning import ( @@ -72,9 +80,8 @@ UNPARTITIONED_PARTITION_SPEC, PartitionSpec, assign_fresh_partition_spec_ids, - assign_fresh_partition_spec_ids_for_replace, ) -from pyiceberg.schema import Schema, assign_fresh_schema_ids, assign_fresh_schema_ids_for_replace +from pyiceberg.schema import Schema, assign_fresh_schema_ids from pyiceberg.table import ( CommitTableRequest, CommitTableResponse, @@ -990,43 +997,14 @@ def replace_table_transaction( sort_order: SortOrder = UNSORTED_SORT_ORDER, properties: Properties = EMPTY_DICT, ) -> ReplaceTableTransaction: - existing_table = self.load_table(identifier) - existing_metadata = existing_table.metadata - - iceberg_schema = self._convert_schema_if_needed( - schema, - int(properties.get(TableProperties.FORMAT_VERSION, existing_metadata.format_version)), # type: ignore - ) - - # Assign fresh schema IDs, reusing IDs from the existing schema by field name - fresh_schema, _ = assign_fresh_schema_ids_for_replace( - iceberg_schema, existing_metadata.schema(), existing_metadata.last_column_id - ) - - # Assign fresh partition spec IDs, reusing IDs from existing specs - fresh_partition_spec, _ = assign_fresh_partition_spec_ids_for_replace( - partition_spec, iceberg_schema, fresh_schema, existing_metadata.partition_specs, existing_metadata.last_partition_id + _raise_if_view_exists(self, identifier) + staged_table, fresh_schema, fresh_spec, fresh_sort_order, resolved_location = self._replace_staged_table( + identifier, schema, location, partition_spec, sort_order, properties ) - - # Assign fresh sort order IDs - fresh_sort_order = assign_fresh_sort_order_ids(sort_order, iceberg_schema, fresh_schema) - - # Use existing location if not specified - resolved_location = location.rstrip("/") if location else existing_metadata.location - - # Create a StagedTable from the existing table - staged_table = StagedTable( - identifier=existing_table.name(), - metadata=existing_metadata, - metadata_location=existing_table.metadata_location, - io=existing_table.io, - catalog=self, - ) - return ReplaceTableTransaction( table=staged_table, new_schema=fresh_schema, - new_spec=fresh_partition_spec, + new_spec=fresh_spec, new_sort_order=fresh_sort_order, new_location=resolved_location, new_properties=properties, diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 3861e62c30..65224211ba 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -341,27 +341,41 @@ def assign_fresh_partition_spec_ids_for_replace( fresh_schema: Schema, existing_specs: list[PartitionSpec], last_partition_id: int | None, + format_version: int = 2, + current_spec: PartitionSpec | None = None, ) -> tuple[PartitionSpec, int]: """Assign partition field IDs for a replace operation, reusing IDs from existing specs. - For each partition field, if a field with the same (source_id, transform) pair exists in - any of the existing specs, its partition field ID is reused; otherwise a fresh ID is - allocated starting from last_partition_id + 1. + Mirrors `TableMetadata.reassignPartitionIds` in iceberg-java: + - For v2+, reuse partition field IDs by `(source_id, transform)` across all existing specs. + New fields get IDs starting from `last_partition_id + 1`. + - For v1, the current spec's fields must be preserved (v1 specs are append-only). Fields + absent from the new spec are carried forward with a `VoidTransform`. Matching new fields + reuse the existing partition field ID; remaining new fields are appended with fresh IDs. Args: - spec: The new partition spec to assign IDs to. - old_schema: The schema that the new spec's source_ids reference. + spec: The new partition spec to assign IDs to. Its `source_id`s reference `old_schema`. + old_schema: The schema that the new spec's `source_id`s reference. fresh_schema: The schema with freshly assigned field IDs. existing_specs: All partition specs from the existing table metadata. - last_partition_id: The current table's last_partition_id. + last_partition_id: The current table's `last_partition_id`. + format_version: Table format version. Required to be set to 1 for v1 carry-forward. + current_spec: The current default partition spec. Required when `format_version <= 1`. Returns: - A tuple of (fresh_spec, new_last_partition_id). + A tuple of `(fresh_spec, new_last_partition_id)`. """ effective_last_partition_id = last_partition_id if last_partition_id is not None else PARTITION_FIELD_ID_START - 1 - # Build (source_id, transform) → partition_field_id mapping from all existing specs - # Use max() for dedup when the same (source_id, transform) appears in multiple specs + if format_version <= 1: + if current_spec is None: + raise ValueError("current_spec is required for v1 replace_table") + return _assign_fresh_partition_spec_ids_for_replace_v1( + spec, old_schema, fresh_schema, current_spec, effective_last_partition_id + ) + + # v2+: reuse field IDs by (source_id, transform) across all specs. + # Use max() for dedup when the same (source_id, transform) appears in multiple specs. transform_to_field_id: dict[tuple[int, str], int] = {} for existing_spec in existing_specs: for field in existing_spec.fields: @@ -402,6 +416,74 @@ def assign_fresh_partition_spec_ids_for_replace( return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID), new_last_partition_id +def _assign_fresh_partition_spec_ids_for_replace_v1( + spec: PartitionSpec, + old_schema: Schema, + fresh_schema: Schema, + current_spec: PartitionSpec, + effective_last_partition_id: int, +) -> tuple[PartitionSpec, int]: + """v1 branch of `assign_fresh_partition_spec_ids_for_replace`. See parent docstring.""" + # Build (fresh_source_id, transform) → (new_field, fresh_source_id) for the new spec, + # in insertion order so leftover fields keep their declared order on append. + new_field_by_key: dict[tuple[int, str], tuple[PartitionField, int]] = {} + new_field_names: list[str] = [] + for new_field in spec.fields: + col_name = old_schema.find_column_name(new_field.source_id) + if col_name is None: + raise ValueError(f"Could not find in old schema: {new_field}") + fresh_field = fresh_schema.find_field(col_name) + if fresh_field is None: + raise ValueError(f"Could not find field in fresh schema: {col_name}") + validate_partition_name(new_field.name, new_field.transform, fresh_field.field_id, fresh_schema, set()) + key = (fresh_field.field_id, str(new_field.transform)) + new_field_by_key[key] = (new_field, fresh_field.field_id) + new_field_names.append(new_field.name) + + # Walk current spec, carrying forward each field. Matching new fields consume their key; + # missing fields become void transforms. + partition_fields = [] + for cur_field in current_spec.fields: + key = (cur_field.source_id, str(cur_field.transform)) + match = new_field_by_key.pop(key, None) + if match is not None: + new_field, fresh_source_id = match + partition_fields.append( + PartitionField( + name=new_field.name, + source_id=fresh_source_id, + field_id=cur_field.field_id, + transform=new_field.transform, + ) + ) + else: + void_name = f"{cur_field.name}_{cur_field.field_id}" if cur_field.name in new_field_names else cur_field.name + partition_fields.append( + PartitionField( + name=void_name, + source_id=cur_field.source_id, + field_id=cur_field.field_id, + transform=VoidTransform(), + ) + ) + + # Append remaining new fields at the end with fresh partition IDs. + next_id = effective_last_partition_id + for new_field, fresh_source_id in new_field_by_key.values(): + next_id += 1 + partition_fields.append( + PartitionField( + name=new_field.name, + source_id=fresh_source_id, + field_id=next_id, + transform=new_field.transform, + ) + ) + + new_last_partition_id = max(next_id, effective_last_partition_id) + return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID), new_last_partition_id + + T = TypeVar("T") diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 163a06b9e5..e209e6c5c7 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1027,51 +1027,67 @@ def _initial_changes( new_location: str, new_properties: Properties, ) -> None: - """Set the initial changes that transform the existing table into the replacement.""" - # Remove the main branch ref to clear the current snapshot + """Set the initial changes that transform the existing table into the replacement. + + Mirrors Java's `TableMetadata.buildReplacement` + `RESTSessionCatalog.replaceTransaction`: + ensures `SetCurrentSchema` / `SetDefaultPartitionSpec` / `SetDefaultSortOrder` are + always emitted (even when reused), and bumps `format-version` when requested. + """ + # Upgrade format-version if requested via properties (matches Java's buildReplacement). + requested_format_version_str = new_properties.get(TableProperties.FORMAT_VERSION) + if requested_format_version_str is not None: + requested_format_version = int(requested_format_version_str) + if requested_format_version > table_metadata.format_version: + self._updates += (UpgradeFormatVersionUpdate(format_version=requested_format_version),) + + # Remove the main branch ref to clear the current snapshot. self._updates += (RemoveSnapshotRefUpdate(ref_name=MAIN_BRANCH),) - # Reuse an existing schema if structurally identical (ignoring schema_id). + # Schema: reuse an existing schema_id if structurally identical, else add a new one + # with a fresh schema_id (max + 1, matching UpdateSchema's convention). existing_schema_id = self._find_matching_schema_id(table_metadata, new_schema) if existing_schema_id is not None: - if existing_schema_id != table_metadata.current_schema_id: - self._updates += (SetCurrentSchemaUpdate(schema_id=existing_schema_id),) + self._updates += (SetCurrentSchemaUpdate(schema_id=existing_schema_id),) else: + next_schema_id = max((s.schema_id for s in table_metadata.schemas), default=-1) + 1 + schema_with_fresh_id = new_schema.model_copy(update={"schema_id": next_schema_id}) self._updates += ( - AddSchemaUpdate(schema_=new_schema), + AddSchemaUpdate(schema_=schema_with_fresh_id), SetCurrentSchemaUpdate(schema_id=-1), ) - # Only emit AddPartitionSpecUpdate + SetDefaultSpecUpdate(-1) when the spec - # is new. If an identical spec already exists, use its concrete ID. + # Partition spec: same reuse-or-add pattern. Assign a fresh spec_id on add to avoid + # collisions with existing specs (AddPartitionSpecUpdate refuses duplicate IDs). effective_spec = UNPARTITIONED_PARTITION_SPEC if new_spec.is_unpartitioned() else new_spec existing_spec_id = self._find_matching_spec_id(table_metadata, effective_spec) if existing_spec_id is not None: - if existing_spec_id != table_metadata.default_spec_id: - self._updates += (SetDefaultSpecUpdate(spec_id=existing_spec_id),) + self._updates += (SetDefaultSpecUpdate(spec_id=existing_spec_id),) else: + next_spec_id = max((s.spec_id for s in table_metadata.partition_specs), default=-1) + 1 + spec_with_fresh_id = PartitionSpec(*effective_spec.fields, spec_id=next_spec_id) self._updates += ( - AddPartitionSpecUpdate(spec=effective_spec), + AddPartitionSpecUpdate(spec=spec_with_fresh_id), SetDefaultSpecUpdate(spec_id=-1), ) - # Set the new sort order (same logic as spec). + # Sort order: same reuse-or-add pattern with fresh order_id on add. effective_sort_order = UNSORTED_SORT_ORDER if new_sort_order.is_unsorted else new_sort_order existing_order_id = self._find_matching_sort_order_id(table_metadata, effective_sort_order) if existing_order_id is not None: - if existing_order_id != table_metadata.default_sort_order_id: - self._updates += (SetDefaultSortOrderUpdate(sort_order_id=existing_order_id),) + self._updates += (SetDefaultSortOrderUpdate(sort_order_id=existing_order_id),) else: + next_order_id = max((o.order_id for o in table_metadata.sort_orders), default=-1) + 1 + sort_order_with_fresh_id = SortOrder(*effective_sort_order.fields, order_id=next_order_id) self._updates += ( - AddSortOrderUpdate(sort_order=effective_sort_order), + AddSortOrderUpdate(sort_order=sort_order_with_fresh_id), SetDefaultSortOrderUpdate(sort_order_id=-1), ) - # Set location if changed + # Set location if changed. if new_location != table_metadata.location: self._updates += (SetLocationUpdate(location=new_location),) - # Merge properties (SetPropertiesUpdate merges onto existing properties) + # Merge properties (SetPropertiesUpdate merges onto existing properties). if new_properties: self._updates += (SetPropertiesUpdate(updates=new_properties),) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index 01e0d2ce31..d5de2c8db7 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -45,7 +45,7 @@ from pyiceberg.table.snapshots import Operation from pyiceberg.table.sorting import NullOrder, SortDirection, SortField, SortOrder from pyiceberg.table.update import AddSchemaUpdate, SetCurrentSchemaUpdate -from pyiceberg.transforms import IdentityTransform +from pyiceberg.transforms import IdentityTransform, VoidTransform from pyiceberg.typedef import Identifier from pyiceberg.types import BooleanType, IntegerType, LongType, NestedField, StringType @@ -387,6 +387,155 @@ def test_load_table_from_self_identifier( assert table.metadata == loaded_table.metadata +# Replace table tests + + +def _create_simple_table(catalog: Catalog, identifier: Identifier, format_version: int = 2) -> tuple[Identifier, Schema]: + namespace = Catalog.namespace_from(identifier) + catalog.create_namespace_if_not_exists(namespace) + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + catalog.create_table(identifier, schema=schema, properties={"format-version": str(format_version)}) + return identifier, schema + + +def test_replace_table_preserves_uuid_and_clears_current_snapshot( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + _create_simple_table(catalog, test_table_identifier) + original = catalog.load_table(test_table_identifier) + data = pa.Table.from_pydict( + {"id": [1, 2], "data": ["a", "b"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + original.append(data) + assert catalog.load_table(test_table_identifier).metadata.current_snapshot_id is not None + + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), + ) + replaced = catalog.replace_table(test_table_identifier, schema=new_schema) + + assert replaced.metadata.table_uuid == original.metadata.table_uuid, "UUID must be preserved across replace" + assert replaced.metadata.current_snapshot_id is None, "main ref must be cleared" + # Snapshot history is preserved even though the current snapshot is cleared. + snapshots_before = len(catalog.load_table(test_table_identifier).metadata.snapshots) + assert len(replaced.metadata.snapshots) == snapshots_before + + +def test_replace_table_reuses_schema_id_when_structurally_identical( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + _, schema = _create_simple_table(catalog, test_table_identifier) + replaced = catalog.replace_table(test_table_identifier, schema=schema) + # No new schema was added; current schema id is unchanged. + assert [s.schema_id for s in replaced.metadata.schemas] == [0] + assert replaced.metadata.current_schema_id == 0 + + +def test_replace_table_adds_new_schema_when_different(catalog: Catalog, test_table_identifier: Identifier) -> None: + _create_simple_table(catalog, test_table_identifier) + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), + ) + replaced = catalog.replace_table(test_table_identifier, schema=new_schema) + assert sorted(s.schema_id for s in replaced.metadata.schemas) == [0, 1] + assert replaced.metadata.current_schema_id == 1 + # "id" and "data" keep field IDs 1 and 2; "extra" gets a fresh ID above last_column_id. + assert {f.name: f.field_id for f in replaced.metadata.schema().fields} == {"id": 1, "data": 2, "extra": 3} + assert replaced.metadata.last_column_id == 3 + + +def test_replace_table_reuses_partition_spec_id(catalog: Catalog, test_table_identifier: Identifier) -> None: + namespace = Catalog.namespace_from(test_table_identifier) + catalog.create_namespace(namespace) + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) + catalog.create_table(test_table_identifier, schema=schema, partition_spec=spec) + + replaced = catalog.replace_table(test_table_identifier, schema=schema, partition_spec=spec) + assert [s.spec_id for s in replaced.metadata.partition_specs] == [0] + assert replaced.metadata.default_spec_id == 0 + + +def test_replace_table_with_new_location(catalog: Catalog, test_table_identifier: Identifier, tmp_path: Path) -> None: + _, schema = _create_simple_table(catalog, test_table_identifier) + new_location = f"file://{tmp_path}/relocated" + replaced = catalog.replace_table(test_table_identifier, schema=schema, location=new_location) + assert replaced.metadata.location == new_location + + +def test_replace_table_defaults_to_existing_location(catalog: Catalog, test_table_identifier: Identifier) -> None: + _, schema = _create_simple_table(catalog, test_table_identifier) + existing_location = catalog.load_table(test_table_identifier).metadata.location + replaced = catalog.replace_table(test_table_identifier, schema=schema) + assert replaced.metadata.location == existing_location + + +def test_replace_table_merges_properties(catalog: Catalog, test_table_identifier: Identifier) -> None: + namespace = Catalog.namespace_from(test_table_identifier) + catalog.create_namespace(namespace) + schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)) + catalog.create_table(test_table_identifier, schema=schema, properties={"keep": "yes", "override": "old"}) + replaced = catalog.replace_table(test_table_identifier, schema=schema, properties={"override": "new", "new_key": "v"}) + assert replaced.properties["keep"] == "yes" + assert replaced.properties["override"] == "new" + assert replaced.properties["new_key"] == "v" + + +def test_replace_table_upgrades_format_version(catalog: Catalog, test_table_identifier: Identifier) -> None: + _, schema = _create_simple_table(catalog, test_table_identifier, format_version=1) + assert catalog.load_table(test_table_identifier).format_version == 1 + replaced = catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "2"}) + assert replaced.format_version == 2 + + +def test_replace_table_v1_carries_forward_partition_fields_as_void( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + """v1 specs are append-only; dropped partition fields must be preserved with VoidTransform.""" + namespace = Catalog.namespace_from(test_table_identifier) + catalog.create_namespace(namespace) + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) + catalog.create_table(test_table_identifier, schema=schema, partition_spec=spec, properties={"format-version": "1"}) + + replaced = catalog.replace_table(test_table_identifier, schema=schema) # default unpartitioned + new_spec = replaced.spec() + # The "id_part" field is carried forward (under a possibly-renamed identity), now as void. + void_field = next(f for f in new_spec.fields if f.field_id == 1000) + assert isinstance(void_field.transform, VoidTransform) + assert void_field.source_id == 1 + + +def test_replace_table_raises_when_table_does_not_exist(catalog: Catalog, test_table_identifier: Identifier) -> None: + schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)) + with pytest.raises(NoSuchTableError): + catalog.replace_table(test_table_identifier, schema=schema) + + +def test_replace_table_transaction_can_stage_additional_changes( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + _, schema = _create_simple_table(catalog, test_table_identifier) + with catalog.replace_table_transaction(test_table_identifier, schema=schema) as txn: + txn.set_properties({"staged": "yes"}) + replaced = catalog.load_table(test_table_identifier) + assert replaced.properties.get("staged") == "yes" + + # Rename table tests diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index eb35bf58da..65874bcfcb 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -19,6 +19,7 @@ import base64 import os +import uuid from collections.abc import Callable from typing import Any, cast from unittest import mock @@ -2900,478 +2901,153 @@ def test_load_table_without_storage_credentials( assert actual == expected -def test_replace_table_transaction_200( - rest_mock: Mocker, - example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], -) -> None: - """Test that replace_table_transaction loads the existing table, then commits with AssertTableUUID.""" - expected_table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] - example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = expected_table_uuid - - # Mock load_table (GET) to return existing table with snapshot - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - # Mock commit (POST) for the replace - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - # Replace with a new schema (3 fields: id stays, data stays, new_col is new) - new_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - NestedField(field_id=3, name="new_col", field_type=BooleanType(), required=False), - ) - txn = catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=new_schema, - ) - txn.commit_transaction() - - actual_request = rest_mock.last_request.json() - - # Verify requirements: only AssertTableUUID - assert actual_request["requirements"] == [ - {"type": "assert-table-uuid", "uuid": expected_table_uuid}, - ] - - # Verify updates sequence. Since the existing table already has the same - # unpartitioned spec and unsorted sort order, those updates are skipped. - updates = actual_request["updates"] - actions = [u["action"] for u in updates] - assert actions == [ - "remove-snapshot-ref", - "add-schema", - "set-current-schema", - ] - - # Verify remove-snapshot-ref targets "main" - assert updates[0] == {"action": "remove-snapshot-ref", "ref-name": "main"} - - # Verify schema has reused field IDs (id=1, data=2 reused from existing schema) - schema_fields = updates[1]["schema"]["fields"] - assert schema_fields[0]["id"] == 1 - assert schema_fields[0]["name"] == "id" - assert schema_fields[1]["id"] == 2 - assert schema_fields[1]["name"] == "data" - # new_col gets a fresh ID above last_column_id (which is 2), so it gets 3 - assert schema_fields[2]["id"] == 3 - assert schema_fields[2]["name"] == "new_col" - - # set-current-schema uses -1 (meaning last added) - assert updates[2] == {"action": "set-current-schema", "schema-id": -1} +# replace_table tests +# +# Catalog-agnostic behavior (UUID preservation, snapshot clearing, schema/spec/sort-order reuse, +# format-version upgrade, etc.) is covered in tests/catalog/test_catalog_behaviors.py against +# both InMemoryCatalog and SqlCatalog. The tests below cover REST-specific concerns: the wire +# payload sent to the server, 404 handling, and the view-collision pre-flight check. -def test_replace_table_transaction_preserves_uuid( +def _mock_replace_endpoints( rest_mock: Mocker, - example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], + namespace: str, + table: str, + load_response: dict[str, Any], + commit_response: dict[str, Any], + view_exists: bool = False, ) -> None: - """Test that replace preserves the table UUID.""" - table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] - example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, + rest_mock.head( + f"{TEST_URI}v1/namespaces/{namespace}/views/{table}", + status_code=204 if view_exists else 404, request_headers=TEST_HEADERS, ) - catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - txn = catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ), - ) - - # The staged table should have the same UUID as the existing table - assert str(txn.table_metadata.table_uuid) == table_uuid - - result = txn.commit_transaction() - # After commit, the table should still have the same UUID - assert str(result.metadata.table_uuid) == table_uuid - - -def test_replace_table_transaction_with_new_location( - rest_mock: Mocker, - example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], -) -> None: - """Test that replace_table_transaction with a new location includes SetLocationUpdate.""" - table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] - example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, + f"{TEST_URI}v1/namespaces/{namespace}/tables/{table}", + json=load_response, status_code=200, request_headers=TEST_HEADERS, ) rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, + f"{TEST_URI}v1/namespaces/{namespace}/tables/{table}", + json=commit_response, status_code=200, request_headers=TEST_HEADERS, ) - catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - txn = catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - ), - location="s3://new-warehouse/database/table", - ) - txn.commit_transaction() - - updates = rest_mock.last_request.json()["updates"] - actions = [u["action"] for u in updates] - # Should include set-location since the location changed - assert "set-location" in actions - set_location_update = next(u for u in updates if u["action"] == "set-location") - assert set_location_update["location"] == "s3://new-warehouse/database/table" - -def test_replace_table_transaction_with_properties( +def test_replace_table_transaction_wire_payload( rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: - """Test that replace merges properties via SetPropertiesUpdate.""" + """The replace request must carry exactly one `assert-table-uuid` requirement and the + Set* + Add* updates needed to transform the existing table — order is not asserted.""" table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, + _mock_replace_endpoints( + rest_mock, + "fokko", + "fokko2", + example_table_metadata_with_snapshot_v1_rest_json, + example_table_metadata_no_snapshot_v1_rest_json, ) catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - txn = catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - ), - properties={"new-prop": "new-value"}, + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + NestedField(field_id=3, name="new_col", field_type=BooleanType(), required=False), ) - txn.commit_transaction() + catalog.replace_table_transaction(identifier=("fokko", "fokko2"), schema=new_schema).commit_transaction() + request = rest_mock.last_request.json() - updates = rest_mock.last_request.json()["updates"] - actions = [u["action"] for u in updates] + assert request["requirements"] == [{"type": "assert-table-uuid", "uuid": table_uuid}] - assert "set-properties" in actions - set_props = next(u for u in updates if u["action"] == "set-properties") - # SetPropertiesUpdate sends the user properties; the server merges onto existing - assert set_props["updates"] == {"new-prop": "new-value"} + updates_by_action = {u["action"]: u for u in request["updates"]} + # main snapshot must be cleared on replace. + assert updates_by_action["remove-snapshot-ref"] == {"action": "remove-snapshot-ref", "ref-name": "main"} + # A new schema is added because the column set differs; field IDs for existing columns + # are reused by name. + added_schema = updates_by_action["add-schema"]["schema"] + assert {f["name"]: f["id"] for f in added_schema["fields"]} == {"id": 1, "data": 2, "new_col": 3} + # Set* updates are emitted unconditionally (matches Java's RESTSessionCatalog). + assert updates_by_action["set-current-schema"]["schema-id"] == -1 + assert "set-default-spec" in updates_by_action + assert "set-default-sort-order" in updates_by_action -def test_replace_table_transaction_with_partition_spec( +def test_replace_table_transaction_404_raises( rest_mock: Mocker, - example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: - """Test that replace_table_transaction with a new partition spec includes AddPartitionSpecUpdate.""" - table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] - example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, + rest_mock.head( + f"{TEST_URI}v1/namespaces/fokko/views/missing", + status_code=404, request_headers=TEST_HEADERS, ) - catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - txn = catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ), - partition_spec=PartitionSpec( - PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=3), name="id_trunc"), spec_id=1 - ), - ) - txn.commit_transaction() - - updates = rest_mock.last_request.json()["updates"] - add_spec = next(u for u in updates if u["action"] == "add-spec") - spec_fields = add_spec["spec"]["fields"] - assert len(spec_fields) == 1 - assert spec_fields[0]["source-id"] == 1 # id field - assert spec_fields[0]["transform"] == "truncate[3]" - assert spec_fields[0]["name"] == "id_trunc" - - # set-default-spec should also be present, pointing to the newly added spec - actions = [u["action"] for u in updates] - assert "set-default-spec" in actions - set_default_spec = next(u for u in updates if u["action"] == "set-default-spec") - assert set_default_spec["spec-id"] == -1 - - -def test_replace_table_404( - rest_mock: Mocker, -) -> None: - """Test that replace_table raises NoSuchTableError when the table doesn't exist.""" rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/nonexistent", - json={ - "error": { - "message": "Table does not exist: fokko.nonexistent", - "type": "NoSuchTableException", - "code": 404, - } - }, + f"{TEST_URI}v1/namespaces/fokko/tables/missing", + json={"error": {"message": "Table not found", "type": "NoSuchTableException", "code": 404}}, status_code=404, request_headers=TEST_HEADERS, ) catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - with pytest.raises(NoSuchTableError): - catalog.replace_table( - identifier=("fokko", "nonexistent"), - schema=Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - ), + catalog.replace_table_transaction( + identifier=("fokko", "missing"), + schema=Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=False)), ) -def test_replace_table_200( - rest_mock: Mocker, - example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], -) -> None: - """Test that replace_table commits immediately and returns the table.""" - table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] - example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - result = catalog.replace_table( - identifier=("fokko", "fokko2"), - schema=Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - ), - ) - - assert isinstance(result, Table) - # The commit should have used assert-table-uuid - actual_request = rest_mock.last_request.json() - assert actual_request["requirements"] == [ - {"type": "assert-table-uuid", "uuid": table_uuid}, - ] - - -def test_replace_table_transaction_same_location_no_set_location( - rest_mock: Mocker, - example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], -) -> None: - """Test that when location is not changed, SetLocationUpdate is NOT included.""" - table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] - example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - # Replace with no location specified - should use existing location - txn = catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - ), - ) - txn.commit_transaction() - - updates = rest_mock.last_request.json()["updates"] - actions = [u["action"] for u in updates] - # set-location should NOT be present since location didn't change - assert "set-location" not in actions - - -def test_replace_table_transaction_same_schema_skips_add_schema( - rest_mock: Mocker, - example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], -) -> None: - """Test that replacing with the same schema skips add-schema and set-current-schema.""" - table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] - example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - # Use the exact same schema as the existing table (id: int, data: string) - same_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - txn = catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=same_schema, - ) - txn.commit_transaction() - - updates = rest_mock.last_request.json()["updates"] - actions = [u["action"] for u in updates] - - # Since the schema is unchanged, add-schema and set-current-schema should be skipped - assert "add-schema" not in actions - assert "set-current-schema" not in actions - - # The only update should be remove-snapshot-ref - assert actions == ["remove-snapshot-ref"] - - -def test_replace_table_transaction_different_schema_adds_schema( +def test_replace_table_transaction_rejects_view_collision( rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: - """Test that replacing with a genuinely new schema includes add-schema and set-current-schema.""" - table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] - example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, + """A view at the same identifier must fail replace, matching Java's RESTSessionCatalog.""" + _mock_replace_endpoints( + rest_mock, + "fokko", + "fokko2", + example_table_metadata_with_snapshot_v1_rest_json, + example_table_metadata_no_snapshot_v1_rest_json, + view_exists=True, ) catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - # A new schema with a different field (new_col instead of data) - new_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=3, name="new_col", field_type=BooleanType(), required=False), - ) - txn = catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=new_schema, - ) - txn.commit_transaction() - - updates = rest_mock.last_request.json()["updates"] - actions = [u["action"] for u in updates] - - # Since the schema is different, add-schema and set-current-schema must be present - assert "add-schema" in actions - assert "set-current-schema" in actions - - # set-current-schema should reference -1 (the last added schema) - set_schema = next(u for u in updates if u["action"] == "set-current-schema") - assert set_schema["schema-id"] == -1 + with pytest.raises(TableAlreadyExistsError, match="View with same name already exists"): + catalog.replace_table_transaction( + identifier=("fokko", "fokko2"), + schema=Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=False)), + ) -def test_replace_table_transaction_with_sort_order( +def test_replace_table_commits_immediately( rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: - """Test that replacing with a custom sort order includes add-sort-order and set-default-sort-order.""" + """`replace_table` is `replace_table_transaction(...).commit_transaction()` — verify + it produces an HTTP commit, not just a staged transaction.""" table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid - - rest_mock.get( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_with_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, - ) - rest_mock.post( - f"{TEST_URI}v1/namespaces/fokko/tables/fokko2", - json=example_table_metadata_no_snapshot_v1_rest_json, - status_code=200, - request_headers=TEST_HEADERS, + _mock_replace_endpoints( + rest_mock, + "fokko", + "fokko2", + example_table_metadata_with_snapshot_v1_rest_json, + example_table_metadata_no_snapshot_v1_rest_json, ) catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - txn = catalog.replace_table_transaction( + replaced = catalog.replace_table( identifier=("fokko", "fokko2"), schema=Schema( NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=False), ), - sort_order=SortOrder(SortField(source_id=1, transform=IdentityTransform())), ) - txn.commit_transaction() + assert replaced.metadata.table_uuid == uuid.UUID(table_uuid) + # The last request should be the commit POST (not the GET load_table). + assert rest_mock.last_request.method == "POST" - updates = rest_mock.last_request.json()["updates"] - actions = [u["action"] for u in updates] - # Should include add-sort-order and set-default-sort-order - assert "add-sort-order" in actions - assert "set-default-sort-order" in actions diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py index 1fbefd5a19..6c60caef54 100644 --- a/tests/integration/test_rest_catalog.py +++ b/tests/integration/test_rest_catalog.py @@ -24,10 +24,10 @@ from pyiceberg.catalog import Catalog from pyiceberg.catalog.rest import RestCatalog -from pyiceberg.exceptions import NoSuchTableError, NoSuchViewError +from pyiceberg.exceptions import NoSuchViewError from pyiceberg.io.pyarrow import _dataframe_to_data_files from pyiceberg.schema import Schema -from pyiceberg.types import BooleanType, IntegerType, LongType, NestedField, StringType +from pyiceberg.types import BooleanType, LongType, NestedField, StringType from pyiceberg.view.metadata import SQLViewRepresentation, ViewVersion TEST_NAMESPACE_IDENTIFIER = "TEST NS" @@ -75,474 +75,46 @@ def test_create_namespace_if_already_existing(catalog: RestCatalog) -> None: @pytest.mark.integration @pytest.mark.parametrize("catalog", [lf("session_catalog")]) -@pytest.mark.parametrize("format_version", [1, 2]) -def test_replace_table_transaction(catalog: Catalog, format_version: int) -> None: - identifier = f"default.test_replace_table_txn_{catalog.name}_{format_version}" - try: +def test_replace_table_end_to_end_against_rest_server(catalog: Catalog) -> None: + """End-to-end smoke test: replace_table against a real REST server. + + Detailed replace_table semantics are covered against InMemoryCatalog and SqlCatalog in + `tests/catalog/test_catalog_behaviors.py`. This test verifies the REST wire path: server + accepts the commit, preserves the UUID, and clears the current snapshot.""" + identifier = f"default.test_replace_table_e2e_{catalog.name}" + if not catalog.namespace_exists("default"): catalog.create_namespace("default") - except Exception: - pass - try: + if catalog.table_exists(identifier): catalog.drop_table(identifier) - except NoSuchTableError: - pass - # Create a table with initial schema and write some data original_schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=False), ) - original = catalog.create_table(identifier, schema=original_schema, properties={"format-version": str(format_version)}) - original_uuid = original.metadata.table_uuid + original = catalog.create_table(identifier, schema=original_schema) pa_table = pa.Table.from_pydict( {"id": [1, 2, 3], "data": ["a", "b", "c"]}, schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), ) - with original.transaction() as txn: with txn.update_snapshot().fast_append() as snapshot_update: for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table, io=original.io): snapshot_update.append_data_file(data_file) - original.refresh() - current_snapshot = original.current_snapshot() - assert current_snapshot is not None - original_snapshot_id = current_snapshot.snapshot_id - - # Replace with a new schema - new_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="name", field_type=StringType(), required=False), - NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), - ) - - with catalog.replace_table_transaction( - identifier, schema=new_schema, properties={"format-version": str(format_version)} - ) as txn: - pass # just replace the schema, no data - - table = catalog.load_table(identifier) - - # UUID must be preserved - assert table.metadata.table_uuid == original_uuid - - # Current snapshot should be cleared (main ref removed) - assert table.current_snapshot() is None - - # Old snapshots should still exist in the metadata - assert len(table.metadata.snapshots) >= 1 - assert any(s.snapshot_id == original_snapshot_id for s in table.metadata.snapshots) - - # New schema should be current, with field IDs reused for "id" (should still be 1) - current_schema = table.schema() - id_field = current_schema.find_field("id") - assert id_field.field_id == 1 # reused from original schema - - name_field = current_schema.find_field("name") - assert name_field is not None - assert name_field.field_id >= 3 # "name" is new, must not reuse "data"'s ID (2) - - active_field = current_schema.find_field("active") - assert active_field is not None - assert active_field.field_id >= 4 # "active" is new - - # last_column_id must account for the new fields - assert table.metadata.last_column_id >= 4 - - # Old schemas should still exist — exactly 2 (original + replacement) - assert len(table.metadata.schemas) == 2 - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table(catalog: Catalog) -> None: - identifier = f"default.test_replace_table_{catalog.name}" - try: - catalog.create_namespace("default") - except Exception: - pass - try: - catalog.drop_table(identifier) - except NoSuchTableError: - pass - - original_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - original = catalog.create_table(identifier, schema=original_schema) - original_uuid = original.metadata.table_uuid + original_snapshot_id = original.current_snapshot().snapshot_id # type: ignore[union-attr] new_schema = Schema( - NestedField(field_id=1, name="x", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="y", field_type=IntegerType(), required=False), - ) - - result = catalog.replace_table(identifier, schema=new_schema) - - # UUID preserved - assert result.metadata.table_uuid == original_uuid - # New schema applied — "x" and "y" are entirely new names, so they get fresh IDs >= 3 - x_field = result.schema().find_field("x") - y_field = result.schema().find_field("y") - assert x_field is not None - assert y_field is not None - assert x_field.field_id >= 3 - assert y_field.field_id >= 4 - assert result.metadata.last_column_id >= 4 - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_not_found(catalog: Catalog) -> None: - with pytest.raises(NoSuchTableError): - catalog.replace_table( - "default.does_not_exist_for_replace", - schema=Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)), - ) - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_same_schema_no_duplication(catalog: Catalog) -> None: - """Replacing a table with the exact same schema should succeed without adding duplicates. - - The code detects that the schema already exists and skips AddSchemaUpdate, - matching Java's reuseOrCreateNewSchemaId behavior. - """ - identifier = f"default.test_replace_same_schema_{catalog.name}" - try: - catalog.create_namespace("default") - except Exception: - pass - try: - catalog.drop_table(identifier) - except NoSuchTableError: - pass - - schema_a = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - catalog.create_table(identifier, schema=schema_a) - - # Replace with the SAME schema — should succeed without adding a duplicate - catalog.replace_table(identifier, schema=schema_a) - - table = catalog.load_table(identifier) - # Should still have exactly 1 schema (no duplicate added) - assert len(table.metadata.schemas) == 1 - # Current schema ID should remain 0 (the original) - assert table.metadata.current_schema_id == 0 - # Schema should be unchanged - assert len(table.schema().fields) == 2 - assert table.schema().find_field("id") is not None - assert table.schema().find_field("data") is not None - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_back_to_previous_schema(catalog: Catalog) -> None: - """Replacing A -> B -> A where A and B have disjoint fields. - - Since field IDs are reused from the current schema only (matching Java), - "data" gets a new field ID when replacing back from B (which doesn't have "data"). - The resulting schema is structurally different from the original, so a 3rd schema - is created. This matches Java's behavior. - """ - identifier = f"default.test_replace_back_to_prev_schema_{catalog.name}" - try: - catalog.create_namespace("default") - except Exception: - pass - try: - catalog.drop_table(identifier) - except NoSuchTableError: - pass - - schema_a = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - catalog.create_table(identifier, schema=schema_a) - - # Step 2: Replace with schema B (disjoint fields: "name" and "active" instead of "data") - schema_b = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="name", field_type=StringType(), required=False), NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), ) - catalog.replace_table(identifier, schema=schema_b) - - table_after_b = catalog.load_table(identifier) - assert len(table_after_b.metadata.schemas) == 2 - - # Step 3: Replace BACK to schema A - catalog.replace_table(identifier, schema=schema_a) - - table = catalog.load_table(identifier) - # "data" gets a new field ID (not reused from historical schema A), so a 3rd schema is created - assert len(table.metadata.schemas) == 3 - # last_column_id must be monotonically non-decreasing - assert table.metadata.last_column_id >= table_after_b.metadata.last_column_id - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_last_column_id_monotonic(catalog: Catalog) -> None: - """last_column_id must never decrease, even when replacing with fewer columns.""" - identifier = f"default.test_replace_last_col_id_{catalog.name}" - try: - catalog.create_namespace("default") - except Exception: - pass - try: - catalog.drop_table(identifier) - except NoSuchTableError: - pass - - # Create table with 5 columns - schema_5col = Schema( - NestedField(field_id=1, name="a", field_type=LongType(), required=False), - NestedField(field_id=2, name="b", field_type=StringType(), required=False), - NestedField(field_id=3, name="c", field_type=StringType(), required=False), - NestedField(field_id=4, name="d", field_type=StringType(), required=False), - NestedField(field_id=5, name="e", field_type=StringType(), required=False), - ) - catalog.create_table(identifier, schema=schema_5col) - - table = catalog.load_table(identifier) - initial_last_col_id = table.metadata.last_column_id - assert initial_last_col_id >= 5, f"Initial last_column_id should be >= 5, got {initial_last_col_id}" - - # Replace with only 2 columns (subset) - schema_2col = Schema( - NestedField(field_id=1, name="a", field_type=LongType(), required=False), - NestedField(field_id=2, name="b", field_type=StringType(), required=False), - ) - catalog.replace_table(identifier, schema=schema_2col) - - table = catalog.load_table(identifier) - after_shrink_last_col_id = table.metadata.last_column_id - # last_column_id must NOT decrease - assert after_shrink_last_col_id >= initial_last_col_id, ( - f"last_column_id decreased from {initial_last_col_id} to {after_shrink_last_col_id} " - f"after replacing with fewer columns. It must be monotonically non-decreasing." - ) + replaced = catalog.replace_table(identifier, schema=new_schema) - # Replace with 3 columns (2 existing + 1 new) - schema_3col = Schema( - NestedField(field_id=1, name="a", field_type=LongType(), required=False), - NestedField(field_id=2, name="b", field_type=StringType(), required=False), - NestedField(field_id=3, name="f", field_type=BooleanType(), required=False), # new column - ) - catalog.replace_table(identifier, schema=schema_3col) - - table = catalog.load_table(identifier) - after_grow_last_col_id = table.metadata.last_column_id - # New column should get an ID > previous last_column_id, so last_column_id should grow - assert after_grow_last_col_id >= initial_last_col_id + 1, ( - f"last_column_id should be >= {initial_last_col_id + 1} after adding a new column, got {after_grow_last_col_id}" - ) - - # Verify the new column "f" got an ID > initial_last_col_id (not reusing a dropped column's ID) - f_field = table.schema().find_field("f") - assert f_field.field_id > initial_last_col_id, ( - f"New field 'f' got field_id={f_field.field_id}, but it should be > {initial_last_col_id} to maintain monotonicity" - ) - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_metadata_log_grows(catalog: Catalog) -> None: - """After replacing a table, the metadata_log should contain the pre-replace metadata.""" - identifier = f"default.test_replace_metadata_log_{catalog.name}" - try: - catalog.create_namespace("default") - except Exception: - pass - try: - catalog.drop_table(identifier) - except NoSuchTableError: - pass - - schema_a = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - catalog.create_table(identifier, schema=schema_a) - - table_before = catalog.load_table(identifier) - metadata_log_before = len(table_before.metadata.metadata_log) - - # Replace with a different schema - schema_b = Schema( - NestedField(field_id=1, name="x", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="y", field_type=IntegerType(), required=False), - ) - catalog.replace_table(identifier, schema=schema_b) - - table_after = catalog.load_table(identifier) - metadata_log_after = len(table_after.metadata.metadata_log) - - # The metadata_log should have grown by at least 1 entry - assert metadata_log_after > metadata_log_before, ( - f"metadata_log did not grow after replace_table. " - f"Before: {metadata_log_before} entries, After: {metadata_log_after} entries. " - f"The pre-replace metadata location should have been appended." - ) - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_snapshot_preserved_after_replace(catalog: Catalog) -> None: - """Snapshots are preserved but current snapshot is cleared after replace.""" - identifier = f"default.test_replace_snapshot_preserved_{catalog.name}" - try: - catalog.create_namespace("default") - except Exception: - pass - try: - catalog.drop_table(identifier) - except NoSuchTableError: - pass - - schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - table = catalog.create_table(identifier, schema=schema) - - # Write data to create a snapshot - pa_table = pa.Table.from_pydict( - {"id": [1, 2], "data": ["a", "b"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), - ) - with table.transaction() as txn: - with txn.update_snapshot().fast_append() as snapshot_update: - for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table, io=table.io): - snapshot_update.append_data_file(data_file) - - table.refresh() - current_snapshot = table.current_snapshot() - assert current_snapshot is not None - original_snapshot_id = current_snapshot.snapshot_id - original_snapshot_log_len = len(table.metadata.snapshot_log) - - # Replace with new schema - new_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="value", field_type=StringType(), required=False), - ) - catalog.replace_table(identifier, schema=new_schema) - - replaced = catalog.load_table(identifier) - - # Current snapshot cleared (main ref removed) + assert replaced.metadata.table_uuid == original.metadata.table_uuid assert replaced.current_snapshot() is None - - # Old snapshot still in metadata assert any(s.snapshot_id == original_snapshot_id for s in replaced.metadata.snapshots) - - # Snapshot log preserved - assert len(replaced.metadata.snapshot_log) >= original_snapshot_log_len - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_with_partition_spec(catalog: Catalog) -> None: - """Replace table with a new partition spec preserves old spec in metadata.""" - from pyiceberg.partitioning import PartitionField, PartitionSpec - from pyiceberg.transforms import IdentityTransform - - identifier = f"default.test_replace_with_spec_{catalog.name}" - try: - catalog.create_namespace("default") - except Exception: - pass - try: - catalog.drop_table(identifier) - except NoSuchTableError: - pass - - schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - catalog.create_table(identifier, schema=schema) - - # Replace with a partition spec on "id" - new_spec = PartitionSpec( - PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), - spec_id=0, - ) - catalog.replace_table(identifier, schema=schema, partition_spec=new_spec) - - table = catalog.load_table(identifier) - - # New spec is the default - current_spec = table.metadata.spec() - assert len(current_spec.fields) == 1 - assert current_spec.fields[0].name == "id" - - # Old unpartitioned spec still in metadata - assert len(table.metadata.partition_specs) >= 2 - - # last_partition_id should be correct - assert table.metadata.last_partition_id is not None and table.metadata.last_partition_id >= 1000 - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_sequential_replaces(catalog: Catalog) -> None: - """Multiple sequential replaces: schemas grow, last_column_id is monotonic, metadata_log grows.""" - identifier = f"default.test_replace_sequential_{catalog.name}" - try: - catalog.create_namespace("default") - except Exception: - pass - try: - catalog.drop_table(identifier) - except NoSuchTableError: - pass - - schema_a = Schema( - NestedField(field_id=1, name="a", field_type=LongType(), required=False), - NestedField(field_id=2, name="b", field_type=StringType(), required=False), - ) - catalog.create_table(identifier, schema=schema_a) - - table = catalog.load_table(identifier) - prev_last_col_id = table.metadata.last_column_id - prev_metadata_log_len = len(table.metadata.metadata_log) - - # Replace 1: A -> B (completely different fields) - schema_b = Schema( - NestedField(field_id=1, name="x", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="y", field_type=IntegerType(), required=False), - NestedField(field_id=3, name="z", field_type=BooleanType(), required=False), - ) - catalog.replace_table(identifier, schema=schema_b) - - table = catalog.load_table(identifier) - assert table.metadata.last_column_id >= prev_last_col_id - assert len(table.metadata.schemas) == 2 - assert len(table.metadata.metadata_log) > prev_metadata_log_len - prev_last_col_id = table.metadata.last_column_id - prev_metadata_log_len = len(table.metadata.metadata_log) - - # Replace 2: B -> C (again different) - schema_c = Schema( - NestedField(field_id=1, name="p", field_type=StringType(), required=False), - NestedField(field_id=2, name="q", field_type=LongType(), required=False), - ) - catalog.replace_table(identifier, schema=schema_c) - - table = catalog.load_table(identifier) - assert table.metadata.last_column_id >= prev_last_col_id - assert len(table.metadata.schemas) == 3 # A, B, C all have different fields - assert len(table.metadata.metadata_log) > prev_metadata_log_len + catalog.drop_table(identifier) @pytest.mark.integration diff --git a/tests/test_schema.py b/tests/test_schema.py index 3937354a5d..825713d83a 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -1818,38 +1818,35 @@ def test_check_schema_compatible_optional_map_field_present() -> None: _check_schema_compatible(requested_schema, provided_schema) -def test_assign_fresh_schema_ids_for_replace_reuses_ids() -> None: - """Test that field IDs are reused from the base schema by name.""" - base_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - new_schema = Schema( - NestedField(field_id=10, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=20, name="data", field_type=StringType(), required=False), - ) - fresh, last_col_id = assign_fresh_schema_ids_for_replace(new_schema, base_schema, 2) - assert fresh.fields[0].field_id == 1 # reused from base - assert fresh.fields[1].field_id == 2 # reused from base - assert last_col_id == 2 # no new columns added - - -def test_assign_fresh_schema_ids_for_replace_assigns_new_ids_for_new_fields() -> None: - """Test that new fields get IDs above last_column_id.""" +@pytest.mark.parametrize( + "new_fields, expected_ids, expected_last_col_id", + [ + # All columns reused by name: IDs come from base, last_column_id unchanged. + ([("id", IntegerType()), ("data", StringType())], [1, 2], 2), + # Mix of reused and new: new column gets ID above last_column_id. + ([("id", IntegerType()), ("data", StringType()), ("new_col", BooleanType())], [1, 2, 3], 3), + # No column names match: all fresh IDs starting from last_column_id + 1. + ([("x", IntegerType()), ("y", IntegerType())], [3, 4], 4), + ], + ids=["all-reused", "mixed", "none-reused"], +) +def test_assign_fresh_schema_ids_for_replace_primitive_fields( + new_fields: list[tuple[str, IcebergType]], expected_ids: list[int], expected_last_col_id: int +) -> None: + """Replace schema field IDs are reused from the base schema by name; new fields get IDs above last_column_id.""" base_schema = Schema( NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=False), ) new_schema = Schema( - NestedField(field_id=10, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=20, name="data", field_type=StringType(), required=False), - NestedField(field_id=30, name="new_col", field_type=BooleanType(), required=False), + *( + NestedField(field_id=10 * (i + 1), name=name, field_type=field_type, required=False) + for i, (name, field_type) in enumerate(new_fields) + ) ) fresh, last_col_id = assign_fresh_schema_ids_for_replace(new_schema, base_schema, 2) - assert fresh.fields[0].field_id == 1 # reused - assert fresh.fields[1].field_id == 2 # reused - assert fresh.fields[2].field_id == 3 # new, starts after last_column_id=2 - assert last_col_id == 3 + assert [f.field_id for f in fresh.fields] == expected_ids + assert last_col_id == expected_last_col_id def test_assign_fresh_schema_ids_for_replace_with_nested_struct() -> None: @@ -1889,17 +1886,3 @@ def test_assign_fresh_schema_ids_for_replace_with_nested_struct() -> None: assert last_col_id == 5 -def test_assign_fresh_schema_ids_for_replace_completely_new_schema() -> None: - """Test that a completely new schema gets IDs starting from last_column_id + 1.""" - base_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - new_schema = Schema( - NestedField(field_id=10, name="x", field_type=IntegerType(), required=False), - NestedField(field_id=20, name="y", field_type=IntegerType(), required=False), - ) - fresh, last_col_id = assign_fresh_schema_ids_for_replace(new_schema, base_schema, 2) - assert fresh.fields[0].field_id == 3 # starts after last_column_id=2 - assert fresh.fields[1].field_id == 4 - assert last_col_id == 4 From 86752569ab8688f153cdf1b93c20a8fa024628d1 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 18:41:50 +0100 Subject: [PATCH 03/24] Test RTAS: replace_table_transaction with atomic write Adds two new behavior tests: - test_replace_table_transaction_with_write_atomic_rtas (memory + sql): replace + fast_append in one transaction lands schema swap and new data atomically. New snapshot is current, old snapshot preserved in history. - test_replace_table_followed_by_separate_append (memory + sql): replace_table clears the current snapshot; a subsequent append restores main ref with new data only. - test_replace_table_transaction_rtas_against_rest_server: same RTAS flow exercised end-to-end against the REST docker stack. The bare replace_table() is the DDL-only form (clears current snapshot, preserves history). RTAS via replace_table_transaction is the primary use case for atomic schema-and-data swaps. --- tests/catalog/test_catalog_behaviors.py | 68 +++++++++++++++++++++++++ tests/integration/test_rest_catalog.py | 45 ++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index d5de2c8db7..4b0c66c3f1 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -536,6 +536,74 @@ def test_replace_table_transaction_can_stage_additional_changes( assert replaced.properties.get("staged") == "yes" +def test_replace_table_transaction_with_write_atomic_rtas( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + """RTAS (Replace Table As Select): replace the table and write new data in one transaction. + + Verifies the primary use case for `replace_table_transaction`: the new schema and the new + data land atomically, the new snapshot becomes the current snapshot (main ref is restored + on commit because the transaction emits a fast-append), and the old snapshot is preserved + in history.""" + _create_simple_table(catalog, test_table_identifier) + original_table = catalog.load_table(test_table_identifier) + old_data = pa.Table.from_pydict( + {"id": [1], "data": ["old"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + original_table.append(old_data) + old_snapshot_id = catalog.load_table(test_table_identifier).current_snapshot().snapshot_id # type: ignore[union-attr] + + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="name", field_type=StringType(), required=False), + ) + new_data = pa.Table.from_pydict( + {"id": [10, 20], "name": ["alice", "bob"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string())]), + ) + with catalog.replace_table_transaction(test_table_identifier, schema=new_schema) as txn: + with txn.update_snapshot().fast_append() as snap: + for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=new_data, io=txn._table.io): + snap.append_data_file(data_file) + + replaced = catalog.load_table(test_table_identifier) + # Atomically: new schema is current, new snapshot is current, old snapshot is in history. + assert replaced.current_snapshot() is not None + assert replaced.current_snapshot().snapshot_id != old_snapshot_id # type: ignore[union-attr] + assert any(s.snapshot_id == old_snapshot_id for s in replaced.metadata.snapshots) + assert {f.name for f in replaced.schema().fields} == {"id", "name"} + # The new snapshot reflects the new data only — the old "data" column's row is gone from + # the active view (still in history). + assert replaced.scan().to_arrow().num_rows == 2 + + +def test_replace_table_followed_by_separate_append( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + """`replace_table` clears the current snapshot; a subsequent `append` makes a new one current.""" + _, schema = _create_simple_table(catalog, test_table_identifier) + catalog.load_table(test_table_identifier).append( + pa.Table.from_pydict( + {"id": [1], "data": ["x"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + ) + + replaced = catalog.replace_table(test_table_identifier, schema=schema) + assert replaced.current_snapshot() is None + + replaced.append( + pa.Table.from_pydict( + {"id": [42], "data": ["after-replace"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + ) + after = catalog.load_table(test_table_identifier) + assert after.current_snapshot() is not None + assert after.scan().to_arrow().num_rows == 1 # Only the post-replace row is visible. + + # Rename table tests diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py index 6c60caef54..73e23dfd57 100644 --- a/tests/integration/test_rest_catalog.py +++ b/tests/integration/test_rest_catalog.py @@ -117,6 +117,51 @@ def test_replace_table_end_to_end_against_rest_server(catalog: Catalog) -> None: catalog.drop_table(identifier) +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [lf("session_catalog")]) +def test_replace_table_transaction_rtas_against_rest_server(catalog: Catalog) -> None: + """RTAS (Replace Table As Select) against a real REST server: the schema swap and the + new-data write must land atomically — the new snapshot is current on commit.""" + identifier = f"default.test_replace_rtas_{catalog.name}" + if not catalog.namespace_exists("default"): + catalog.create_namespace("default") + if catalog.table_exists(identifier): + catalog.drop_table(identifier) + + original_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + original = catalog.create_table(identifier, schema=original_schema) + original.append( + pa.Table.from_pydict( + {"id": [1], "data": ["old"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + ) + old_snapshot_id = catalog.load_table(identifier).current_snapshot().snapshot_id # type: ignore[union-attr] + + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="name", field_type=StringType(), required=False), + ) + new_data = pa.Table.from_pydict( + {"id": [10, 20], "name": ["alice", "bob"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string())]), + ) + with catalog.replace_table_transaction(identifier, schema=new_schema) as txn: + with txn.update_snapshot().fast_append() as snap: + for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=new_data, io=txn._table.io): + snap.append_data_file(data_file) + + replaced = catalog.load_table(identifier) + assert replaced.current_snapshot() is not None + assert replaced.current_snapshot().snapshot_id != old_snapshot_id # type: ignore[union-attr] + assert any(s.snapshot_id == old_snapshot_id for s in replaced.metadata.snapshots) + assert replaced.scan().to_arrow().num_rows == 2 + catalog.drop_table(identifier) + + @pytest.mark.integration @pytest.mark.parametrize("catalog", [lf("session_catalog")]) def test_load_view(catalog: RestCatalog, table_schema_nested: Schema, database_name: str, view_name: str) -> None: From fd7e11f722c48fad92aee2551edfc6cbc73a941a Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 19:06:38 +0100 Subject: [PATCH 04/24] Address review feedback (see PR description for full details) --- mkdocs/docs/api.md | 6 +- pyiceberg/catalog/__init__.py | 26 ++- pyiceberg/catalog/rest/__init__.py | 20 -- pyiceberg/partitioning.py | 36 ++- pyiceberg/schema.py | 4 + pyiceberg/table/__init__.py | 50 ++-- tests/catalog/test_catalog_behaviors.py | 297 ++++++++++++++++-------- tests/catalog/test_rest.py | 47 ++-- tests/table/test_partitioning.py | 114 +++++---- tests/test_schema.py | 6 +- 10 files changed, 402 insertions(+), 204 deletions(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 5d1429a283..fb43851fe9 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -187,7 +187,7 @@ with catalog.create_table_transaction(identifier="docs_example.bids", schema=sch ## Replace a table -Atomically replace an existing table's schema, partition spec, sort order, location, and properties. The table UUID and history (snapshots, schemas, specs, sort orders, metadata log) are preserved; the current snapshot is cleared (the `main` branch ref is removed). This is the analog of Spark/Trino's `CREATE OR REPLACE TABLE` for the table-metadata side, and supports RTAS-style workflows when combined with subsequent writes. +Atomically replace an existing table's schema, partition spec, sort order, location, and properties. The table UUID and history (snapshots, schemas, specs, sort orders, metadata log) are preserved; the current snapshot is cleared (the `main` branch ref is removed). Use this when you want to redefine the table's metadata; pair it with `replace_table_transaction` to atomically write new data alongside the metadata change (RTAS-style). ```python from pyiceberg.schema import Schema @@ -206,7 +206,9 @@ catalog.replace_table( Field IDs from columns whose names appear in the previous schema are reused, so existing data files remain readable when the new schema is a compatible superset. New columns get fresh IDs above `last-column-id`. -Use `replace_table_transaction` to stage additional changes (writes, property updates, schema evolution) before committing — the equivalent of `CREATE OR REPLACE TABLE AS SELECT`: +Properties passed to `replace_table` are **merged** with the existing table properties (your values override; existing keys you don't pass are preserved). To remove a property as part of the replace, use `replace_table_transaction` and remove it explicitly within the transaction. + +Use `replace_table_transaction` to stage additional changes (writes, property updates, schema evolution) before committing — for example, swap the schema and write new data atomically: ```python with catalog.replace_table_transaction(identifier="docs_example.bids", schema=new_schema) as txn: diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 8a560190e5..ae79fa8ade 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -331,8 +331,8 @@ def delete_data_files(io: FileIO, manifests_to_delete: list[ManifestFile]) -> No def _raise_if_view_exists(catalog: Catalog, identifier: str | Identifier) -> None: """Raise TableAlreadyExistsError if a view exists at the same identifier. - Mirrors Java's `RESTSessionCatalog.replaceTransaction()` precondition. Catalogs that - don't support views raise `NotImplementedError` from `view_exists` — treat as "no view". + Catalogs that don't support views raise `NotImplementedError` from `view_exists` — + treat as "no view" in that case. """ try: view_collision = catalog.view_exists(identifier) @@ -483,7 +483,10 @@ def replace_table( location (str | None): New table location. Defaults to the existing location. partition_spec (PartitionSpec): New partition spec. sort_order (SortOrder): New sort order. - properties (Properties): New table properties (merged with existing). + properties (Properties): Properties to apply. Merged on top of the existing + table properties: keys present here override existing values; existing keys + not present here are preserved. To remove a property, follow up with a + transaction that removes it explicitly. Returns: Table: the replaced table instance. @@ -516,7 +519,10 @@ def replace_table_transaction( location (str | None): New table location. Defaults to the existing location. partition_spec (PartitionSpec): New partition spec. sort_order (SortOrder): New sort order. - properties (Properties): New table properties (merged with existing). + properties (Properties): Properties to apply. Merged on top of the existing + table properties: keys present here override existing values; existing keys + not present here are preserved. To remove a property, follow up with a + transaction that removes it explicitly. Returns: ReplaceTableTransaction: A transaction for the replace operation. @@ -538,8 +544,7 @@ def _replace_staged_table( ) -> tuple[StagedTable, Schema, PartitionSpec, SortOrder, str]: """Load the existing table and build fresh schema/spec/sort-order for replacement. - Mirrors the bookkeeping in `TableMetadata.buildReplacement` (iceberg-java): - - reuses existing field IDs by name (current schema) + - reuses existing field IDs by name (from the current schema) - reuses partition field IDs by `(source, transform)` across all specs (v2+), or carries forward the current spec with `VoidTransform`s (v1) - reassigns sort field IDs against the fresh schema @@ -551,7 +556,14 @@ def _replace_staged_table( existing_table = self.load_table(identifier) existing_metadata = existing_table.metadata - resolved_format_version = int(properties.get(TableProperties.FORMAT_VERSION, existing_metadata.format_version)) # type: ignore + requested_format_version = properties.get(TableProperties.FORMAT_VERSION) + if requested_format_version is not None and int(requested_format_version) < existing_metadata.format_version: + raise ValueError( + f"Cannot downgrade format-version from {existing_metadata.format_version} to {requested_format_version}" + ) + resolved_format_version = ( + int(requested_format_version) if requested_format_version is not None else existing_metadata.format_version + ) iceberg_schema = self._convert_schema_if_needed(schema, resolved_format_version) fresh_schema, _ = assign_fresh_schema_ids_for_replace( diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index 0a9e48de32..0ef4c809f2 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -966,26 +966,6 @@ def create_table_transaction( staged_table = self._response_to_staged_table(self.identifier_to_tuple(identifier), table_response) return CreateTableTransaction(staged_table) - @override - def replace_table( - self, - identifier: str | Identifier, - schema: Schema | pa.Schema, - location: str | None = None, - partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, - sort_order: SortOrder = UNSORTED_SORT_ORDER, - properties: Properties = EMPTY_DICT, - ) -> Table: - txn = self.replace_table_transaction( - identifier=identifier, - schema=schema, - location=location, - partition_spec=partition_spec, - sort_order=sort_order, - properties=properties, - ) - return txn.commit_transaction() - @override @retry(**_RETRY_ARGS) def replace_table_transaction( diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 65224211ba..b51f37443c 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -346,7 +346,6 @@ def assign_fresh_partition_spec_ids_for_replace( ) -> tuple[PartitionSpec, int]: """Assign partition field IDs for a replace operation, reusing IDs from existing specs. - Mirrors `TableMetadata.reassignPartitionIds` in iceberg-java: - For v2+, reuse partition field IDs by `(source_id, transform)` across all existing specs. New fields get IDs starting from `last_partition_id + 1`. - For v1, the current spec's fields must be preserved (v1 specs are append-only). Fields @@ -374,8 +373,8 @@ def assign_fresh_partition_spec_ids_for_replace( spec, old_schema, fresh_schema, current_spec, effective_last_partition_id ) - # v2+: reuse field IDs by (source_id, transform) across all specs. - # Use max() for dedup when the same (source_id, transform) appears in multiple specs. + # v2+: reuse field IDs by (source_id, transform) across all specs. When the same + # (source_id, transform) appears in multiple specs, prefer the highest field_id. transform_to_field_id: dict[tuple[int, str], int] = {} for existing_spec in existing_specs: for field in existing_spec.fields: @@ -412,8 +411,9 @@ def assign_fresh_partition_spec_ids_for_replace( ) ) - new_last_partition_id = max(next_id, effective_last_partition_id) - return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID), new_last_partition_id + # `next_id` starts at `effective_last_partition_id` and only increments, so it is the + # new last partition id. + return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID), next_id def _assign_fresh_partition_spec_ids_for_replace_v1( @@ -442,6 +442,7 @@ def _assign_fresh_partition_spec_ids_for_replace_v1( # Walk current spec, carrying forward each field. Matching new fields consume their key; # missing fields become void transforms. + used_names: set[str] = set(new_field_names) partition_fields = [] for cur_field in current_spec.fields: key = (cur_field.source_id, str(cur_field.transform)) @@ -456,8 +457,10 @@ def _assign_fresh_partition_spec_ids_for_replace_v1( transform=new_field.transform, ) ) + used_names.add(new_field.name) else: - void_name = f"{cur_field.name}_{cur_field.field_id}" if cur_field.name in new_field_names else cur_field.name + void_name = _unique_void_name(cur_field.name, cur_field.field_id, used_names) + used_names.add(void_name) partition_fields.append( PartitionField( name=void_name, @@ -480,8 +483,25 @@ def _assign_fresh_partition_spec_ids_for_replace_v1( ) ) - new_last_partition_id = max(next_id, effective_last_partition_id) - return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID), new_last_partition_id + # `next_id` starts at `effective_last_partition_id` and only increments, so it is the + # new last partition id. + return PartitionSpec(*partition_fields, spec_id=INITIAL_PARTITION_SPEC_ID), next_id + + +def _unique_void_name(base_name: str, field_id: int, used_names: set[str]) -> str: + """Pick a void-transform name that does not collide with already-used names. + + First tries `base_name`; if taken, tries `base_name_{field_id}`; if still taken, + appends `_2`, `_3`, ... until unique. + """ + if base_name not in used_names: + return base_name + candidate = f"{base_name}_{field_id}" + suffix = 2 + while candidate in used_names: + candidate = f"{base_name}_{field_id}_{suffix}" + suffix += 1 + return candidate T = TypeVar("T") diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 9d60787978..7ae198c74d 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1386,6 +1386,10 @@ class _SetFreshIDsForReplace(_SetFreshIDs): For each field in the new schema, if a field with the same full name exists in the base schema, its ID is reused; otherwise a fresh ID is allocated starting from last_column_id + 1. + + Note: ID reuse is purely name-based — a field whose name matches but whose type differs + (e.g. `int` → `string`) will reuse the base ID. This is intentional: replace allows + arbitrary schema changes; type compatibility is the caller's responsibility. """ def __init__(self, old_id_to_base_id: dict[int, int], starting_id: int) -> None: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index e209e6c5c7..de4a3fc4b3 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -58,6 +58,8 @@ AddSchemaUpdate, AddSortOrderUpdate, AssertCreate, + AssertLastAssignedFieldId, + AssertLastAssignedPartitionId, AssertRefSnapshotId, AssertTableUUID, AssignUUIDUpdate, @@ -1018,6 +1020,18 @@ class ReplaceTableTransaction(Transaction): schema/spec/sort-order/location/properties are applied. """ + def __init__( + self, + table: StagedTable, + new_schema: Schema, + new_spec: PartitionSpec, + new_sort_order: SortOrder, + new_location: str, + new_properties: Properties, + ) -> None: + super().__init__(table, autocommit=False) + self._initial_changes(table.metadata, new_schema, new_spec, new_sort_order, new_location, new_properties) + def _initial_changes( self, table_metadata: TableMetadata, @@ -1029,11 +1043,11 @@ def _initial_changes( ) -> None: """Set the initial changes that transform the existing table into the replacement. - Mirrors Java's `TableMetadata.buildReplacement` + `RESTSessionCatalog.replaceTransaction`: - ensures `SetCurrentSchema` / `SetDefaultPartitionSpec` / `SetDefaultSortOrder` are - always emitted (even when reused), and bumps `format-version` when requested. + Always emits `SetCurrentSchema` / `SetDefaultPartitionSpec` / `SetDefaultSortOrder` + (even when the resulting id is reused) so the request body unambiguously signals a + replace. Bumps `format-version` when the new properties request it. """ - # Upgrade format-version if requested via properties (matches Java's buildReplacement). + # Upgrade format-version if requested via properties. requested_format_version_str = new_properties.get(TableProperties.FORMAT_VERSION) if requested_format_version_str is not None: requested_format_version = int(requested_format_version_str) @@ -1115,30 +1129,30 @@ def _find_matching_sort_order_id(table_metadata: TableMetadata, sort_order: Sort return existing.order_id return None - def __init__( - self, - table: StagedTable, - new_schema: Schema, - new_spec: PartitionSpec, - new_sort_order: SortOrder, - new_location: str, - new_properties: Properties, - ) -> None: - super().__init__(table, autocommit=False) - self._initial_changes(table.metadata, new_schema, new_spec, new_sort_order, new_location, new_properties) - def commit_transaction(self) -> Table: """Commit the replace changes to the catalog. - Uses AssertTableUUID as the only requirement. + Requirements: + - `AssertTableUUID` — the table identity hasn't changed since load. + - `AssertLastAssignedFieldId` — guards against a concurrent commit bumping + `last-column-id` between load and commit (which would cause our newly-assigned + field IDs to collide). + - `AssertLastAssignedPartitionId` — same guard for partition field IDs. Returns: The table with the updates applied. """ if len(self._updates) > 0: + base = self._table.metadata + requirements: tuple[TableRequirement, ...] = ( + AssertTableUUID(uuid=base.table_uuid), + AssertLastAssignedFieldId(last_assigned_field_id=base.last_column_id), + ) + if base.last_partition_id is not None: + requirements += (AssertLastAssignedPartitionId(last_assigned_partition_id=base.last_partition_id),) self._table._do_commit( # pylint: disable=W0212 updates=self._updates, - requirements=(AssertTableUUID(uuid=self._table.metadata.table_uuid),), + requirements=requirements, ) self._updates = () diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index 4b0c66c3f1..ddc0135e62 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -390,28 +390,46 @@ def test_load_table_from_self_identifier( # Replace table tests -def _create_simple_table(catalog: Catalog, identifier: Identifier, format_version: int = 2) -> tuple[Identifier, Schema]: +_SIMPLE_SCHEMA = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), +) + + +def _create_simple_table( + catalog: Catalog, + identifier: Identifier, + *, + schema: Schema = _SIMPLE_SCHEMA, + format_version: int = 2, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + properties: dict[str, str] | None = None, +) -> tuple[Identifier, Schema]: namespace = Catalog.namespace_from(identifier) catalog.create_namespace_if_not_exists(namespace) - schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - catalog.create_table(identifier, schema=schema, properties={"format-version": str(format_version)}) + merged_properties = {"format-version": str(format_version), **(properties or {})} + catalog.create_table(identifier, schema=schema, partition_spec=partition_spec, properties=merged_properties) return identifier, schema +def _simple_data(num_rows: int = 2) -> pa.Table: + return pa.Table.from_pydict( + {"id": list(range(num_rows)), "data": [chr(ord("a") + i) for i in range(num_rows)]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + + def test_replace_table_preserves_uuid_and_clears_current_snapshot( catalog: Catalog, test_table_identifier: Identifier ) -> None: + """Replace preserves table_uuid and snapshot history, but clears the main ref.""" _create_simple_table(catalog, test_table_identifier) original = catalog.load_table(test_table_identifier) - data = pa.Table.from_pydict( - {"id": [1, 2], "data": ["a", "b"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), - ) - original.append(data) - assert catalog.load_table(test_table_identifier).metadata.current_snapshot_id is not None + original.append(_simple_data()) + after_append = catalog.load_table(test_table_identifier) + assert after_append.metadata.current_snapshot_id is not None, "fixture must produce a snapshot before we replace" + snapshots_before = len(after_append.metadata.snapshots) + assert snapshots_before >= 1 new_schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), @@ -420,104 +438,162 @@ def test_replace_table_preserves_uuid_and_clears_current_snapshot( ) replaced = catalog.replace_table(test_table_identifier, schema=new_schema) - assert replaced.metadata.table_uuid == original.metadata.table_uuid, "UUID must be preserved across replace" - assert replaced.metadata.current_snapshot_id is None, "main ref must be cleared" - # Snapshot history is preserved even though the current snapshot is cleared. - snapshots_before = len(catalog.load_table(test_table_identifier).metadata.snapshots) + assert replaced.metadata.table_uuid == original.metadata.table_uuid + assert replaced.metadata.current_snapshot_id is None assert len(replaced.metadata.snapshots) == snapshots_before -def test_replace_table_reuses_schema_id_when_structurally_identical( - catalog: Catalog, test_table_identifier: Identifier +@pytest.mark.parametrize( + "extra_fields, expected_schema_ids, expected_current, expected_fields, expected_last_col_id", + [ + # Identical to the existing schema → reuse the same schema_id, no new schema appended. + ([], [0], 0, {"id": 1, "data": 2}, 2), + # Extra column → append a new schema; existing IDs preserved by name, new one above last_column_id. + ( + [NestedField(field_id=99, name="extra", field_type=BooleanType(), required=False)], + [0, 1], + 1, + {"id": 1, "data": 2, "extra": 3}, + 3, + ), + ], + ids=["identical-reuses-id", "extended-adds-new-id"], +) +def test_replace_table_schema_id_reuse( + catalog: Catalog, + test_table_identifier: Identifier, + extra_fields: list[NestedField], + expected_schema_ids: list[int], + expected_current: int, + expected_fields: dict[str, int], + expected_last_col_id: int, ) -> None: - _, schema = _create_simple_table(catalog, test_table_identifier) - replaced = catalog.replace_table(test_table_identifier, schema=schema) - # No new schema was added; current schema id is unchanged. - assert [s.schema_id for s in replaced.metadata.schemas] == [0] - assert replaced.metadata.current_schema_id == 0 + """A structurally identical schema reuses its schema_id; an extended schema adds a new one.""" + _, base_schema = _create_simple_table(catalog, test_table_identifier) + new_schema = Schema(*base_schema.fields, *extra_fields) + replaced = catalog.replace_table(test_table_identifier, schema=new_schema) + assert sorted(s.schema_id for s in replaced.metadata.schemas) == expected_schema_ids + assert replaced.metadata.current_schema_id == expected_current + assert {f.name: f.field_id for f in replaced.metadata.schema().fields} == expected_fields + assert replaced.metadata.last_column_id == expected_last_col_id -def test_replace_table_adds_new_schema_when_different(catalog: Catalog, test_table_identifier: Identifier) -> None: + +def test_replace_table_accepts_pyarrow_schema(catalog: Catalog, test_table_identifier: Identifier) -> None: + """The schema argument may be a PyArrow schema; it must be converted and have IDs assigned.""" _create_simple_table(catalog, test_table_identifier) + pa_schema = pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string()), pa.field("extra", pa.bool_())]) + replaced = catalog.replace_table(test_table_identifier, schema=pa_schema) + assert {f.name for f in replaced.schema().fields} == {"id", "data", "extra"} + + +def test_replace_table_preserves_identifier_field_ids(catalog: Catalog, test_table_identifier: Identifier) -> None: + """Identifier-field IDs on the new schema are honored after reuse-by-name.""" + schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + identifier_field_ids=[1], + ) + _create_simple_table(catalog, test_table_identifier, schema=schema) new_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=False), NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), + identifier_field_ids=[1], ) replaced = catalog.replace_table(test_table_identifier, schema=new_schema) - assert sorted(s.schema_id for s in replaced.metadata.schemas) == [0, 1] - assert replaced.metadata.current_schema_id == 1 - # "id" and "data" keep field IDs 1 and 2; "extra" gets a fresh ID above last_column_id. - assert {f.name: f.field_id for f in replaced.metadata.schema().fields} == {"id": 1, "data": 2, "extra": 3} - assert replaced.metadata.last_column_id == 3 + assert list(replaced.schema().identifier_field_ids) == [1] def test_replace_table_reuses_partition_spec_id(catalog: Catalog, test_table_identifier: Identifier) -> None: - namespace = Catalog.namespace_from(test_table_identifier) - catalog.create_namespace(namespace) - schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) + """An identical partition spec reuses its spec_id rather than appending a new one.""" spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) - catalog.create_table(test_table_identifier, schema=schema, partition_spec=spec) - + _, schema = _create_simple_table(catalog, test_table_identifier, partition_spec=spec) replaced = catalog.replace_table(test_table_identifier, schema=schema, partition_spec=spec) assert [s.spec_id for s in replaced.metadata.partition_specs] == [0] assert replaced.metadata.default_spec_id == 0 -def test_replace_table_with_new_location(catalog: Catalog, test_table_identifier: Identifier, tmp_path: Path) -> None: - _, schema = _create_simple_table(catalog, test_table_identifier) - new_location = f"file://{tmp_path}/relocated" - replaced = catalog.replace_table(test_table_identifier, schema=schema, location=new_location) - assert replaced.metadata.location == new_location - - -def test_replace_table_defaults_to_existing_location(catalog: Catalog, test_table_identifier: Identifier) -> None: +@pytest.mark.parametrize( + "location_kwarg, expected", + [ + ("explicit", "explicit"), # explicit location is used verbatim (minus any trailing slash) + (None, "existing"), # no location → existing location is preserved + ("explicit-with-trailing-slash", "explicit-with-trailing-slash-stripped"), + ], + ids=["explicit", "inherited", "trailing-slash-stripped"], +) +def test_replace_table_location_resolution( + catalog: Catalog, + test_table_identifier: Identifier, + tmp_path: Path, + location_kwarg: str | None, + expected: str, +) -> None: + """`location=None` keeps the existing location; an explicit location is used (sans trailing slash).""" _, schema = _create_simple_table(catalog, test_table_identifier) - existing_location = catalog.load_table(test_table_identifier).metadata.location - replaced = catalog.replace_table(test_table_identifier, schema=schema) - assert replaced.metadata.location == existing_location - - -def test_replace_table_merges_properties(catalog: Catalog, test_table_identifier: Identifier) -> None: - namespace = Catalog.namespace_from(test_table_identifier) - catalog.create_namespace(namespace) + existing = catalog.load_table(test_table_identifier).metadata.location + + if location_kwarg is None: + replaced = catalog.replace_table(test_table_identifier, schema=schema) + assert replaced.metadata.location == existing + elif location_kwarg == "explicit": + new_location = f"file://{tmp_path}/relocated" + replaced = catalog.replace_table(test_table_identifier, schema=schema, location=new_location) + assert replaced.metadata.location == new_location + else: + new_location = f"file://{tmp_path}/with-slash" + replaced = catalog.replace_table(test_table_identifier, schema=schema, location=new_location + "/") + assert replaced.metadata.location == new_location + + +def test_replace_table_merges_properties_with_overrides_and_additions( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + """Properties are merged onto existing: new values override, untouched keys are preserved.""" schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)) - catalog.create_table(test_table_identifier, schema=schema, properties={"keep": "yes", "override": "old"}) - replaced = catalog.replace_table(test_table_identifier, schema=schema, properties={"override": "new", "new_key": "v"}) + _create_simple_table(catalog, test_table_identifier, schema=schema, properties={"keep": "yes", "override": "old"}) + replaced = catalog.replace_table( + test_table_identifier, schema=schema, properties={"override": "new", "new_key": "v"} + ) assert replaced.properties["keep"] == "yes" assert replaced.properties["override"] == "new" assert replaced.properties["new_key"] == "v" def test_replace_table_upgrades_format_version(catalog: Catalog, test_table_identifier: Identifier) -> None: + """`properties={'format-version': '2'}` on a v1 table emits an UpgradeFormatVersion update.""" _, schema = _create_simple_table(catalog, test_table_identifier, format_version=1) assert catalog.load_table(test_table_identifier).format_version == 1 replaced = catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "2"}) assert replaced.format_version == 2 +def test_replace_table_rejects_format_version_downgrade( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + """A `format-version` lower than the existing one must be rejected to avoid silently + running the schema-conversion path with the wrong semantics.""" + _, schema = _create_simple_table(catalog, test_table_identifier, format_version=2) + with pytest.raises(ValueError, match="Cannot downgrade format-version"): + catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "1"}) + + def test_replace_table_v1_carries_forward_partition_fields_as_void( catalog: Catalog, test_table_identifier: Identifier ) -> None: - """v1 specs are append-only; dropped partition fields must be preserved with VoidTransform.""" - namespace = Catalog.namespace_from(test_table_identifier) - catalog.create_namespace(namespace) - schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) + """v1 specs are append-only; dropped partition fields must be carried forward as VoidTransform.""" spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) - catalog.create_table(test_table_identifier, schema=schema, partition_spec=spec, properties={"format-version": "1"}) + _, schema = _create_simple_table(catalog, test_table_identifier, partition_spec=spec, format_version=1) - replaced = catalog.replace_table(test_table_identifier, schema=schema) # default unpartitioned + # Replace with default unpartitioned spec — the old field must be preserved as void. + replaced = catalog.replace_table(test_table_identifier, schema=schema) new_spec = replaced.spec() - # The "id_part" field is carried forward (under a possibly-renamed identity), now as void. void_field = next(f for f in new_spec.fields if f.field_id == 1000) assert isinstance(void_field.transform, VoidTransform) assert void_field.source_id == 1 + # Name is preserved when there's no collision in the new spec (the new spec is empty here). + assert void_field.name == "id_part" def test_replace_table_raises_when_table_does_not_exist(catalog: Catalog, test_table_identifier: Identifier) -> None: @@ -529,6 +605,7 @@ def test_replace_table_raises_when_table_does_not_exist(catalog: Catalog, test_t def test_replace_table_transaction_can_stage_additional_changes( catalog: Catalog, test_table_identifier: Identifier ) -> None: + """The transaction context lets callers stage extra updates (e.g. properties) before commit.""" _, schema = _create_simple_table(catalog, test_table_identifier) with catalog.replace_table_transaction(test_table_identifier, schema=schema) as txn: txn.set_properties({"staged": "yes"}) @@ -539,19 +616,14 @@ def test_replace_table_transaction_can_stage_additional_changes( def test_replace_table_transaction_with_write_atomic_rtas( catalog: Catalog, test_table_identifier: Identifier ) -> None: - """RTAS (Replace Table As Select): replace the table and write new data in one transaction. + """RTAS (Replace Table As Select): replace + write new data in one transaction. - Verifies the primary use case for `replace_table_transaction`: the new schema and the new - data land atomically, the new snapshot becomes the current snapshot (main ref is restored - on commit because the transaction emits a fast-append), and the old snapshot is preserved - in history.""" + The new schema and new data must land atomically. After commit: + - the new snapshot is current (main ref restored by the fast_append), + - the new snapshot's parent is None (no lineage to the pre-replace data), and + - the pre-replace snapshot is preserved in history for time-travel.""" _create_simple_table(catalog, test_table_identifier) - original_table = catalog.load_table(test_table_identifier) - old_data = pa.Table.from_pydict( - {"id": [1], "data": ["old"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), - ) - original_table.append(old_data) + catalog.load_table(test_table_identifier).append(_simple_data(num_rows=1)) old_snapshot_id = catalog.load_table(test_table_identifier).current_snapshot().snapshot_id # type: ignore[union-attr] new_schema = Schema( @@ -568,31 +640,71 @@ def test_replace_table_transaction_with_write_atomic_rtas( snap.append_data_file(data_file) replaced = catalog.load_table(test_table_identifier) - # Atomically: new schema is current, new snapshot is current, old snapshot is in history. - assert replaced.current_snapshot() is not None - assert replaced.current_snapshot().snapshot_id != old_snapshot_id # type: ignore[union-attr] + new_snapshot = replaced.current_snapshot() + assert new_snapshot is not None + assert new_snapshot.snapshot_id != old_snapshot_id + assert new_snapshot.parent_snapshot_id is None, "fresh start: new snapshot must not inherit old lineage" assert any(s.snapshot_id == old_snapshot_id for s in replaced.metadata.snapshots) assert {f.name for f in replaced.schema().fields} == {"id", "name"} - # The new snapshot reflects the new data only — the old "data" column's row is gone from - # the active view (still in history). assert replaced.scan().to_arrow().num_rows == 2 -def test_replace_table_followed_by_separate_append( +def test_replace_table_transaction_rolls_back_on_failure( catalog: Catalog, test_table_identifier: Identifier ) -> None: - """`replace_table` clears the current snapshot; a subsequent `append` makes a new one current.""" + """If the body of the transaction raises, no metadata change is committed. + + The table's UUID, schemas, current snapshot, and current schema id must all be unchanged.""" + _create_simple_table(catalog, test_table_identifier) + catalog.load_table(test_table_identifier).append(_simple_data()) + before = catalog.load_table(test_table_identifier).metadata + + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), + ) + with pytest.raises(RuntimeError, match="simulated failure inside replace transaction"): + with catalog.replace_table_transaction(test_table_identifier, schema=new_schema): + raise RuntimeError("simulated failure inside replace transaction") + + after = catalog.load_table(test_table_identifier).metadata + assert after.table_uuid == before.table_uuid + assert after.current_snapshot_id == before.current_snapshot_id + assert after.current_schema_id == before.current_schema_id + assert len(after.schemas) == len(before.schemas) + + +def test_concurrent_replace_table(catalog: Catalog, test_table_identifier: Identifier) -> None: + """Two replace_table calls staged from the same base metadata cannot both commit. + + The first replace updates the catalog pointer to a new metadata location; the second + replace was built against the now-stale base, so its commit must be rejected with + `CommitFailedException`.""" _, schema = _create_simple_table(catalog, test_table_identifier) - catalog.load_table(test_table_identifier).append( - pa.Table.from_pydict( - {"id": [1], "data": ["x"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), - ) + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), ) + # Both transactions are built from the same base metadata. + txn_a = catalog.replace_table_transaction(test_table_identifier, schema=new_schema) + txn_b = catalog.replace_table_transaction(test_table_identifier, schema=schema) - replaced = catalog.replace_table(test_table_identifier, schema=schema) - assert replaced.current_snapshot() is None + txn_a.commit_transaction() + with pytest.raises(CommitFailedException): + txn_b.commit_transaction() + +def test_replace_table_allows_subsequent_append( + catalog: Catalog, test_table_identifier: Identifier +) -> None: + """After `replace_table` clears the current snapshot, a separate `append` produces a new + snapshot containing only the post-replace data — the pre-replace rows are not visible.""" + _, schema = _create_simple_table(catalog, test_table_identifier) + catalog.load_table(test_table_identifier).append(_simple_data(num_rows=3)) + + replaced = catalog.replace_table(test_table_identifier, schema=schema) replaced.append( pa.Table.from_pydict( {"id": [42], "data": ["after-replace"]}, @@ -600,8 +712,7 @@ def test_replace_table_followed_by_separate_append( ) ) after = catalog.load_table(test_table_identifier) - assert after.current_snapshot() is not None - assert after.scan().to_arrow().num_rows == 1 # Only the post-replace row is visible. + assert after.scan().to_arrow().num_rows == 1 # Rename table tests diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 65874bcfcb..12519ea451 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -2962,16 +2962,26 @@ def test_replace_table_transaction_wire_payload( catalog.replace_table_transaction(identifier=("fokko", "fokko2"), schema=new_schema).commit_transaction() request = rest_mock.last_request.json() - assert request["requirements"] == [{"type": "assert-table-uuid", "uuid": table_uuid}] + assert request["requirements"] == [ + {"type": "assert-table-uuid", "uuid": table_uuid}, + {"type": "assert-last-assigned-field-id", "last-assigned-field-id": 2}, + {"type": "assert-last-assigned-partition-id", "last-assigned-partition-id": 999}, + ] + # No duplicate actions in the payload (the dict-by-action collapses them — guard against it). + actions = [u["action"] for u in request["updates"]] + assert len(actions) == len(set(actions)), f"duplicate actions in request: {actions}" updates_by_action = {u["action"]: u for u in request["updates"]} + # main snapshot must be cleared on replace. assert updates_by_action["remove-snapshot-ref"] == {"action": "remove-snapshot-ref", "ref-name": "main"} # A new schema is added because the column set differs; field IDs for existing columns - # are reused by name. + # are reused by name. The highest id (3) flows from the schema's fields, which the + # server uses to bump the table's last-column-id on the server side. added_schema = updates_by_action["add-schema"]["schema"] assert {f["name"]: f["id"] for f in added_schema["fields"]} == {"id": 1, "data": 2, "new_col": 3} - # Set* updates are emitted unconditionally (matches Java's RESTSessionCatalog). + # Set* updates are emitted unconditionally even when their resulting id is reused. + # schema-id=-1 is the wire sentinel meaning "the schema we just added in this commit". assert updates_by_action["set-current-schema"]["schema-id"] == -1 assert "set-default-spec" in updates_by_action assert "set-default-sort-order" in updates_by_action @@ -3004,7 +3014,7 @@ def test_replace_table_transaction_rejects_view_collision( example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: - """A view at the same identifier must fail replace, matching Java's RESTSessionCatalog.""" + """A view at the same identifier must fail replace.""" _mock_replace_endpoints( rest_mock, "fokko", @@ -3021,13 +3031,14 @@ def test_replace_table_transaction_rejects_view_collision( ) -def test_replace_table_commits_immediately( +def test_replace_table_issues_commit_post_immediately( rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: - """`replace_table` is `replace_table_transaction(...).commit_transaction()` — verify - it produces an HTTP commit, not just a staged transaction.""" + """`replace_table` issues a commit POST during the call — distinguishing it from + `replace_table_transaction(...)` alone, which only issues the load GET (no POST until + the caller commits the transaction).""" table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid _mock_replace_endpoints( @@ -3038,16 +3049,20 @@ def test_replace_table_commits_immediately( example_table_metadata_no_snapshot_v1_rest_json, ) catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - - replaced = catalog.replace_table( - identifier=("fokko", "fokko2"), - schema=Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ), + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), ) + + # Baseline: opening the transaction alone makes the HEAD (view-exists) + GET (load) calls + # but no POST until the caller commits. + catalog.replace_table_transaction(identifier=("fokko", "fokko2"), schema=new_schema) + methods_after_open = [r.method for r in rest_mock.request_history] + assert "POST" not in methods_after_open + + replaced = catalog.replace_table(identifier=("fokko", "fokko2"), schema=new_schema) assert replaced.metadata.table_uuid == uuid.UUID(table_uuid) - # The last request should be the commit POST (not the GET load_table). - assert rest_mock.last_request.method == "POST" + methods_after_replace = [r.method for r in rest_mock.request_history] + assert "POST" in methods_after_replace, "replace_table must commit immediately" diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index e7a4c5fc00..d089c1b740 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -36,6 +36,7 @@ IdentityTransform, MonthTransform, TruncateTransform, + VoidTransform, YearTransform, ) from pyiceberg.typedef import Record @@ -305,48 +306,83 @@ def test_incompatible_transform_source_type() -> None: assert "Invalid source field foo with type int for transform: year" in str(exc.value) -def test_assign_fresh_partition_spec_ids_for_replace_reuses_ids() -> None: - """Test that partition field IDs are reused from existing specs.""" - old_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - fresh_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - existing_specs = [ - PartitionSpec( - PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), - spec_id=0, - ) - ] - spec = PartitionSpec( - PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id"), - spec_id=0, +_REPLACE_SCHEMA_FOR_PARTITION = Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), +) + + +@pytest.mark.parametrize( + "existing_specs, last_partition_id, expected_field_id, expected_last_partition_id", + [ + # Reuse: same (source_id, transform) already in an existing spec → reuse its field_id. + ( + [PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), spec_id=0)], + 1000, + 1000, + 1000, + ), + # New: no matching (source_id, transform) → fresh id above last_partition_id. + ([PartitionSpec(spec_id=0)], 999, 1000, 1000), + ], + ids=["reuse-from-existing-spec", "new-field-above-last-partition-id"], +) +def test_assign_fresh_partition_spec_ids_for_replace_v2( + existing_specs: list[PartitionSpec], + last_partition_id: int, + expected_field_id: int, + expected_last_partition_id: int, +) -> None: + spec = PartitionSpec(PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id")) + fresh_spec, new_last_pid = assign_fresh_partition_spec_ids_for_replace( + spec, _REPLACE_SCHEMA_FOR_PARTITION, _REPLACE_SCHEMA_FOR_PARTITION, existing_specs, last_partition_id ) - fresh_spec, last_pid = assign_fresh_partition_spec_ids_for_replace(spec, old_schema, fresh_schema, existing_specs, 1000) - assert fresh_spec.fields[0].field_id == 1000 # reused from existing spec - assert last_pid == 1000 + assert fresh_spec.fields[0].field_id == expected_field_id + assert new_last_pid == expected_last_partition_id -def test_assign_fresh_partition_spec_ids_for_replace_new_field() -> None: - """Test that new partition fields get IDs above last_partition_id.""" - old_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), +def test_assign_fresh_partition_spec_ids_for_replace_v1_carries_forward_as_void() -> None: + """v1 specs are append-only: a field absent from the new spec is carried forward as void.""" + current_spec = PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), spec_id=0 ) - fresh_schema = Schema( - NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), + # New spec drops "id" entirely, partitioned by "data" instead. + new_spec = PartitionSpec(PartitionField(source_id=2, field_id=999, transform=IdentityTransform(), name="data")) + fresh_spec, new_last_pid = assign_fresh_partition_spec_ids_for_replace( + new_spec, + _REPLACE_SCHEMA_FOR_PARTITION, + _REPLACE_SCHEMA_FOR_PARTITION, + existing_specs=[current_spec], + last_partition_id=1000, + format_version=1, + current_spec=current_spec, ) - existing_specs = [ - PartitionSpec(spec_id=0) # unpartitioned - ] - spec = PartitionSpec( - PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id"), - spec_id=0, + # Two fields: the carried-forward void at field_id=1000, and the new "data" field above it. + fields_by_id = {f.field_id: f for f in fresh_spec.fields} + assert isinstance(fields_by_id[1000].transform, VoidTransform) + assert fields_by_id[1000].name == "id" + assert fields_by_id[1001].name == "data" + assert isinstance(fields_by_id[1001].transform, IdentityTransform) + assert new_last_pid == 1001 + + +def test_assign_fresh_partition_spec_ids_for_replace_v1_renames_void_on_name_collision() -> None: + """When a void field's name collides with a new field's name, a unique suffix is added.""" + current_spec = PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="data"), spec_id=0 + ) + # New spec partitions "data" by a different transform — the OLD "data" must be voided + # under a different name to avoid collision with the NEW "data" partition. + new_spec = PartitionSpec(PartitionField(source_id=2, field_id=999, transform=IdentityTransform(), name="data")) + fresh_spec, _ = assign_fresh_partition_spec_ids_for_replace( + new_spec, + _REPLACE_SCHEMA_FOR_PARTITION, + _REPLACE_SCHEMA_FOR_PARTITION, + existing_specs=[current_spec], + last_partition_id=1000, + format_version=1, + current_spec=current_spec, ) - fresh_spec, last_pid = assign_fresh_partition_spec_ids_for_replace(spec, old_schema, fresh_schema, existing_specs, 999) - assert fresh_spec.fields[0].field_id == 1000 # new, above last_partition_id=999 - assert last_pid == 1000 + void_field = next(f for f in fresh_spec.fields if isinstance(f.transform, VoidTransform)) + assert void_field.name != "data", "void name must not collide with active partition name" + assert void_field.name == "data_1000" diff --git a/tests/test_schema.py b/tests/test_schema.py index 825713d83a..b64fd34f23 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -1828,7 +1828,11 @@ def test_check_schema_compatible_optional_map_field_present() -> None: # No column names match: all fresh IDs starting from last_column_id + 1. ([("x", IntegerType()), ("y", IntegerType())], [3, 4], 4), ], - ids=["all-reused", "mixed", "none-reused"], + ids=[ + "all-reused-keeps-last-col-id", + "new-field-bumps-last-col-id", + "no-name-overlap-bumps-from-base", + ], ) def test_assign_fresh_schema_ids_for_replace_primitive_fields( new_fields: list[tuple[str, IcebergType]], expected_ids: list[int], expected_last_col_id: int From b4d76c1dba6c7c39dce4bf2825dc8c2582a976bc Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 19:59:48 +0100 Subject: [PATCH 05/24] Address review feedback (round 2) - Drop view-collision pre-flight check; deferring to a follow-up PR that ports the check across create/replace/rename in one pass. - Simplify RTAS tests + docs to use txn.append(df) instead of the verbose update_snapshot().fast_append() + _dataframe_to_data_files pattern. - Drop parallel new_schema construction in RTAS tests; pass df.schema directly (replace_table* accepts Schema | pa.Schema). - Reword docs intro per reviewer suggestion. - Run formatter / linter; trailing whitespace and trailing newline fix. - mypy: cast format_version to TableVersion; extract inner with into helper to silence unreachable warning under pytest.raises. --- mkdocs/docs/api.md | 25 ++++--------- pyiceberg/catalog/__init__.py | 17 +-------- pyiceberg/catalog/rest/__init__.py | 11 +----- tests/catalog/test_catalog_behaviors.py | 48 ++++++++----------------- tests/catalog/test_rest.py | 32 +---------------- tests/integration/test_rest_catalog.py | 48 +++++++------------------ tests/table/test_partitioning.py | 4 +-- tests/test_schema.py | 2 -- 8 files changed, 38 insertions(+), 149 deletions(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index fb43851fe9..8513c60158 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -187,41 +187,28 @@ with catalog.create_table_transaction(identifier="docs_example.bids", schema=sch ## Replace a table -Atomically replace an existing table's schema, partition spec, sort order, location, and properties. The table UUID and history (snapshots, schemas, specs, sort orders, metadata log) are preserved; the current snapshot is cleared (the `main` branch ref is removed). Use this when you want to redefine the table's metadata; pair it with `replace_table_transaction` to atomically write new data alongside the metadata change (RTAS-style). +Atomically replace an existing table's schema, partition spec, sort order, location, and properties. The table UUID and history (snapshots, schemas, specs, sort orders, metadata log) are preserved; the current snapshot is cleared (the `main` branch ref is removed). `replace_table` redefines the table in this way; `replace_table_transaction` lets you write new data alongside this change to permit RTAS (replace-table-as-select) workflows. ```python -from pyiceberg.schema import Schema -from pyiceberg.types import NestedField, LongType, StringType, BooleanType - -new_schema = Schema( - NestedField(field_id=1, name="datetime", field_type=LongType(), required=False), - NestedField(field_id=2, name="symbol", field_type=StringType(), required=False), - NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), -) -catalog.replace_table( - identifier="docs_example.bids", - schema=new_schema, -) +catalog.replace_table(identifier="docs_example.bids", schema=df.schema) ``` -Field IDs from columns whose names appear in the previous schema are reused, so existing data files remain readable when the new schema is a compatible superset. New columns get fresh IDs above `last-column-id`. +Where `df` is a PyArrow table (or `Schema`) carrying the new column set. Field IDs from columns whose names appear in the previous schema are reused, so existing data files remain readable when the new schema is a compatible superset. New columns get fresh IDs above `last-column-id`. Properties passed to `replace_table` are **merged** with the existing table properties (your values override; existing keys you don't pass are preserved). To remove a property as part of the replace, use `replace_table_transaction` and remove it explicitly within the transaction. Use `replace_table_transaction` to stage additional changes (writes, property updates, schema evolution) before committing — for example, swap the schema and write new data atomically: ```python -with catalog.replace_table_transaction(identifier="docs_example.bids", schema=new_schema) as txn: - with txn.update_snapshot().fast_append() as snap: - for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=df, io=txn._table.io): - snap.append_data_file(data_file) +with catalog.replace_table_transaction(identifier="docs_example.bids", schema=df.schema) as txn: + txn.append(df) txn.set_properties(write_replaced_at="2026-04-19T00:00:00Z") ``` To upgrade the table's format version as part of the replace, pass `format-version` in `properties`: ```python -catalog.replace_table(identifier="docs_example.bids", schema=new_schema, properties={"format-version": "2"}) +catalog.replace_table(identifier="docs_example.bids", schema=df.schema, properties={"format-version": "2"}) ``` ## Register a table diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index ae79fa8ade..f11cfe83df 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -328,20 +328,6 @@ def delete_data_files(io: FileIO, manifests_to_delete: list[ManifestFile]) -> No deleted_files[path] = True -def _raise_if_view_exists(catalog: Catalog, identifier: str | Identifier) -> None: - """Raise TableAlreadyExistsError if a view exists at the same identifier. - - Catalogs that don't support views raise `NotImplementedError` from `view_exists` — - treat as "no view" in that case. - """ - try: - view_collision = catalog.view_exists(identifier) - except NotImplementedError: - view_collision = False - if view_collision: - raise TableAlreadyExistsError(f"View with same name already exists: {identifier}") - - def _import_catalog(name: str, catalog_impl: str, properties: Properties) -> Catalog | None: try: path_parts = catalog_impl.split(".") @@ -564,7 +550,7 @@ def _replace_staged_table( resolved_format_version = ( int(requested_format_version) if requested_format_version is not None else existing_metadata.format_version ) - iceberg_schema = self._convert_schema_if_needed(schema, resolved_format_version) + iceberg_schema = self._convert_schema_if_needed(schema, cast(TableVersion, resolved_format_version)) fresh_schema, _ = assign_fresh_schema_ids_for_replace( iceberg_schema, existing_metadata.schema(), existing_metadata.last_column_id @@ -1083,7 +1069,6 @@ def replace_table_transaction( sort_order: SortOrder = UNSORTED_SORT_ORDER, properties: Properties = EMPTY_DICT, ) -> ReplaceTableTransaction: - _raise_if_view_exists(self, identifier) staged_table, fresh_schema, fresh_spec, fresh_sort_order, resolved_location = self._replace_staged_table( identifier, schema, location, partition_spec, sort_order, properties ) diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index 0ef4c809f2..072b9f2034 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -30,15 +30,7 @@ from typing_extensions import override from pyiceberg import __version__ -from pyiceberg.catalog import ( - BOTOCORE_SESSION, - TOKEN, - URI, - WAREHOUSE_LOCATION, - Catalog, - PropertiesUpdateSummary, - _raise_if_view_exists, -) +from pyiceberg.catalog import BOTOCORE_SESSION, TOKEN, URI, WAREHOUSE_LOCATION, Catalog, PropertiesUpdateSummary from pyiceberg.catalog.rest.auth import AUTH_MANAGER, AuthManager, AuthManagerAdapter, AuthManagerFactory, LegacyOAuth2AuthManager from pyiceberg.catalog.rest.response import _handle_non_200_response from pyiceberg.catalog.rest.scan_planning import ( @@ -977,7 +969,6 @@ def replace_table_transaction( sort_order: SortOrder = UNSORTED_SORT_ORDER, properties: Properties = EMPTY_DICT, ) -> ReplaceTableTransaction: - _raise_if_view_exists(self, identifier) staged_table, fresh_schema, fresh_spec, fresh_sort_order, resolved_location = self._replace_staged_table( identifier, schema, location, partition_spec, sort_order, properties ) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index ddc0135e62..c3f8c77157 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -419,9 +419,7 @@ def _simple_data(num_rows: int = 2) -> pa.Table: ) -def test_replace_table_preserves_uuid_and_clears_current_snapshot( - catalog: Catalog, test_table_identifier: Identifier -) -> None: +def test_replace_table_preserves_uuid_and_clears_current_snapshot(catalog: Catalog, test_table_identifier: Identifier) -> None: """Replace preserves table_uuid and snapshot history, but clears the main ref.""" _create_simple_table(catalog, test_table_identifier) original = catalog.load_table(test_table_identifier) @@ -553,9 +551,7 @@ def test_replace_table_merges_properties_with_overrides_and_additions( """Properties are merged onto existing: new values override, untouched keys are preserved.""" schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)) _create_simple_table(catalog, test_table_identifier, schema=schema, properties={"keep": "yes", "override": "old"}) - replaced = catalog.replace_table( - test_table_identifier, schema=schema, properties={"override": "new", "new_key": "v"} - ) + replaced = catalog.replace_table(test_table_identifier, schema=schema, properties={"override": "new", "new_key": "v"}) assert replaced.properties["keep"] == "yes" assert replaced.properties["override"] == "new" assert replaced.properties["new_key"] == "v" @@ -569,9 +565,7 @@ def test_replace_table_upgrades_format_version(catalog: Catalog, test_table_iden assert replaced.format_version == 2 -def test_replace_table_rejects_format_version_downgrade( - catalog: Catalog, test_table_identifier: Identifier -) -> None: +def test_replace_table_rejects_format_version_downgrade(catalog: Catalog, test_table_identifier: Identifier) -> None: """A `format-version` lower than the existing one must be rejected to avoid silently running the schema-conversion path with the wrong semantics.""" _, schema = _create_simple_table(catalog, test_table_identifier, format_version=2) @@ -579,9 +573,7 @@ def test_replace_table_rejects_format_version_downgrade( catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "1"}) -def test_replace_table_v1_carries_forward_partition_fields_as_void( - catalog: Catalog, test_table_identifier: Identifier -) -> None: +def test_replace_table_v1_carries_forward_partition_fields_as_void(catalog: Catalog, test_table_identifier: Identifier) -> None: """v1 specs are append-only; dropped partition fields must be carried forward as VoidTransform.""" spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) _, schema = _create_simple_table(catalog, test_table_identifier, partition_spec=spec, format_version=1) @@ -602,9 +594,7 @@ def test_replace_table_raises_when_table_does_not_exist(catalog: Catalog, test_t catalog.replace_table(test_table_identifier, schema=schema) -def test_replace_table_transaction_can_stage_additional_changes( - catalog: Catalog, test_table_identifier: Identifier -) -> None: +def test_replace_table_transaction_can_stage_additional_changes(catalog: Catalog, test_table_identifier: Identifier) -> None: """The transaction context lets callers stage extra updates (e.g. properties) before commit.""" _, schema = _create_simple_table(catalog, test_table_identifier) with catalog.replace_table_transaction(test_table_identifier, schema=schema) as txn: @@ -613,9 +603,7 @@ def test_replace_table_transaction_can_stage_additional_changes( assert replaced.properties.get("staged") == "yes" -def test_replace_table_transaction_with_write_atomic_rtas( - catalog: Catalog, test_table_identifier: Identifier -) -> None: +def test_replace_table_transaction_with_write_atomic_rtas(catalog: Catalog, test_table_identifier: Identifier) -> None: """RTAS (Replace Table As Select): replace + write new data in one transaction. The new schema and new data must land atomically. After commit: @@ -626,18 +614,12 @@ def test_replace_table_transaction_with_write_atomic_rtas( catalog.load_table(test_table_identifier).append(_simple_data(num_rows=1)) old_snapshot_id = catalog.load_table(test_table_identifier).current_snapshot().snapshot_id # type: ignore[union-attr] - new_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="name", field_type=StringType(), required=False), - ) new_data = pa.Table.from_pydict( {"id": [10, 20], "name": ["alice", "bob"]}, schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string())]), ) - with catalog.replace_table_transaction(test_table_identifier, schema=new_schema) as txn: - with txn.update_snapshot().fast_append() as snap: - for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=new_data, io=txn._table.io): - snap.append_data_file(data_file) + with catalog.replace_table_transaction(test_table_identifier, schema=new_data.schema) as txn: + txn.append(new_data) replaced = catalog.load_table(test_table_identifier) new_snapshot = replaced.current_snapshot() @@ -649,9 +631,7 @@ def test_replace_table_transaction_with_write_atomic_rtas( assert replaced.scan().to_arrow().num_rows == 2 -def test_replace_table_transaction_rolls_back_on_failure( - catalog: Catalog, test_table_identifier: Identifier -) -> None: +def test_replace_table_transaction_rolls_back_on_failure(catalog: Catalog, test_table_identifier: Identifier) -> None: """If the body of the transaction raises, no metadata change is committed. The table's UUID, schemas, current snapshot, and current schema id must all be unchanged.""" @@ -664,10 +644,14 @@ def test_replace_table_transaction_rolls_back_on_failure( NestedField(field_id=2, name="data", field_type=StringType(), required=False), NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), ) - with pytest.raises(RuntimeError, match="simulated failure inside replace transaction"): + + def run_failing_replace() -> None: with catalog.replace_table_transaction(test_table_identifier, schema=new_schema): raise RuntimeError("simulated failure inside replace transaction") + with pytest.raises(RuntimeError, match="simulated failure inside replace transaction"): + run_failing_replace() + after = catalog.load_table(test_table_identifier).metadata assert after.table_uuid == before.table_uuid assert after.current_snapshot_id == before.current_snapshot_id @@ -696,9 +680,7 @@ def test_concurrent_replace_table(catalog: Catalog, test_table_identifier: Ident txn_b.commit_transaction() -def test_replace_table_allows_subsequent_append( - catalog: Catalog, test_table_identifier: Identifier -) -> None: +def test_replace_table_allows_subsequent_append(catalog: Catalog, test_table_identifier: Identifier) -> None: """After `replace_table` clears the current snapshot, a separate `append` produces a new snapshot containing only the post-replace data — the pre-replace rows are not visible.""" _, schema = _create_simple_table(catalog, test_table_identifier) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 12519ea451..c979c45a43 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -2906,7 +2906,7 @@ def test_load_table_without_storage_credentials( # Catalog-agnostic behavior (UUID preservation, snapshot clearing, schema/spec/sort-order reuse, # format-version upgrade, etc.) is covered in tests/catalog/test_catalog_behaviors.py against # both InMemoryCatalog and SqlCatalog. The tests below cover REST-specific concerns: the wire -# payload sent to the server, 404 handling, and the view-collision pre-flight check. +# payload sent to the server and 404 handling. def _mock_replace_endpoints( @@ -2915,13 +2915,7 @@ def _mock_replace_endpoints( table: str, load_response: dict[str, Any], commit_response: dict[str, Any], - view_exists: bool = False, ) -> None: - rest_mock.head( - f"{TEST_URI}v1/namespaces/{namespace}/views/{table}", - status_code=204 if view_exists else 404, - request_headers=TEST_HEADERS, - ) rest_mock.get( f"{TEST_URI}v1/namespaces/{namespace}/tables/{table}", json=load_response, @@ -3009,28 +3003,6 @@ def test_replace_table_transaction_404_raises( ) -def test_replace_table_transaction_rejects_view_collision( - rest_mock: Mocker, - example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], -) -> None: - """A view at the same identifier must fail replace.""" - _mock_replace_endpoints( - rest_mock, - "fokko", - "fokko2", - example_table_metadata_with_snapshot_v1_rest_json, - example_table_metadata_no_snapshot_v1_rest_json, - view_exists=True, - ) - catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) - with pytest.raises(TableAlreadyExistsError, match="View with same name already exists"): - catalog.replace_table_transaction( - identifier=("fokko", "fokko2"), - schema=Schema(NestedField(field_id=1, name="id", field_type=IntegerType(), required=False)), - ) - - def test_replace_table_issues_commit_post_immediately( rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], @@ -3064,5 +3036,3 @@ def test_replace_table_issues_commit_post_immediately( assert replaced.metadata.table_uuid == uuid.UUID(table_uuid) methods_after_replace = [r.method for r in rest_mock.request_history] assert "POST" in methods_after_replace, "replace_table must commit immediately" - - diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py index 73e23dfd57..7e66c4dcf4 100644 --- a/tests/integration/test_rest_catalog.py +++ b/tests/integration/test_rest_catalog.py @@ -25,9 +25,7 @@ from pyiceberg.catalog import Catalog from pyiceberg.catalog.rest import RestCatalog from pyiceberg.exceptions import NoSuchViewError -from pyiceberg.io.pyarrow import _dataframe_to_data_files from pyiceberg.schema import Schema -from pyiceberg.types import BooleanType, LongType, NestedField, StringType from pyiceberg.view.metadata import SQLViewRepresentation, ViewVersion TEST_NAMESPACE_IDENTIFIER = "TEST NS" @@ -87,29 +85,20 @@ def test_replace_table_end_to_end_against_rest_server(catalog: Catalog) -> None: if catalog.table_exists(identifier): catalog.drop_table(identifier) - original_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - original = catalog.create_table(identifier, schema=original_schema) - pa_table = pa.Table.from_pydict( {"id": [1, 2, 3], "data": ["a", "b", "c"]}, schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), ) - with original.transaction() as txn: - with txn.update_snapshot().fast_append() as snapshot_update: - for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table, io=original.io): - snapshot_update.append_data_file(data_file) + original = catalog.create_table(identifier, schema=pa_table.schema) + original.append(pa_table) original.refresh() original_snapshot_id = original.current_snapshot().snapshot_id # type: ignore[union-attr] - new_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="name", field_type=StringType(), required=False), - NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), + new_data = pa.Table.from_pydict( + {"id": [10], "name": ["alice"], "active": [True]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string()), pa.field("active", pa.bool_())]), ) - replaced = catalog.replace_table(identifier, schema=new_schema) + replaced = catalog.replace_table(identifier, schema=new_data.schema) assert replaced.metadata.table_uuid == original.metadata.table_uuid assert replaced.current_snapshot() is None @@ -128,31 +117,20 @@ def test_replace_table_transaction_rtas_against_rest_server(catalog: Catalog) -> if catalog.table_exists(identifier): catalog.drop_table(identifier) - original_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - original = catalog.create_table(identifier, schema=original_schema) - original.append( - pa.Table.from_pydict( - {"id": [1], "data": ["old"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), - ) + old_data = pa.Table.from_pydict( + {"id": [1], "data": ["old"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), ) + original = catalog.create_table(identifier, schema=old_data.schema) + original.append(old_data) old_snapshot_id = catalog.load_table(identifier).current_snapshot().snapshot_id # type: ignore[union-attr] - new_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="name", field_type=StringType(), required=False), - ) new_data = pa.Table.from_pydict( {"id": [10, 20], "name": ["alice", "bob"]}, schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string())]), ) - with catalog.replace_table_transaction(identifier, schema=new_schema) as txn: - with txn.update_snapshot().fast_append() as snap: - for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=new_data, io=txn._table.io): - snap.append_data_file(data_file) + with catalog.replace_table_transaction(identifier, schema=new_data.schema) as txn: + txn.append(new_data) replaced = catalog.load_table(identifier) assert replaced.current_snapshot() is not None diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index d089c1b740..144ed205ed 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -343,9 +343,7 @@ def test_assign_fresh_partition_spec_ids_for_replace_v2( def test_assign_fresh_partition_spec_ids_for_replace_v1_carries_forward_as_void() -> None: """v1 specs are append-only: a field absent from the new spec is carried forward as void.""" - current_spec = PartitionSpec( - PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), spec_id=0 - ) + current_spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), spec_id=0) # New spec drops "id" entirely, partitioned by "data" instead. new_spec = PartitionSpec(PartitionField(source_id=2, field_id=999, transform=IdentityTransform(), name="data")) fresh_spec, new_last_pid = assign_fresh_partition_spec_ids_for_replace( diff --git a/tests/test_schema.py b/tests/test_schema.py index b64fd34f23..4d5359cfee 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -1888,5 +1888,3 @@ def test_assign_fresh_schema_ids_for_replace_with_nested_struct() -> None: assert loc_fields[1].field_id == 4 # location.lon reused assert loc_fields[2].field_id == 5 # location.alt is new assert last_col_id == 5 - - From 1efa6fe9be097219b13e17f96919e1810703392c Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 20:01:55 +0100 Subject: [PATCH 06/24] Use manual Schema for plain replace_table, df.schema for RTAS Per reviewer feedback: bare replace_table examples and tests should construct an explicit Schema, since that's the natural user-facing API for DDL-only redefinition. RTAS flows keep df.schema since the data and schema are coupled there. --- mkdocs/docs/api.md | 14 +++++++++++--- tests/integration/test_rest_catalog.py | 25 ++++++++++++++++--------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 8513c60158..44b374191b 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -190,10 +190,18 @@ with catalog.create_table_transaction(identifier="docs_example.bids", schema=sch Atomically replace an existing table's schema, partition spec, sort order, location, and properties. The table UUID and history (snapshots, schemas, specs, sort orders, metadata log) are preserved; the current snapshot is cleared (the `main` branch ref is removed). `replace_table` redefines the table in this way; `replace_table_transaction` lets you write new data alongside this change to permit RTAS (replace-table-as-select) workflows. ```python -catalog.replace_table(identifier="docs_example.bids", schema=df.schema) +from pyiceberg.schema import Schema +from pyiceberg.types import NestedField, LongType, StringType, BooleanType + +new_schema = Schema( + NestedField(field_id=1, name="datetime", field_type=LongType(), required=False), + NestedField(field_id=2, name="symbol", field_type=StringType(), required=False), + NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), +) +catalog.replace_table(identifier="docs_example.bids", schema=new_schema) ``` -Where `df` is a PyArrow table (or `Schema`) carrying the new column set. Field IDs from columns whose names appear in the previous schema are reused, so existing data files remain readable when the new schema is a compatible superset. New columns get fresh IDs above `last-column-id`. +Field IDs from columns whose names appear in the previous schema are reused, so existing data files remain readable when the new schema is a compatible superset. New columns get fresh IDs above `last-column-id`. Properties passed to `replace_table` are **merged** with the existing table properties (your values override; existing keys you don't pass are preserved). To remove a property as part of the replace, use `replace_table_transaction` and remove it explicitly within the transaction. @@ -208,7 +216,7 @@ with catalog.replace_table_transaction(identifier="docs_example.bids", schema=df To upgrade the table's format version as part of the replace, pass `format-version` in `properties`: ```python -catalog.replace_table(identifier="docs_example.bids", schema=df.schema, properties={"format-version": "2"}) +catalog.replace_table(identifier="docs_example.bids", schema=new_schema, properties={"format-version": "2"}) ``` ## Register a table diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py index 7e66c4dcf4..14ecffd922 100644 --- a/tests/integration/test_rest_catalog.py +++ b/tests/integration/test_rest_catalog.py @@ -26,6 +26,7 @@ from pyiceberg.catalog.rest import RestCatalog from pyiceberg.exceptions import NoSuchViewError from pyiceberg.schema import Schema +from pyiceberg.types import BooleanType, LongType, NestedField, StringType from pyiceberg.view.metadata import SQLViewRepresentation, ViewVersion TEST_NAMESPACE_IDENTIFIER = "TEST NS" @@ -85,20 +86,26 @@ def test_replace_table_end_to_end_against_rest_server(catalog: Catalog) -> None: if catalog.table_exists(identifier): catalog.drop_table(identifier) - pa_table = pa.Table.from_pydict( - {"id": [1, 2, 3], "data": ["a", "b", "c"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + original_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + original = catalog.create_table(identifier, schema=original_schema) + original.append( + pa.Table.from_pydict( + {"id": [1, 2, 3], "data": ["a", "b", "c"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) ) - original = catalog.create_table(identifier, schema=pa_table.schema) - original.append(pa_table) original.refresh() original_snapshot_id = original.current_snapshot().snapshot_id # type: ignore[union-attr] - new_data = pa.Table.from_pydict( - {"id": [10], "name": ["alice"], "active": [True]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string()), pa.field("active", pa.bool_())]), + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="name", field_type=StringType(), required=False), + NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), ) - replaced = catalog.replace_table(identifier, schema=new_data.schema) + replaced = catalog.replace_table(identifier, schema=new_schema) assert replaced.metadata.table_uuid == original.metadata.table_uuid assert replaced.current_snapshot() is None From a813a92b2fbf2c314ac2a29e86999916dbfd9c33 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 20:16:30 +0100 Subject: [PATCH 07/24] Make replace_table_transaction @abstractmethod on Catalog Matches PR #498 (create_table_transaction) precedent. NoopCatalog gets an explicit stub since it extends Catalog directly. --- pyiceberg/catalog/__init__.py | 4 +--- pyiceberg/catalog/noop.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index f11cfe83df..0ad543d34d 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -479,12 +479,12 @@ def replace_table( Raises: NoSuchTableError: If the table does not exist. - TableAlreadyExistsError: If a view exists at the same identifier. """ return self.replace_table_transaction( identifier, schema, location, partition_spec, sort_order, properties ).commit_transaction() + @abstractmethod def replace_table_transaction( self, identifier: str | Identifier, @@ -515,9 +515,7 @@ def replace_table_transaction( Raises: NoSuchTableError: If the table does not exist. - TableAlreadyExistsError: If a view exists at the same identifier. """ - raise NotImplementedError("replace_table_transaction is not supported for this catalog type") def _replace_staged_table( self, diff --git a/pyiceberg/catalog/noop.py b/pyiceberg/catalog/noop.py index aeb3c72843..06348903af 100644 --- a/pyiceberg/catalog/noop.py +++ b/pyiceberg/catalog/noop.py @@ -28,6 +28,7 @@ from pyiceberg.table import ( CommitTableResponse, CreateTableTransaction, + ReplaceTableTransaction, Table, ) from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder @@ -68,6 +69,18 @@ def create_table_transaction( ) -> CreateTableTransaction: raise NotImplementedError + @override + def replace_table_transaction( + self, + identifier: str | Identifier, + schema: Schema | pa.Schema, + location: str | None = None, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + sort_order: SortOrder = UNSORTED_SORT_ORDER, + properties: Properties = EMPTY_DICT, + ) -> ReplaceTableTransaction: + raise NotImplementedError + @override def load_table(self, identifier: str | Identifier) -> Table: raise NotImplementedError From 5f1e0dc5677afa927967473ff24cb15d27a88277 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 20:18:40 +0100 Subject: [PATCH 08/24] Reframe ReplaceTableTransaction.commit_transaction docstring Per audit: clarify that we always emit AssertLastAssigned*Id, which is stricter than the reference contract that conditions them on Add* being present. Fail-safe vs corruption-risk. --- pyiceberg/table/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index de4a3fc4b3..27c2623519 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1132,13 +1132,18 @@ def _find_matching_sort_order_id(table_metadata: TableMetadata, sort_order: Sort def commit_transaction(self) -> Table: """Commit the replace changes to the catalog. - Requirements: + Always emits three optimistic-concurrency requirements: - `AssertTableUUID` — the table identity hasn't changed since load. - `AssertLastAssignedFieldId` — guards against a concurrent commit bumping `last-column-id` between load and commit (which would cause our newly-assigned field IDs to collide). - `AssertLastAssignedPartitionId` — same guard for partition field IDs. + Note: this is deliberately stricter than the reference REST `forReplaceTable` + contract, which conditions the `AssertLastAssigned*Id` requirements on + `AddSchema` / `AddPartitionSpec` being present in the change set. Emitting them + unconditionally is fail-safe — a no-op replace just confirms the assertions hold. + Returns: The table with the updates applied. """ From bafa06b488a0b693034de8a4235989d65b2d273e Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 20:20:30 +0100 Subject: [PATCH 09/24] Trim ReplaceTableTransaction.commit_transaction docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Matches the sibling commit_transaction signatures — the requirements list is visible in the code below; no need to enumerate it in prose. --- pyiceberg/table/__init__.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 27c2623519..77eca2ccb2 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1130,19 +1130,7 @@ def _find_matching_sort_order_id(table_metadata: TableMetadata, sort_order: Sort return None def commit_transaction(self) -> Table: - """Commit the replace changes to the catalog. - - Always emits three optimistic-concurrency requirements: - - `AssertTableUUID` — the table identity hasn't changed since load. - - `AssertLastAssignedFieldId` — guards against a concurrent commit bumping - `last-column-id` between load and commit (which would cause our newly-assigned - field IDs to collide). - - `AssertLastAssignedPartitionId` — same guard for partition field IDs. - - Note: this is deliberately stricter than the reference REST `forReplaceTable` - contract, which conditions the `AssertLastAssigned*Id` requirements on - `AddSchema` / `AddPartitionSpec` being present in the change set. Emitting them - unconditionally is fail-safe — a no-op replace just confirms the assertions hold. + """Commit the changes to the catalog. Returns: The table with the updates applied. From 1e6af57e672a779c68107c0a972bfaab6b19a1b0 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 20:30:43 +0100 Subject: [PATCH 10/24] Nits --- tests/catalog/test_rest.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index c979c45a43..3757f071e2 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -2901,14 +2901,6 @@ def test_load_table_without_storage_credentials( assert actual == expected -# replace_table tests -# -# Catalog-agnostic behavior (UUID preservation, snapshot clearing, schema/spec/sort-order reuse, -# format-version upgrade, etc.) is covered in tests/catalog/test_catalog_behaviors.py against -# both InMemoryCatalog and SqlCatalog. The tests below cover REST-specific concerns: the wire -# payload sent to the server and 404 handling. - - def _mock_replace_endpoints( rest_mock: Mocker, namespace: str, From 14a6e034a7668e31d23ea51722fb55ea488809fd Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 20:42:04 +0100 Subject: [PATCH 11/24] Drop made-up property example from RTAS docs snippet --- mkdocs/docs/api.md | 1 - 1 file changed, 1 deletion(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 44b374191b..2a3a4d6910 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -210,7 +210,6 @@ Use `replace_table_transaction` to stage additional changes (writes, property up ```python with catalog.replace_table_transaction(identifier="docs_example.bids", schema=df.schema) as txn: txn.append(df) - txn.set_properties(write_replaced_at="2026-04-19T00:00:00Z") ``` To upgrade the table's format version as part of the replace, pass `format-version` in `properties`: From eaba3ea2f69bc61ecad41ba3b1f50ed837680277 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 20:44:30 +0100 Subject: [PATCH 12/24] Drop stray '# Replace table tests' divider in test_catalog_behaviors --- tests/catalog/test_catalog_behaviors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index c3f8c77157..99228d412c 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -387,8 +387,6 @@ def test_load_table_from_self_identifier( assert table.metadata == loaded_table.metadata -# Replace table tests - _SIMPLE_SCHEMA = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), From a6200546d252e796d1a68b4884a032c17faaa78b Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 20:52:05 +0100 Subject: [PATCH 13/24] Drop extra blank line left by divider removal --- tests/catalog/test_catalog_behaviors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index 99228d412c..6e83d7bb40 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -387,7 +387,6 @@ def test_load_table_from_self_identifier( assert table.metadata == loaded_table.metadata - _SIMPLE_SCHEMA = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=False), From 0847d204ba078c47ae445baf96b33da10ac584a9 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 21:00:46 +0100 Subject: [PATCH 14/24] Apply test review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical: - test_concurrent_replace_table now uses identical new schemas across the two transactions and matches on 'last assigned field id' — this is the only path that actually validates AssertLastAssignedFieldId. Coverage: - Add test_replace_table_with_sort_order_changes: unsorted → sorted → unsorted with order_id reuse from history. - Extend the v2 partition-spec helper test with a reuse-by-bucket case (matching field_id under a renamed partition field). Brittleness fixes: - Drop dead HEAD mock in test_replace_table_transaction_404_raises; no view-exists pre-flight to mock anymore. - Pin wire-payload assertions to fixture metadata fields (no more magic 999 / spec_id / sort_order_id constants). - Tighten set-default-spec / set-default-sort-order to check the emitted id, not just existence. - Split test_replace_table_location_resolution into two non-stringly- discriminated tests. Removed: - test_replace_table_accepts_pyarrow_schema: redundant with the RTAS test which already exercises pa.Schema via df.schema. --- tests/catalog/test_catalog_behaviors.py | 86 ++++++++++++------------- tests/catalog/test_rest.py | 17 +++-- tests/table/test_partitioning.py | 37 ++++++++--- 3 files changed, 78 insertions(+), 62 deletions(-) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index 6e83d7bb40..338674c036 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -474,14 +474,6 @@ def test_replace_table_schema_id_reuse( assert replaced.metadata.last_column_id == expected_last_col_id -def test_replace_table_accepts_pyarrow_schema(catalog: Catalog, test_table_identifier: Identifier) -> None: - """The schema argument may be a PyArrow schema; it must be converted and have IDs assigned.""" - _create_simple_table(catalog, test_table_identifier) - pa_schema = pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string()), pa.field("extra", pa.bool_())]) - replaced = catalog.replace_table(test_table_identifier, schema=pa_schema) - assert {f.name for f in replaced.schema().fields} == {"id", "data", "extra"} - - def test_replace_table_preserves_identifier_field_ids(catalog: Catalog, test_table_identifier: Identifier) -> None: """Identifier-field IDs on the new schema are honored after reuse-by-name.""" schema = Schema( @@ -509,37 +501,42 @@ def test_replace_table_reuses_partition_spec_id(catalog: Catalog, test_table_ide assert replaced.metadata.default_spec_id == 0 -@pytest.mark.parametrize( - "location_kwarg, expected", - [ - ("explicit", "explicit"), # explicit location is used verbatim (minus any trailing slash) - (None, "existing"), # no location → existing location is preserved - ("explicit-with-trailing-slash", "explicit-with-trailing-slash-stripped"), - ], - ids=["explicit", "inherited", "trailing-slash-stripped"], -) -def test_replace_table_location_resolution( - catalog: Catalog, - test_table_identifier: Identifier, - tmp_path: Path, - location_kwarg: str | None, - expected: str, -) -> None: - """`location=None` keeps the existing location; an explicit location is used (sans trailing slash).""" +def test_replace_table_with_sort_order_changes(catalog: Catalog, test_table_identifier: Identifier) -> None: + """Replace can change the sort order. The new sort order is appended to the history and + becomes the default; a follow-up replace back to unsorted reuses the unsorted order_id + rather than appending a duplicate.""" + _, schema = _create_simple_table(catalog, test_table_identifier) + sort = SortOrder(SortField(source_id=1, transform=IdentityTransform(), direction=SortDirection.ASC)) + + # unsorted → sorted: a new order is added and becomes the default. + sorted_table = catalog.replace_table(test_table_identifier, schema=schema, sort_order=sort) + assert sorted_table.sort_order().fields == sort.fields + assert sorted_table.metadata.default_sort_order_id != 0 + + # sorted → unsorted: reuses the unsorted order_id 0 from history. + unsorted_table = catalog.replace_table(test_table_identifier, schema=schema) + assert unsorted_table.sort_order().is_unsorted + assert unsorted_table.metadata.default_sort_order_id == 0 + + +def test_replace_table_inherits_existing_location(catalog: Catalog, test_table_identifier: Identifier) -> None: + """`location=None` keeps the existing table's location.""" _, schema = _create_simple_table(catalog, test_table_identifier) existing = catalog.load_table(test_table_identifier).metadata.location + replaced = catalog.replace_table(test_table_identifier, schema=schema) + assert replaced.metadata.location == existing - if location_kwarg is None: - replaced = catalog.replace_table(test_table_identifier, schema=schema) - assert replaced.metadata.location == existing - elif location_kwarg == "explicit": - new_location = f"file://{tmp_path}/relocated" - replaced = catalog.replace_table(test_table_identifier, schema=schema, location=new_location) - assert replaced.metadata.location == new_location - else: - new_location = f"file://{tmp_path}/with-slash" - replaced = catalog.replace_table(test_table_identifier, schema=schema, location=new_location + "/") - assert replaced.metadata.location == new_location + +@pytest.mark.parametrize("trailing_slash", [False, True], ids=["no-slash", "trailing-slash"]) +def test_replace_table_uses_explicit_location( + catalog: Catalog, test_table_identifier: Identifier, tmp_path: Path, trailing_slash: bool +) -> None: + """An explicit `location` is used verbatim; trailing slash is stripped.""" + _, schema = _create_simple_table(catalog, test_table_identifier) + bare = f"file://{tmp_path}/relocated" + arg = bare + "/" if trailing_slash else bare + replaced = catalog.replace_table(test_table_identifier, schema=schema, location=arg) + assert replaced.metadata.location == bare def test_replace_table_merges_properties_with_overrides_and_additions( @@ -657,23 +654,22 @@ def run_failing_replace() -> None: def test_concurrent_replace_table(catalog: Catalog, test_table_identifier: Identifier) -> None: - """Two replace_table calls staged from the same base metadata cannot both commit. - - The first replace updates the catalog pointer to a new metadata location; the second - replace was built against the now-stale base, so its commit must be rejected with - `CommitFailedException`.""" - _, schema = _create_simple_table(catalog, test_table_identifier) + """Two concurrent replace_table calls staged from the same base both adding a column + must fail on the second commit with an `assert-last-assigned-field-id` violation — + proving the `AssertLastAssignedFieldId` requirement actually guards against duplicate + field-id assignment under concurrent writers.""" + _create_simple_table(catalog, test_table_identifier) new_schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=False), NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), ) - # Both transactions are built from the same base metadata. + # Both transactions build from the same base metadata with the same new schema. txn_a = catalog.replace_table_transaction(test_table_identifier, schema=new_schema) - txn_b = catalog.replace_table_transaction(test_table_identifier, schema=schema) + txn_b = catalog.replace_table_transaction(test_table_identifier, schema=new_schema) txn_a.commit_transaction() - with pytest.raises(CommitFailedException): + with pytest.raises(CommitFailedException, match="last assigned field id"): txn_b.commit_transaction() diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 3757f071e2..45a18fd2ec 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -2948,10 +2948,13 @@ def test_replace_table_transaction_wire_payload( catalog.replace_table_transaction(identifier=("fokko", "fokko2"), schema=new_schema).commit_transaction() request = rest_mock.last_request.json() + # Pin the requirement values to the fixture's last-column-id / last-partition-id so the + # assertion fails loudly if the fixture changes underneath us. + fixture_metadata = example_table_metadata_with_snapshot_v1_rest_json["metadata"] assert request["requirements"] == [ {"type": "assert-table-uuid", "uuid": table_uuid}, - {"type": "assert-last-assigned-field-id", "last-assigned-field-id": 2}, - {"type": "assert-last-assigned-partition-id", "last-assigned-partition-id": 999}, + {"type": "assert-last-assigned-field-id", "last-assigned-field-id": fixture_metadata["last-column-id"]}, + {"type": "assert-last-assigned-partition-id", "last-assigned-partition-id": fixture_metadata["last-partition-id"]}, ] # No duplicate actions in the payload (the dict-by-action collapses them — guard against it). @@ -2969,18 +2972,14 @@ def test_replace_table_transaction_wire_payload( # Set* updates are emitted unconditionally even when their resulting id is reused. # schema-id=-1 is the wire sentinel meaning "the schema we just added in this commit". assert updates_by_action["set-current-schema"]["schema-id"] == -1 - assert "set-default-spec" in updates_by_action - assert "set-default-sort-order" in updates_by_action + # Spec/sort-order are reused from the fixture (default ids) since the replace doesn't change them. + assert updates_by_action["set-default-spec"]["spec-id"] == fixture_metadata["default-spec-id"] + assert updates_by_action["set-default-sort-order"]["sort-order-id"] == fixture_metadata["default-sort-order-id"] def test_replace_table_transaction_404_raises( rest_mock: Mocker, ) -> None: - rest_mock.head( - f"{TEST_URI}v1/namespaces/fokko/views/missing", - status_code=404, - request_headers=TEST_HEADERS, - ) rest_mock.get( f"{TEST_URI}v1/namespaces/fokko/tables/missing", json={"error": {"message": "Table not found", "type": "NoSuchTableException", "code": 404}}, diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index 144ed205ed..5fb81dc4d4 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -313,29 +313,50 @@ def test_incompatible_transform_source_type() -> None: @pytest.mark.parametrize( - "existing_specs, last_partition_id, expected_field_id, expected_last_partition_id", + "new_spec, existing_specs, last_partition_id, expected_field_id, expected_last_partition_id", [ - # Reuse: same (source_id, transform) already in an existing spec → reuse its field_id. - ( + # Reuse-by-identity: same (source_id, IdentityTransform) already in an existing spec. + pytest.param( + PartitionSpec(PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id")), [PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), spec_id=0)], 1000, 1000, 1000, + id="reuse-identity", + ), + # Reuse-by-(source,bucket): same source_id + same BucketTransform, even under a renamed field. + pytest.param( + PartitionSpec(PartitionField(source_id=1, field_id=999, transform=BucketTransform(8), name="id_bucket_renamed")), + [ + PartitionSpec( + PartitionField(source_id=1, field_id=1042, transform=BucketTransform(8), name="id_bucket"), spec_id=0 + ) + ], + 1042, + 1042, + 1042, + id="reuse-bucket-under-rename", + ), + # No match: fresh id above last_partition_id. + pytest.param( + PartitionSpec(PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id")), + [PartitionSpec(spec_id=0)], + 999, + 1000, + 1000, + id="new-field-above-last-partition-id", ), - # New: no matching (source_id, transform) → fresh id above last_partition_id. - ([PartitionSpec(spec_id=0)], 999, 1000, 1000), ], - ids=["reuse-from-existing-spec", "new-field-above-last-partition-id"], ) def test_assign_fresh_partition_spec_ids_for_replace_v2( + new_spec: PartitionSpec, existing_specs: list[PartitionSpec], last_partition_id: int, expected_field_id: int, expected_last_partition_id: int, ) -> None: - spec = PartitionSpec(PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id")) fresh_spec, new_last_pid = assign_fresh_partition_spec_ids_for_replace( - spec, _REPLACE_SCHEMA_FOR_PARTITION, _REPLACE_SCHEMA_FOR_PARTITION, existing_specs, last_partition_id + new_spec, _REPLACE_SCHEMA_FOR_PARTITION, _REPLACE_SCHEMA_FOR_PARTITION, existing_specs, last_partition_id ) assert fresh_spec.fields[0].field_id == expected_field_id assert new_last_pid == expected_last_partition_id From 4278b6a8d3e20b723bde3efdb1e8d9071562d6d5 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 21:04:08 +0100 Subject: [PATCH 15/24] Add niche but no-regret tests from audit follow-up - test_replace_table_drops_identifier_field: pairs with the existing preserve test; verifies that a new schema without identifier_field_ids clears the previous set rather than silently carrying it forward. - test_replace_table_v2_does_not_carry_forward_void_field: v2 specs aren't append-only, so a dropped partition field is gone (unlike v1). - test_replace_after_format_version_upgrade: v1 -> v2 via replace, then a second replace on the now-v2 table must not retrigger the upgrade or fail. --- tests/catalog/test_catalog_behaviors.py | 47 +++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index 338674c036..824feb9cf2 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -492,6 +492,23 @@ def test_replace_table_preserves_identifier_field_ids(catalog: Catalog, test_tab assert list(replaced.schema().identifier_field_ids) == [1] +def test_replace_table_drops_identifier_field(catalog: Catalog, test_table_identifier: Identifier) -> None: + """Replacing with a schema that has no `identifier_field_ids` clears them on the table — + the previous identifier set is not silently carried forward.""" + schema_with_id = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + identifier_field_ids=[1], + ) + _create_simple_table(catalog, test_table_identifier, schema=schema_with_id) + schema_without_id = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + replaced = catalog.replace_table(test_table_identifier, schema=schema_without_id) + assert list(replaced.schema().identifier_field_ids) == [] + + def test_replace_table_reuses_partition_spec_id(catalog: Catalog, test_table_identifier: Identifier) -> None: """An identical partition spec reuses its spec_id rather than appending a new one.""" spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) @@ -582,6 +599,36 @@ def test_replace_table_v1_carries_forward_partition_fields_as_void(catalog: Cata assert void_field.name == "id_part" +def test_replace_table_v2_does_not_carry_forward_void_field(catalog: Catalog, test_table_identifier: Identifier) -> None: + """v2 specs are not append-only — a replace that drops a partition field does not + carry it forward (unlike v1). The new default spec contains only the new field(s).""" + spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) + _, schema = _create_simple_table(catalog, test_table_identifier, partition_spec=spec, format_version=2) + + # Replace with default unpartitioned spec — the old field is gone from the default spec. + replaced = catalog.replace_table(test_table_identifier, schema=schema) + new_spec = replaced.spec() + assert new_spec.is_unpartitioned() + assert all(not isinstance(f.transform, VoidTransform) for f in new_spec.fields) + + +def test_replace_after_format_version_upgrade(catalog: Catalog, test_table_identifier: Identifier) -> None: + """A v1 table can be upgraded to v2 via replace and then re-replaced without issue.""" + _, schema = _create_simple_table(catalog, test_table_identifier, format_version=1) + upgraded = catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "2"}) + assert upgraded.format_version == 2 + + # Second replace on the now-v2 table should not re-trigger an upgrade or fail. + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), + ) + replaced = catalog.replace_table(test_table_identifier, schema=new_schema) + assert replaced.format_version == 2 + assert {f.name for f in replaced.schema().fields} == {"id", "data", "extra"} + + def test_replace_table_raises_when_table_does_not_exist(catalog: Catalog, test_table_identifier: Identifier) -> None: schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)) with pytest.raises(NoSuchTableError): From 08c5b2da11eceb26ecc23b15848277590707bff3 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 21:32:37 +0100 Subject: [PATCH 16/24] Strip format-version from persisted properties on replace Match new_table_metadata's behavior of popping format-version before persistence, and validate schema compatibility against the resolved format version at the same site. --- pyiceberg/catalog/__init__.py | 1 + pyiceberg/table/__init__.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 0ad543d34d..17c9ab7c96 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -549,6 +549,7 @@ def _replace_staged_table( int(requested_format_version) if requested_format_version is not None else existing_metadata.format_version ) iceberg_schema = self._convert_schema_if_needed(schema, cast(TableVersion, resolved_format_version)) + iceberg_schema.check_format_version_compatibility(cast(TableVersion, resolved_format_version)) fresh_schema, _ = assign_fresh_schema_ids_for_replace( iceberg_schema, existing_metadata.schema(), existing_metadata.last_column_id diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 77eca2ccb2..06221675c4 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1102,8 +1102,10 @@ def _initial_changes( self._updates += (SetLocationUpdate(location=new_location),) # Merge properties (SetPropertiesUpdate merges onto existing properties). - if new_properties: - self._updates += (SetPropertiesUpdate(updates=new_properties),) + # Strip `format-version` so it does not get persisted as a regular property. + persisted_properties = {k: v for k, v in new_properties.items() if k != TableProperties.FORMAT_VERSION} + if persisted_properties: + self._updates += (SetPropertiesUpdate(updates=persisted_properties),) @staticmethod def _find_matching_schema_id(table_metadata: TableMetadata, schema: Schema) -> int | None: From feff7ddaf4d00f3343fd87cdfea8fabae5d51754 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 21:32:47 +0100 Subject: [PATCH 17/24] Test coverage: persisted props, list/map fresh ids, sort-order reuse - Pin that 'format-version' is not in replaced.metadata.properties. - Add list/map element_id/key_id/value_id reuse coverage for assign_fresh_schema_ids_for_replace. - Extend sort-order reuse test to cover reusing a non-zero sorted order_id from history. - Drop the stale HEAD (view-exists) reference in test_replace_table_issues_commit_post_immediately. --- tests/catalog/test_catalog_behaviors.py | 13 +++++++-- tests/catalog/test_rest.py | 4 +-- tests/test_schema.py | 39 +++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 5 deletions(-) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index 824feb9cf2..8f0ddbbdc0 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -520,21 +520,26 @@ def test_replace_table_reuses_partition_spec_id(catalog: Catalog, test_table_ide def test_replace_table_with_sort_order_changes(catalog: Catalog, test_table_identifier: Identifier) -> None: """Replace can change the sort order. The new sort order is appended to the history and - becomes the default; a follow-up replace back to unsorted reuses the unsorted order_id - rather than appending a duplicate.""" + becomes the default; a follow-up replace back to a sort already in history reuses its + order_id rather than appending a duplicate.""" _, schema = _create_simple_table(catalog, test_table_identifier) sort = SortOrder(SortField(source_id=1, transform=IdentityTransform(), direction=SortDirection.ASC)) # unsorted → sorted: a new order is added and becomes the default. sorted_table = catalog.replace_table(test_table_identifier, schema=schema, sort_order=sort) assert sorted_table.sort_order().fields == sort.fields - assert sorted_table.metadata.default_sort_order_id != 0 + sorted_order_id = sorted_table.metadata.default_sort_order_id + assert sorted_order_id != 0 # sorted → unsorted: reuses the unsorted order_id 0 from history. unsorted_table = catalog.replace_table(test_table_identifier, schema=schema) assert unsorted_table.sort_order().is_unsorted assert unsorted_table.metadata.default_sort_order_id == 0 + # unsorted → original sorted form: reuses the existing sorted order_id from history. + replayed = catalog.replace_table(test_table_identifier, schema=schema, sort_order=sort) + assert replayed.metadata.default_sort_order_id == sorted_order_id + def test_replace_table_inherits_existing_location(catalog: Catalog, test_table_identifier: Identifier) -> None: """`location=None` keeps the existing table's location.""" @@ -574,6 +579,8 @@ def test_replace_table_upgrades_format_version(catalog: Catalog, test_table_iden assert catalog.load_table(test_table_identifier).format_version == 1 replaced = catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "2"}) assert replaced.format_version == 2 + # `format-version` is a control input, not a regular property — must not leak into persisted properties. + assert "format-version" not in replaced.properties def test_replace_table_rejects_format_version_downgrade(catalog: Catalog, test_table_identifier: Identifier) -> None: diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 45a18fd2ec..23a2adb5ae 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -3017,8 +3017,8 @@ def test_replace_table_issues_commit_post_immediately( NestedField(field_id=2, name="data", field_type=StringType(), required=False), ) - # Baseline: opening the transaction alone makes the HEAD (view-exists) + GET (load) calls - # but no POST until the caller commits. + # Baseline: opening the transaction alone makes only the load GET call; no POST is + # issued until the caller commits. catalog.replace_table_transaction(identifier=("fokko", "fokko2"), schema=new_schema) methods_after_open = [r.method for r in rest_mock.request_history] assert "POST" not in methods_after_open diff --git a/tests/test_schema.py b/tests/test_schema.py index 4d5359cfee..5f5368fda5 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -1888,3 +1888,42 @@ def test_assign_fresh_schema_ids_for_replace_with_nested_struct() -> None: assert loc_fields[1].field_id == 4 # location.lon reused assert loc_fields[2].field_id == 5 # location.alt is new assert last_col_id == 5 + + +def test_assign_fresh_schema_ids_for_replace_with_list_and_map() -> None: + """`element_id`, `key_id`, and `value_id` are reused by name path (e.g. `tags.element`, `m.key`, `m.value`).""" + base_schema = Schema( + NestedField( + field_id=1, + name="tags", + field_type=ListType(element_id=2, element_type=StringType(), element_required=False), + required=False, + ), + NestedField( + field_id=3, + name="m", + field_type=MapType(key_id=4, key_type=StringType(), value_id=5, value_type=IntegerType(), value_required=False), + required=False, + ), + ) + new_schema = Schema( + NestedField( + field_id=10, + name="tags", + field_type=ListType(element_id=20, element_type=StringType(), element_required=False), + required=False, + ), + NestedField( + field_id=30, + name="m", + field_type=MapType(key_id=40, key_type=StringType(), value_id=50, value_type=IntegerType(), value_required=False), + required=False, + ), + ) + fresh, last_col_id = assign_fresh_schema_ids_for_replace(new_schema, base_schema, 5) + assert fresh.fields[0].field_id == 1 + assert fresh.fields[0].field_type.element_id == 2 + assert fresh.fields[1].field_id == 3 + assert fresh.fields[1].field_type.key_id == 4 + assert fresh.fields[1].field_type.value_id == 5 + assert last_col_id == 5 From a88b976fcf6c33ace658da67c6463d7390013893 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 22:42:21 +0100 Subject: [PATCH 18/24] Readability pass on replace tests Drop docstrings that restated the function name or assertions, drop narrative inline comments where variable names already carry the meaning, and tighten multi-paragraph docstrings to one line. Also shorten the integration test function names to match how the rest of the file is structured. --- tests/catalog/test_catalog_behaviors.py | 48 ++----------------------- tests/catalog/test_rest.py | 17 +-------- tests/integration/test_rest_catalog.py | 13 +++---- 3 files changed, 8 insertions(+), 70 deletions(-) diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py index 8f0ddbbdc0..9157049a77 100644 --- a/tests/catalog/test_catalog_behaviors.py +++ b/tests/catalog/test_catalog_behaviors.py @@ -417,7 +417,6 @@ def _simple_data(num_rows: int = 2) -> pa.Table: def test_replace_table_preserves_uuid_and_clears_current_snapshot(catalog: Catalog, test_table_identifier: Identifier) -> None: - """Replace preserves table_uuid and snapshot history, but clears the main ref.""" _create_simple_table(catalog, test_table_identifier) original = catalog.load_table(test_table_identifier) original.append(_simple_data()) @@ -441,9 +440,7 @@ def test_replace_table_preserves_uuid_and_clears_current_snapshot(catalog: Catal @pytest.mark.parametrize( "extra_fields, expected_schema_ids, expected_current, expected_fields, expected_last_col_id", [ - # Identical to the existing schema → reuse the same schema_id, no new schema appended. ([], [0], 0, {"id": 1, "data": 2}, 2), - # Extra column → append a new schema; existing IDs preserved by name, new one above last_column_id. ( [NestedField(field_id=99, name="extra", field_type=BooleanType(), required=False)], [0, 1], @@ -463,7 +460,6 @@ def test_replace_table_schema_id_reuse( expected_fields: dict[str, int], expected_last_col_id: int, ) -> None: - """A structurally identical schema reuses its schema_id; an extended schema adds a new one.""" _, base_schema = _create_simple_table(catalog, test_table_identifier) new_schema = Schema(*base_schema.fields, *extra_fields) replaced = catalog.replace_table(test_table_identifier, schema=new_schema) @@ -475,7 +471,6 @@ def test_replace_table_schema_id_reuse( def test_replace_table_preserves_identifier_field_ids(catalog: Catalog, test_table_identifier: Identifier) -> None: - """Identifier-field IDs on the new schema are honored after reuse-by-name.""" schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=False), @@ -493,8 +488,6 @@ def test_replace_table_preserves_identifier_field_ids(catalog: Catalog, test_tab def test_replace_table_drops_identifier_field(catalog: Catalog, test_table_identifier: Identifier) -> None: - """Replacing with a schema that has no `identifier_field_ids` clears them on the table — - the previous identifier set is not silently carried forward.""" schema_with_id = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=False), @@ -510,7 +503,6 @@ def test_replace_table_drops_identifier_field(catalog: Catalog, test_table_ident def test_replace_table_reuses_partition_spec_id(catalog: Catalog, test_table_identifier: Identifier) -> None: - """An identical partition spec reuses its spec_id rather than appending a new one.""" spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) _, schema = _create_simple_table(catalog, test_table_identifier, partition_spec=spec) replaced = catalog.replace_table(test_table_identifier, schema=schema, partition_spec=spec) @@ -519,30 +511,23 @@ def test_replace_table_reuses_partition_spec_id(catalog: Catalog, test_table_ide def test_replace_table_with_sort_order_changes(catalog: Catalog, test_table_identifier: Identifier) -> None: - """Replace can change the sort order. The new sort order is appended to the history and - becomes the default; a follow-up replace back to a sort already in history reuses its - order_id rather than appending a duplicate.""" _, schema = _create_simple_table(catalog, test_table_identifier) sort = SortOrder(SortField(source_id=1, transform=IdentityTransform(), direction=SortDirection.ASC)) - # unsorted → sorted: a new order is added and becomes the default. sorted_table = catalog.replace_table(test_table_identifier, schema=schema, sort_order=sort) assert sorted_table.sort_order().fields == sort.fields sorted_order_id = sorted_table.metadata.default_sort_order_id assert sorted_order_id != 0 - # sorted → unsorted: reuses the unsorted order_id 0 from history. unsorted_table = catalog.replace_table(test_table_identifier, schema=schema) assert unsorted_table.sort_order().is_unsorted assert unsorted_table.metadata.default_sort_order_id == 0 - # unsorted → original sorted form: reuses the existing sorted order_id from history. replayed = catalog.replace_table(test_table_identifier, schema=schema, sort_order=sort) assert replayed.metadata.default_sort_order_id == sorted_order_id def test_replace_table_inherits_existing_location(catalog: Catalog, test_table_identifier: Identifier) -> None: - """`location=None` keeps the existing table's location.""" _, schema = _create_simple_table(catalog, test_table_identifier) existing = catalog.load_table(test_table_identifier).metadata.location replaced = catalog.replace_table(test_table_identifier, schema=schema) @@ -553,7 +538,6 @@ def test_replace_table_inherits_existing_location(catalog: Catalog, test_table_i def test_replace_table_uses_explicit_location( catalog: Catalog, test_table_identifier: Identifier, tmp_path: Path, trailing_slash: bool ) -> None: - """An explicit `location` is used verbatim; trailing slash is stripped.""" _, schema = _create_simple_table(catalog, test_table_identifier) bare = f"file://{tmp_path}/relocated" arg = bare + "/" if trailing_slash else bare @@ -564,7 +548,6 @@ def test_replace_table_uses_explicit_location( def test_replace_table_merges_properties_with_overrides_and_additions( catalog: Catalog, test_table_identifier: Identifier ) -> None: - """Properties are merged onto existing: new values override, untouched keys are preserved.""" schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=False)) _create_simple_table(catalog, test_table_identifier, schema=schema, properties={"keep": "yes", "override": "old"}) replaced = catalog.replace_table(test_table_identifier, schema=schema, properties={"override": "new", "new_key": "v"}) @@ -574,7 +557,6 @@ def test_replace_table_merges_properties_with_overrides_and_additions( def test_replace_table_upgrades_format_version(catalog: Catalog, test_table_identifier: Identifier) -> None: - """`properties={'format-version': '2'}` on a v1 table emits an UpgradeFormatVersion update.""" _, schema = _create_simple_table(catalog, test_table_identifier, format_version=1) assert catalog.load_table(test_table_identifier).format_version == 1 replaced = catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "2"}) @@ -584,35 +566,29 @@ def test_replace_table_upgrades_format_version(catalog: Catalog, test_table_iden def test_replace_table_rejects_format_version_downgrade(catalog: Catalog, test_table_identifier: Identifier) -> None: - """A `format-version` lower than the existing one must be rejected to avoid silently - running the schema-conversion path with the wrong semantics.""" _, schema = _create_simple_table(catalog, test_table_identifier, format_version=2) with pytest.raises(ValueError, match="Cannot downgrade format-version"): catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "1"}) def test_replace_table_v1_carries_forward_partition_fields_as_void(catalog: Catalog, test_table_identifier: Identifier) -> None: - """v1 specs are append-only; dropped partition fields must be carried forward as VoidTransform.""" + """v1 specs are append-only; dropped partition fields are carried forward as VoidTransform.""" spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) _, schema = _create_simple_table(catalog, test_table_identifier, partition_spec=spec, format_version=1) - # Replace with default unpartitioned spec — the old field must be preserved as void. replaced = catalog.replace_table(test_table_identifier, schema=schema) new_spec = replaced.spec() void_field = next(f for f in new_spec.fields if f.field_id == 1000) assert isinstance(void_field.transform, VoidTransform) assert void_field.source_id == 1 - # Name is preserved when there's no collision in the new spec (the new spec is empty here). assert void_field.name == "id_part" def test_replace_table_v2_does_not_carry_forward_void_field(catalog: Catalog, test_table_identifier: Identifier) -> None: - """v2 specs are not append-only — a replace that drops a partition field does not - carry it forward (unlike v1). The new default spec contains only the new field(s).""" + """v2 specs are not append-only — a dropped partition field is not carried forward (unlike v1).""" spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, name="id_part", transform=IdentityTransform())) _, schema = _create_simple_table(catalog, test_table_identifier, partition_spec=spec, format_version=2) - # Replace with default unpartitioned spec — the old field is gone from the default spec. replaced = catalog.replace_table(test_table_identifier, schema=schema) new_spec = replaced.spec() assert new_spec.is_unpartitioned() @@ -620,12 +596,10 @@ def test_replace_table_v2_does_not_carry_forward_void_field(catalog: Catalog, te def test_replace_after_format_version_upgrade(catalog: Catalog, test_table_identifier: Identifier) -> None: - """A v1 table can be upgraded to v2 via replace and then re-replaced without issue.""" _, schema = _create_simple_table(catalog, test_table_identifier, format_version=1) upgraded = catalog.replace_table(test_table_identifier, schema=schema, properties={"format-version": "2"}) assert upgraded.format_version == 2 - # Second replace on the now-v2 table should not re-trigger an upgrade or fail. new_schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=False), @@ -643,7 +617,6 @@ def test_replace_table_raises_when_table_does_not_exist(catalog: Catalog, test_t def test_replace_table_transaction_can_stage_additional_changes(catalog: Catalog, test_table_identifier: Identifier) -> None: - """The transaction context lets callers stage extra updates (e.g. properties) before commit.""" _, schema = _create_simple_table(catalog, test_table_identifier) with catalog.replace_table_transaction(test_table_identifier, schema=schema) as txn: txn.set_properties({"staged": "yes"}) @@ -652,12 +625,6 @@ def test_replace_table_transaction_can_stage_additional_changes(catalog: Catalog def test_replace_table_transaction_with_write_atomic_rtas(catalog: Catalog, test_table_identifier: Identifier) -> None: - """RTAS (Replace Table As Select): replace + write new data in one transaction. - - The new schema and new data must land atomically. After commit: - - the new snapshot is current (main ref restored by the fast_append), - - the new snapshot's parent is None (no lineage to the pre-replace data), and - - the pre-replace snapshot is preserved in history for time-travel.""" _create_simple_table(catalog, test_table_identifier) catalog.load_table(test_table_identifier).append(_simple_data(num_rows=1)) old_snapshot_id = catalog.load_table(test_table_identifier).current_snapshot().snapshot_id # type: ignore[union-attr] @@ -680,9 +647,6 @@ def test_replace_table_transaction_with_write_atomic_rtas(catalog: Catalog, test def test_replace_table_transaction_rolls_back_on_failure(catalog: Catalog, test_table_identifier: Identifier) -> None: - """If the body of the transaction raises, no metadata change is committed. - - The table's UUID, schemas, current snapshot, and current schema id must all be unchanged.""" _create_simple_table(catalog, test_table_identifier) catalog.load_table(test_table_identifier).append(_simple_data()) before = catalog.load_table(test_table_identifier).metadata @@ -708,17 +672,13 @@ def run_failing_replace() -> None: def test_concurrent_replace_table(catalog: Catalog, test_table_identifier: Identifier) -> None: - """Two concurrent replace_table calls staged from the same base both adding a column - must fail on the second commit with an `assert-last-assigned-field-id` violation — - proving the `AssertLastAssignedFieldId` requirement actually guards against duplicate - field-id assignment under concurrent writers.""" + """`AssertLastAssignedFieldId` rejects the second of two replaces staged from the same base.""" _create_simple_table(catalog, test_table_identifier) new_schema = Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=False), NestedField(field_id=3, name="extra", field_type=BooleanType(), required=False), ) - # Both transactions build from the same base metadata with the same new schema. txn_a = catalog.replace_table_transaction(test_table_identifier, schema=new_schema) txn_b = catalog.replace_table_transaction(test_table_identifier, schema=new_schema) @@ -728,8 +688,6 @@ def test_concurrent_replace_table(catalog: Catalog, test_table_identifier: Ident def test_replace_table_allows_subsequent_append(catalog: Catalog, test_table_identifier: Identifier) -> None: - """After `replace_table` clears the current snapshot, a separate `append` produces a new - snapshot containing only the post-replace data — the pre-replace rows are not visible.""" _, schema = _create_simple_table(catalog, test_table_identifier) catalog.load_table(test_table_identifier).append(_simple_data(num_rows=3)) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 23a2adb5ae..309e288bc5 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -2927,8 +2927,6 @@ def test_replace_table_transaction_wire_payload( example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: - """The replace request must carry exactly one `assert-table-uuid` requirement and the - Set* + Add* updates needed to transform the existing table — order is not asserted.""" table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid _mock_replace_endpoints( @@ -2948,8 +2946,6 @@ def test_replace_table_transaction_wire_payload( catalog.replace_table_transaction(identifier=("fokko", "fokko2"), schema=new_schema).commit_transaction() request = rest_mock.last_request.json() - # Pin the requirement values to the fixture's last-column-id / last-partition-id so the - # assertion fails loudly if the fixture changes underneath us. fixture_metadata = example_table_metadata_with_snapshot_v1_rest_json["metadata"] assert request["requirements"] == [ {"type": "assert-table-uuid", "uuid": table_uuid}, @@ -2957,22 +2953,15 @@ def test_replace_table_transaction_wire_payload( {"type": "assert-last-assigned-partition-id", "last-assigned-partition-id": fixture_metadata["last-partition-id"]}, ] - # No duplicate actions in the payload (the dict-by-action collapses them — guard against it). actions = [u["action"] for u in request["updates"]] assert len(actions) == len(set(actions)), f"duplicate actions in request: {actions}" updates_by_action = {u["action"]: u for u in request["updates"]} - # main snapshot must be cleared on replace. assert updates_by_action["remove-snapshot-ref"] == {"action": "remove-snapshot-ref", "ref-name": "main"} - # A new schema is added because the column set differs; field IDs for existing columns - # are reused by name. The highest id (3) flows from the schema's fields, which the - # server uses to bump the table's last-column-id on the server side. added_schema = updates_by_action["add-schema"]["schema"] assert {f["name"]: f["id"] for f in added_schema["fields"]} == {"id": 1, "data": 2, "new_col": 3} - # Set* updates are emitted unconditionally even when their resulting id is reused. # schema-id=-1 is the wire sentinel meaning "the schema we just added in this commit". assert updates_by_action["set-current-schema"]["schema-id"] == -1 - # Spec/sort-order are reused from the fixture (default ids) since the replace doesn't change them. assert updates_by_action["set-default-spec"]["spec-id"] == fixture_metadata["default-spec-id"] assert updates_by_action["set-default-sort-order"]["sort-order-id"] == fixture_metadata["default-sort-order-id"] @@ -2999,9 +2988,7 @@ def test_replace_table_issues_commit_post_immediately( example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any], example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: - """`replace_table` issues a commit POST during the call — distinguishing it from - `replace_table_transaction(...)` alone, which only issues the load GET (no POST until - the caller commits the transaction).""" + """`replace_table` commits during the call; `replace_table_transaction` defers the POST until the caller commits.""" table_uuid = example_table_metadata_with_snapshot_v1_rest_json["metadata"]["table-uuid"] example_table_metadata_no_snapshot_v1_rest_json["metadata"]["table-uuid"] = table_uuid _mock_replace_endpoints( @@ -3017,8 +3004,6 @@ def test_replace_table_issues_commit_post_immediately( NestedField(field_id=2, name="data", field_type=StringType(), required=False), ) - # Baseline: opening the transaction alone makes only the load GET call; no POST is - # issued until the caller commits. catalog.replace_table_transaction(identifier=("fokko", "fokko2"), schema=new_schema) methods_after_open = [r.method for r in rest_mock.request_history] assert "POST" not in methods_after_open diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py index 14ecffd922..8a547a9d56 100644 --- a/tests/integration/test_rest_catalog.py +++ b/tests/integration/test_rest_catalog.py @@ -74,12 +74,8 @@ def test_create_namespace_if_already_existing(catalog: RestCatalog) -> None: @pytest.mark.integration @pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_end_to_end_against_rest_server(catalog: Catalog) -> None: - """End-to-end smoke test: replace_table against a real REST server. - - Detailed replace_table semantics are covered against InMemoryCatalog and SqlCatalog in - `tests/catalog/test_catalog_behaviors.py`. This test verifies the REST wire path: server - accepts the commit, preserves the UUID, and clears the current snapshot.""" +def test_replace_table(catalog: Catalog) -> None: + """End-to-end smoke test: replace_table against a real REST server.""" identifier = f"default.test_replace_table_e2e_{catalog.name}" if not catalog.namespace_exists("default"): catalog.create_namespace("default") @@ -115,9 +111,8 @@ def test_replace_table_end_to_end_against_rest_server(catalog: Catalog) -> None: @pytest.mark.integration @pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_transaction_rtas_against_rest_server(catalog: Catalog) -> None: - """RTAS (Replace Table As Select) against a real REST server: the schema swap and the - new-data write must land atomically — the new snapshot is current on commit.""" +def test_replace_table_transaction(catalog: Catalog) -> None: + """RTAS against a real REST server: the schema swap and the new-data write land atomically.""" identifier = f"default.test_replace_rtas_{catalog.name}" if not catalog.namespace_exists("default"): catalog.create_namespace("default") From c46eca0838709e36d0c713edbe9a53cbfafc6f37 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 23:12:13 +0100 Subject: [PATCH 19/24] Drop docstrings from integration replace tests --- tests/integration/test_rest_catalog.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py index 8a547a9d56..52a1ace944 100644 --- a/tests/integration/test_rest_catalog.py +++ b/tests/integration/test_rest_catalog.py @@ -75,7 +75,6 @@ def test_create_namespace_if_already_existing(catalog: RestCatalog) -> None: @pytest.mark.integration @pytest.mark.parametrize("catalog", [lf("session_catalog")]) def test_replace_table(catalog: Catalog) -> None: - """End-to-end smoke test: replace_table against a real REST server.""" identifier = f"default.test_replace_table_e2e_{catalog.name}" if not catalog.namespace_exists("default"): catalog.create_namespace("default") @@ -112,7 +111,6 @@ def test_replace_table(catalog: Catalog) -> None: @pytest.mark.integration @pytest.mark.parametrize("catalog", [lf("session_catalog")]) def test_replace_table_transaction(catalog: Catalog) -> None: - """RTAS against a real REST server: the schema swap and the new-data write land atomically.""" identifier = f"default.test_replace_rtas_{catalog.name}" if not catalog.namespace_exists("default"): catalog.create_namespace("default") From 4bfd97a6793f38d86b5566ff49972fca0e76079c Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 23:16:51 +0100 Subject: [PATCH 20/24] Drop misleading field-ID rationale from docs --- mkdocs/docs/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 2a3a4d6910..f15255adf8 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -201,7 +201,7 @@ new_schema = Schema( catalog.replace_table(identifier="docs_example.bids", schema=new_schema) ``` -Field IDs from columns whose names appear in the previous schema are reused, so existing data files remain readable when the new schema is a compatible superset. New columns get fresh IDs above `last-column-id`. +Field IDs are reused by name from the previous schema; new columns get fresh IDs above `last-column-id`. Properties passed to `replace_table` are **merged** with the existing table properties (your values override; existing keys you don't pass are preserved). To remove a property as part of the replace, use `replace_table_transaction` and remove it explicitly within the transaction. From 8d6bc1cd1b9a0929aa02b8309379ef14ad5d0018 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 18 May 2026 23:27:18 +0100 Subject: [PATCH 21/24] Sharpen properties-merge note in replace docs --- mkdocs/docs/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index f15255adf8..fa331772f8 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -203,7 +203,7 @@ catalog.replace_table(identifier="docs_example.bids", schema=new_schema) Field IDs are reused by name from the previous schema; new columns get fresh IDs above `last-column-id`. -Properties passed to `replace_table` are **merged** with the existing table properties (your values override; existing keys you don't pass are preserved). To remove a property as part of the replace, use `replace_table_transaction` and remove it explicitly within the transaction. +Unlike the other fields, table properties are *merged* on replace: properties you don't pass are preserved on the table. To remove a property as part of the replace, use `replace_table_transaction` and drop it explicitly within the transaction. Use `replace_table_transaction` to stage additional changes (writes, property updates, schema evolution) before committing — for example, swap the schema and write new data atomically: From 24ffc242620bf3b1bce6a3f3ef17ad099f5cc733 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Tue, 19 May 2026 00:10:41 +0100 Subject: [PATCH 22/24] Move replace integration tests to tests/integration/test_catalog.py That file is the cross-catalog integration analog to Java's CatalogTests.java; every other mutation (create/rename/drop/etc.) already has a slot there parametrized over six catalog fixtures. The two replace tests have no REST-specific code and don't belong in test_rest_catalog.py. Adopts the file's existing conventions: test_catalog parameter name, database_name + table_name fixtures, no manual cleanup guards (clean_up runs between tests). --- tests/integration/test_catalog.py | 63 ++++++++++++++++++++++- tests/integration/test_rest_catalog.py | 71 -------------------------- 2 files changed, 62 insertions(+), 72 deletions(-) diff --git a/tests/integration/test_catalog.py b/tests/integration/test_catalog.py index 751dbe0479..3d723caf77 100644 --- a/tests/integration/test_catalog.py +++ b/tests/integration/test_catalog.py @@ -20,6 +20,7 @@ from collections.abc import Generator from pathlib import Path, PosixPath +import pyarrow as pa import pytest from pytest_lazy_fixtures import lf @@ -43,7 +44,7 @@ from pyiceberg.table.metadata import INITIAL_SPEC_ID from pyiceberg.table.sorting import INITIAL_SORT_ORDER_ID, SortField, SortOrder from pyiceberg.transforms import BucketTransform, DayTransform, IdentityTransform -from pyiceberg.types import IntegerType, LongType, NestedField, TimestampType, UUIDType +from pyiceberg.types import BooleanType, IntegerType, LongType, NestedField, StringType, TimestampType, UUIDType from tests.conftest import ( clean_up, does_support_atomic_concurrent_updates, @@ -866,3 +867,63 @@ def test_load_missing_table(test_catalog: Catalog, database_name: str, table_nam with pytest.raises(NoSuchTableError): test_catalog.load_table(identifier) + + +@pytest.mark.integration +@pytest.mark.parametrize("test_catalog", CATALOGS) +def test_replace_table(test_catalog: Catalog, database_name: str, table_name: str) -> None: + test_catalog.create_namespace(database_name) + identifier = (database_name, table_name) + + original_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + ) + original = test_catalog.create_table(identifier, schema=original_schema) + original.append( + pa.Table.from_pydict( + {"id": [1, 2, 3], "data": ["a", "b", "c"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + ) + original.refresh() + original_snapshot_id = original.current_snapshot().snapshot_id # type: ignore[union-attr] + + new_schema = Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="name", field_type=StringType(), required=False), + NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), + ) + replaced = test_catalog.replace_table(identifier, schema=new_schema) + + assert replaced.metadata.table_uuid == original.metadata.table_uuid + assert replaced.current_snapshot() is None + assert any(s.snapshot_id == original_snapshot_id for s in replaced.metadata.snapshots) + + +@pytest.mark.integration +@pytest.mark.parametrize("test_catalog", CATALOGS) +def test_replace_table_transaction(test_catalog: Catalog, database_name: str, table_name: str) -> None: + test_catalog.create_namespace(database_name) + identifier = (database_name, table_name) + + old_data = pa.Table.from_pydict( + {"id": [1], "data": ["old"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), + ) + original = test_catalog.create_table(identifier, schema=old_data.schema) + original.append(old_data) + old_snapshot_id = test_catalog.load_table(identifier).current_snapshot().snapshot_id # type: ignore[union-attr] + + new_data = pa.Table.from_pydict( + {"id": [10, 20], "name": ["alice", "bob"]}, + schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string())]), + ) + with test_catalog.replace_table_transaction(identifier, schema=new_data.schema) as txn: + txn.append(new_data) + + replaced = test_catalog.load_table(identifier) + assert replaced.current_snapshot() is not None + assert replaced.current_snapshot().snapshot_id != old_snapshot_id # type: ignore[union-attr] + assert any(s.snapshot_id == old_snapshot_id for s in replaced.metadata.snapshots) + assert replaced.scan().to_arrow().num_rows == 2 diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py index 52a1ace944..05039a982e 100644 --- a/tests/integration/test_rest_catalog.py +++ b/tests/integration/test_rest_catalog.py @@ -18,15 +18,12 @@ import time -import pyarrow as pa import pytest from pytest_lazy_fixtures import lf -from pyiceberg.catalog import Catalog from pyiceberg.catalog.rest import RestCatalog from pyiceberg.exceptions import NoSuchViewError from pyiceberg.schema import Schema -from pyiceberg.types import BooleanType, LongType, NestedField, StringType from pyiceberg.view.metadata import SQLViewRepresentation, ViewVersion TEST_NAMESPACE_IDENTIFIER = "TEST NS" @@ -72,74 +69,6 @@ def test_create_namespace_if_already_existing(catalog: RestCatalog) -> None: assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER) -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table(catalog: Catalog) -> None: - identifier = f"default.test_replace_table_e2e_{catalog.name}" - if not catalog.namespace_exists("default"): - catalog.create_namespace("default") - if catalog.table_exists(identifier): - catalog.drop_table(identifier) - - original_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="data", field_type=StringType(), required=False), - ) - original = catalog.create_table(identifier, schema=original_schema) - original.append( - pa.Table.from_pydict( - {"id": [1, 2, 3], "data": ["a", "b", "c"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), - ) - ) - original.refresh() - original_snapshot_id = original.current_snapshot().snapshot_id # type: ignore[union-attr] - - new_schema = Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=False), - NestedField(field_id=2, name="name", field_type=StringType(), required=False), - NestedField(field_id=3, name="active", field_type=BooleanType(), required=False), - ) - replaced = catalog.replace_table(identifier, schema=new_schema) - - assert replaced.metadata.table_uuid == original.metadata.table_uuid - assert replaced.current_snapshot() is None - assert any(s.snapshot_id == original_snapshot_id for s in replaced.metadata.snapshots) - catalog.drop_table(identifier) - - -@pytest.mark.integration -@pytest.mark.parametrize("catalog", [lf("session_catalog")]) -def test_replace_table_transaction(catalog: Catalog) -> None: - identifier = f"default.test_replace_rtas_{catalog.name}" - if not catalog.namespace_exists("default"): - catalog.create_namespace("default") - if catalog.table_exists(identifier): - catalog.drop_table(identifier) - - old_data = pa.Table.from_pydict( - {"id": [1], "data": ["old"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]), - ) - original = catalog.create_table(identifier, schema=old_data.schema) - original.append(old_data) - old_snapshot_id = catalog.load_table(identifier).current_snapshot().snapshot_id # type: ignore[union-attr] - - new_data = pa.Table.from_pydict( - {"id": [10, 20], "name": ["alice", "bob"]}, - schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string())]), - ) - with catalog.replace_table_transaction(identifier, schema=new_data.schema) as txn: - txn.append(new_data) - - replaced = catalog.load_table(identifier) - assert replaced.current_snapshot() is not None - assert replaced.current_snapshot().snapshot_id != old_snapshot_id # type: ignore[union-attr] - assert any(s.snapshot_id == old_snapshot_id for s in replaced.metadata.snapshots) - assert replaced.scan().to_arrow().num_rows == 2 - catalog.drop_table(identifier) - - @pytest.mark.integration @pytest.mark.parametrize("catalog", [lf("session_catalog")]) def test_load_view(catalog: RestCatalog, table_schema_nested: Schema, database_name: str, view_name: str) -> None: From 2f5436ab6da62189a249e7a4973e7cdd1e50e336 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Tue, 19 May 2026 00:50:00 +0100 Subject: [PATCH 23/24] Add S3 creds to rest_catalog fixture in tests/integration/test_catalog.py The fixture had no S3 credentials, which was fine when every test in the file was metadata-only. The new replace tests do .append() and fail with ACCESS_DENIED against minio. Mirrors the credentials the hive_catalog fixture in the same file already uses. --- tests/integration/test_catalog.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_catalog.py b/tests/integration/test_catalog.py index 3d723caf77..401f3418f9 100644 --- a/tests/integration/test_catalog.py +++ b/tests/integration/test_catalog.py @@ -86,7 +86,15 @@ def sqlite_catalog_file(warehouse: Path) -> Generator[Catalog, None, None]: @pytest.fixture(scope="function") def rest_catalog() -> Generator[Catalog, None, None]: - test_catalog = RestCatalog("rest", uri="http://localhost:8181") + test_catalog = RestCatalog( + "rest", + **{ + "uri": "http://localhost:8181", + "s3.endpoint": "http://localhost:9000", + "s3.access-key-id": "admin", + "s3.secret-access-key": "password", + }, + ) yield test_catalog From 80c3c65b5c3dbcb0cebb6ce678d9275b1191a677 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Tue, 19 May 2026 01:33:57 +0100 Subject: [PATCH 24/24] Retrigger CI