Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ set(ICEBERG_SOURCES
transform.cc
transform_function.cc
type.cc
update/delete_files.cc
update/expire_snapshots.cc
update/fast_append.cc
update/merging_snapshot_update.cc
Expand Down
1 change: 1 addition & 0 deletions src/iceberg/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ iceberg_sources = files(
'transform.cc',
'transform_function.cc',
'type.cc',
'update/delete_files.cc',
'update/expire_snapshots.cc',
'update/fast_append.cc',
'update/merging_snapshot_update.cc',
Expand Down
7 changes: 7 additions & 0 deletions src/iceberg/table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "iceberg/table_properties.h"
#include "iceberg/table_scan.h"
#include "iceberg/transaction.h"
#include "iceberg/update/delete_files.h"
#include "iceberg/update/expire_snapshots.h"
#include "iceberg/update/fast_append.h"
#include "iceberg/update/set_snapshot.h"
Expand Down Expand Up @@ -217,6 +218,12 @@ Result<std::shared_ptr<FastAppend>> Table::NewFastAppend() {
return FastAppend::Make(name().name, std::move(ctx));
}

/// \brief Start a DeleteFiles update bound to this table.
///
/// A TransactionContext of kind TransactionKind::kUpdate is created for this
/// table and handed, together with the table name, to DeleteFiles::Make.
/// A failure while creating the context is returned to the caller instead of
/// an update object.
Result<std::shared_ptr<DeleteFiles>> Table::NewDeleteFiles() {
  ICEBERG_ASSIGN_OR_RAISE(
      auto update_ctx,
      TransactionContext::Make(shared_from_this(), TransactionKind::kUpdate));
  return DeleteFiles::Make(name().name, std::move(update_ctx));
}

Result<std::shared_ptr<UpdateStatistics>> Table::NewUpdateStatistics() {
ICEBERG_ASSIGN_OR_RAISE(
auto ctx, TransactionContext::Make(shared_from_this(), TransactionKind::kUpdate));
Expand Down
3 changes: 3 additions & 0 deletions src/iceberg/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ class ICEBERG_EXPORT Table : public std::enable_shared_from_this<Table> {
/// \brief Create a new FastAppend to append data files and commit the changes.
virtual Result<std::shared_ptr<FastAppend>> NewFastAppend();

/// \brief Create a new DeleteFiles to delete data files and commit the changes.
///
/// \return The new DeleteFiles update on success, or an error if the update
/// could not be created.
virtual Result<std::shared_ptr<DeleteFiles>> NewDeleteFiles();

/// \brief Create a new SnapshotManager to manage snapshots and snapshot references.
virtual Result<std::shared_ptr<SnapshotManager>> NewSnapshotManager();

Expand Down
1 change: 1 addition & 0 deletions src/iceberg/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ if(ICEBERG_BUILD_BUNDLE)
add_iceberg_test(table_update_test
USE_BUNDLE
SOURCES
delete_files_test.cc
expire_snapshots_test.cc
fast_append_test.cc
manifest_filter_manager_test.cc
Expand Down
203 changes: 203 additions & 0 deletions src/iceberg/test/delete_files_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/update/delete_files.h"

#include <memory>
#include <string>
#include <vector>

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "iceberg/avro/avro_register.h"
#include "iceberg/expression/expressions.h"
#include "iceberg/expression/literal.h"
#include "iceberg/manifest/manifest_entry.h"
#include "iceberg/partition_spec.h"
#include "iceberg/row/partition_values.h"
#include "iceberg/schema.h"
#include "iceberg/snapshot.h"
#include "iceberg/table.h"
#include "iceberg/test/matchers.h"
#include "iceberg/test/update_test_base.h"
#include "iceberg/update/fast_append.h"

namespace iceberg {

class DeleteFilesTest : public MinimalUpdateTestBase {
protected:
static void SetUpTestSuite() { avro::RegisterAll(); }

void SetUp() override {
MinimalUpdateTestBase::SetUp();

ICEBERG_UNWRAP_OR_FAIL(spec_, table_->spec());
ICEBERG_UNWRAP_OR_FAIL(schema_, table_->schema());
file_a_ = MakeDataFile("/data/file_a.parquet", /*partition_x=*/1L);
file_b_ = MakeDataFile("/data/file_b.parquet", /*partition_x=*/2L);
}

std::shared_ptr<DataFile> MakeDataFile(const std::string& path, int64_t partition_x) {
auto file = std::make_shared<DataFile>();
file->content = DataFile::Content::kData;
file->file_path = table_location_ + path;
file->file_format = FileFormatType::kParquet;
file->partition = PartitionValues(std::vector<Literal>{Literal::Long(partition_x)});
file->file_size_in_bytes = 1024;
file->record_count = 100;
file->partition_spec_id = spec_->spec_id();
return file;
}

void SetLongBounds(const std::shared_ptr<DataFile>& file, int32_t field_id,
int64_t lower, int64_t upper) {
ASSERT_NE(file, nullptr);
ICEBERG_UNWRAP_OR_FAIL(auto lower_bound, Literal::Long(lower).Serialize());
ICEBERG_UNWRAP_OR_FAIL(auto upper_bound, Literal::Long(upper).Serialize());
file->value_counts[field_id] = file->record_count;
file->null_value_counts[field_id] = 0;
file->lower_bounds[field_id] = lower_bound;
file->upper_bounds[field_id] = upper_bound;
}

void CommitFiles(const std::vector<std::shared_ptr<DataFile>>& files) {
ICEBERG_UNWRAP_OR_FAIL(auto append, table_->NewFastAppend());
for (const auto& file : files) {
append->AppendFile(file);
}
ASSERT_THAT(append->Commit(), IsOk());
ASSERT_THAT(table_->Refresh(), IsOk());
}

void CommitInitialFiles() { CommitFiles({file_a_, file_b_}); }

void ExpectOneFileDeleted() {
ASSERT_THAT(table_->Refresh(), IsOk());
ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot());
EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kOperation),
DataOperation::kDelete);
EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1");
EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedRecords), "100");
EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kRemovedFileSize), "1024");
}

std::shared_ptr<PartitionSpec> spec_;
std::shared_ptr<Schema> schema_;
std::shared_ptr<DataFile> file_a_;
std::shared_ptr<DataFile> file_b_;

static constexpr int32_t kYFieldId = 2;
};

// Deleting a committed file by its path string removes exactly that file.
TEST_F(DeleteFilesTest, DeleteFileByPath) {
  CommitInitialFiles();

  ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewDeleteFiles());
  update->DeleteFile(file_a_->file_path);

  EXPECT_THAT(update->Commit(), IsOk());
  ExpectOneFileDeleted();
}

// Deleting a committed file through its DataFile handle removes exactly that file.
TEST_F(DeleteFilesTest, DeleteFileByDataFile) {
  CommitInitialFiles();

  ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewDeleteFiles());
  update->DeleteFile(file_a_);

  EXPECT_THAT(update->Commit(), IsOk());
  ExpectOneFileDeleted();
}

// With case sensitivity disabled, a filter on "X" still deletes the one file
// whose partition value is 1.
TEST_F(DeleteFilesTest, DeleteFromRowFilterCaseInsensitive) {
  CommitInitialFiles();

  ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewDeleteFiles());
  // Chained on purpose: CaseSensitive() has to hand back a DeleteFiles&.
  update->CaseSensitive(false).DeleteFromRowFilter(
      Expressions::Equal("X", Literal::Long(1L)));

  EXPECT_THAT(update->Commit(), IsOk());
  ExpectOneFileDeleted();
}

// Committing a DeleteFiles with nothing to delete still produces a new "delete"
// snapshot on top of the previous one, without any deleted-file counters.
TEST_F(DeleteFilesTest, EmptyDeleteCommit) {
  CommitInitialFiles();
  ICEBERG_UNWRAP_OR_FAIL(auto parent, table_->current_snapshot());

  ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewDeleteFiles());

  EXPECT_THAT(update->Commit(), IsOk());

  ASSERT_THAT(table_->Refresh(), IsOk());
  ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot());
  ASSERT_TRUE(snapshot->parent_snapshot_id.has_value());
  EXPECT_EQ(*snapshot->parent_snapshot_id, parent->snapshot_id);

  const auto& summary = snapshot->summary;
  EXPECT_EQ(summary.at(SnapshotSummaryFields::kOperation), DataOperation::kDelete);
  EXPECT_EQ(summary.count(SnapshotSummaryFields::kDeletedDataFiles), 0U);
  EXPECT_EQ(summary.count(SnapshotSummaryFields::kDeletedRecords), 0U);
  EXPECT_EQ(summary.count(SnapshotSummaryFields::kRemovedFileSize), 0U);
}

// A row filter on "x" equal to 1 deletes the one file whose partition value is 1.
TEST_F(DeleteFilesTest, DeleteFromRowFilter) {
  CommitInitialFiles();

  ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewDeleteFiles());
  update->DeleteFromRowFilter(Expressions::Equal("x", Literal::Long(1L)));

  EXPECT_THAT(update->Commit(), IsOk());
  ExpectOneFileDeleted();
}

// A file whose "y" bounds are [0, 10] may or may not contain y == 5, so a
// file-level delete by that filter must fail validation and leave the table's
// current snapshot untouched.
TEST_F(DeleteFilesTest, DeleteFromRowFilterRejectsPartialMatchFile) {
  auto partial_match_file = MakeDataFile("/data/partial_match.parquet",
                                         /*partition_x=*/1L);
  SetLongBounds(partial_match_file, kYFieldId, /*lower=*/0L, /*upper=*/10L);
  CommitFiles({partial_match_file});
  ICEBERG_UNWRAP_OR_FAIL(auto snapshot_before, table_->current_snapshot());

  ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewDeleteFiles());
  update->DeleteFromRowFilter(Expressions::Equal("y", Literal::Long(5L)));

  auto commit_status = update->Commit();
  EXPECT_THAT(commit_status, IsError(ErrorKind::kValidationFailed));
  EXPECT_THAT(commit_status,
              HasErrorMessage("Cannot delete file where some, but not all, rows match "
                              "filter"));
  EXPECT_THAT(commit_status, HasErrorMessage(partial_match_file->file_path));

  // The failed commit must not have advanced the table.
  ASSERT_THAT(table_->Refresh(), IsOk());
  ICEBERG_UNWRAP_OR_FAIL(auto snapshot_after, table_->current_snapshot());
  EXPECT_EQ(snapshot_after->snapshot_id, snapshot_before->snapshot_id);
}

// With ValidateFilesExist() enabled, deleting a path that is not in the table
// makes the commit fail validation.
TEST_F(DeleteFilesTest, ValidateFilesExistRejectsMissingPath) {
  CommitInitialFiles();

  ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewDeleteFiles());
  const auto missing_path = table_location_ + "/data/missing.parquet";
  update->DeleteFile(missing_path).ValidateFilesExist();

  auto commit_status = update->Commit();
  EXPECT_THAT(commit_status, IsError(ErrorKind::kValidationFailed));
  EXPECT_THAT(commit_status, HasErrorMessage("Missing required files to delete"));
}

} // namespace iceberg
8 changes: 8 additions & 0 deletions src/iceberg/transaction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "iceberg/table_requirement.h"
#include "iceberg/table_requirements.h"
#include "iceberg/table_update.h"
#include "iceberg/update/delete_files.h"
#include "iceberg/update/expire_snapshots.h"
#include "iceberg/update/fast_append.h"
#include "iceberg/update/pending_update.h"
Expand Down Expand Up @@ -478,6 +479,13 @@ Result<std::shared_ptr<FastAppend>> Transaction::NewFastAppend() {
return fast_append;
}

/// \brief Create a DeleteFiles update that belongs to this transaction.
///
/// The update is built from this transaction's context (ctx_) and the name of
/// the context's table, then passed to AddUpdate(). A failure in either step is
/// returned instead of the update.
Result<std::shared_ptr<DeleteFiles>> Transaction::NewDeleteFiles() {
  ICEBERG_ASSIGN_OR_RAISE(std::shared_ptr<DeleteFiles> pending_delete,
                          DeleteFiles::Make(ctx_->table->name().name, ctx_));
  ICEBERG_RETURN_UNEXPECTED(AddUpdate(pending_delete));
  return pending_delete;
}

Result<std::shared_ptr<UpdateStatistics>> Transaction::NewUpdateStatistics() {
ICEBERG_ASSIGN_OR_RAISE(std::shared_ptr<UpdateStatistics> update_statistics,
UpdateStatistics::Make(ctx_));
Expand Down
3 changes: 3 additions & 0 deletions src/iceberg/transaction.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ class ICEBERG_EXPORT Transaction : public std::enable_shared_from_this<Transacti
/// \brief Create a new FastAppend to append data files and commit the changes.
Result<std::shared_ptr<FastAppend>> NewFastAppend();

/// \brief Create a new DeleteFiles to delete data files and commit the changes.
///
/// \return The new DeleteFiles update on success, or an error if the update
/// could not be created.
Result<std::shared_ptr<DeleteFiles>> NewDeleteFiles();

/// \brief Create a new SnapshotManager to manage snapshots.
Result<std::shared_ptr<SnapshotManager>> NewSnapshotManager();

Expand Down
1 change: 1 addition & 0 deletions src/iceberg/type_fwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ class Transaction;
class TransactionContext;

/// \brief Update family.
class DeleteFiles;
class ExpireSnapshots;
class FastAppend;
class PendingUpdate;
Expand Down
79 changes: 79 additions & 0 deletions src/iceberg/update/delete_files.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/update/delete_files.h"

#include <memory>
#include <string>
#include <string_view>

#include "iceberg/snapshot.h"
#include "iceberg/transaction.h"
#include "iceberg/util/error_collector.h"
#include "iceberg/util/macros.h"

namespace iceberg {

/// \brief Factory for DeleteFiles.
///
/// \param table_name Name of the table being updated; must not be empty.
/// \param ctx Transaction context the update operates on; must not be null.
/// \return The newly created update, or an error when a precondition fails.
Result<std::unique_ptr<DeleteFiles>> DeleteFiles::Make(
    std::string table_name, std::shared_ptr<TransactionContext> ctx) {
  ICEBERG_PRECHECK(!table_name.empty(), "Table name cannot be empty");
  ICEBERG_PRECHECK(ctx != nullptr, "Cannot create DeleteFiles without a context");
  // Constructed with a plain `new`, as in the original code.
  // NOTE(review): presumably the constructor is not public, which would rule out
  // std::make_unique — confirm against the header.
  std::unique_ptr<DeleteFiles> update(
      new DeleteFiles(std::move(table_name), std::move(ctx)));
  return update;
}

/// \brief Construct a DeleteFiles update.
///
/// Both arguments are moved straight into the MergingSnapshotUpdate base; this
/// class adds no constructor logic of its own.
DeleteFiles::DeleteFiles(std::string table_name, std::shared_ptr<TransactionContext> ctx)
: MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {}

/// \brief Mark the file at the given path for deletion.
///
/// \param path Path of the file to delete; an empty path is rejected with the
/// message "Cannot delete an empty file path".
/// \return Reference to this update, for call chaining.
DeleteFiles& DeleteFiles::DeleteFile(std::string_view path) {
// NOTE(review): the ICEBERG_BUILDER_* macros presumably record the error on the
// builder and return early instead of throwing — confirm against their definition.
ICEBERG_BUILDER_CHECK(!path.empty(), "Cannot delete an empty file path");
// Hand the path to the DeleteByPath helper (defined outside this file).
ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteByPath(path));
return *this;
}

/// \brief Mark the given data file for deletion.
///
/// \param file Data file to delete; it is forwarded unchanged to DeleteDataFile.
/// \return Reference to this update, for call chaining.
DeleteFiles& DeleteFiles::DeleteFile(const std::shared_ptr<DataFile>& file) {
// NOTE(review): no null check happens here; presumably DeleteDataFile validates
// its argument — confirm.
ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDataFile(file));
return *this;
}

/// \brief Delete files selected by a row filter expression.
///
/// \param expr Filter expression; ownership is moved into DeleteByRowFilter.
/// \return Reference to this update, for call chaining.
DeleteFiles& DeleteFiles::DeleteFromRowFilter(std::shared_ptr<Expression> expr) {
ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteByRowFilter(std::move(expr)));
return *this;
}

/// \brief Set whether names are matched case-sensitively.
///
/// Forwards the flag to MergingSnapshotUpdate::CaseSensitive and returns this
/// object as DeleteFiles& so that DeleteFiles-specific calls can be chained.
DeleteFiles& DeleteFiles::CaseSensitive(bool case_sensitive) {
  this->MergingSnapshotUpdate::CaseSensitive(case_sensitive);
  return *this;
}

/// \brief Turn on the existence check for files requested for deletion.
///
/// Only sets a flag; the flag is read later in Validate().
/// \return Reference to this update, for call chaining.
DeleteFiles& DeleteFiles::ValidateFilesExist() {
  this->validate_files_to_delete_exist_ = true;
  return *this;
}

/// \brief Operation name reported by this update: DataOperation::kDelete.
std::string DeleteFiles::operation() {
  std::string op_name = DataOperation::kDelete;
  return op_name;
}

/// \brief Validation hook for this update.
///
/// Both parameters are unused. When ValidateFilesExist() has been called, this
/// invokes FailMissingDeletePaths(); in every case an OK status is returned
/// from here.
Status DeleteFiles::Validate(const TableMetadata&, const std::shared_ptr<Snapshot>&) {
  if (!validate_files_to_delete_exist_) {
    return {};
  }
  // NOTE(review): presumably this arms a later "missing required files" failure
  // rather than failing right here — confirm against MergingSnapshotUpdate.
  FailMissingDeletePaths();
  return {};
}

} // namespace iceberg
Loading
Loading