From d865141e7aad32d78a90b9ad5589986c5c56b429 Mon Sep 17 00:00:00 2001 From: CarlosFelipeOR Date: Thu, 30 Apr 2026 04:07:32 -0300 Subject: [PATCH] DDLWorker: don't recreate replica dirs on every main-loop tick After upstream PR ClickHouse/ClickHouse#92339, DDLWorker::markReplicasActive() unconditionally re-creates /clickhouse/task_queue/replicas/ on every iteration of runMainThread, racing with external recursive cleanup of /clickhouse and deterministically breaking test_replication_without_zookeeper::test_startup_without_zookeeper on antalya-25.8 since 25.8.21 (PR Altinity/ClickHouse#1600). Add a `should_create_dirs` parameter to markReplicasActive and only call createReplicaDirs from real events (initialization/reconnect, or when a new host ID was just observed via host_ids_updated), not on every tick. This restores the pre-#92339 invariant ("create host dirs once at startup/reconnect") while preserving the interserver-IO fix. Generated with assistance from AI model Claude Opus 4.7. See Altinity/ClickHouse#1711. Made-with: Cursor --- src/Databases/DatabaseReplicatedWorker.cpp | 2 +- src/Databases/DatabaseReplicatedWorker.h | 2 +- src/Interpreters/DDLWorker.cpp | 16 ++++++++++++---- src/Interpreters/DDLWorker.h | 6 +++++- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 22cd28737a4b..2b64e2125a6f 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -257,7 +257,7 @@ void DatabaseReplicatedDDLWorker::initializeReplication() active_node_holder = zkutil::EphemeralNodeHolder::existing(active_path, *active_node_holder_zookeeper); } -void DatabaseReplicatedDDLWorker::markReplicasActive(bool reinitialized) +void DatabaseReplicatedDDLWorker::markReplicasActive(bool reinitialized, bool /*should_create_dirs*/) { if (reinitialized || !active_node_holder_zookeeper || active_node_holder_zookeeper->expired()) { diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 49f2575a4f2c..c29472eda618 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -49,7 +49,7 @@ class DatabaseReplicatedDDLWorker : public DDLWorker void initializeReplication() override; void createReplicaDirs(const ZooKeeperPtr &, const NameSet &) override { } - void markReplicasActive(bool reinitialized) override; + void markReplicasActive(bool reinitialized, bool should_create_dirs) override; void initializeLogPointer(const String & processed_entry_name); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 53efd9b77975..eb410ae04357 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -1234,12 +1234,14 @@ void DDLWorker::runMainThread() } if (host_ids_updated.exchange(false)) - markReplicasActive(/*reinitialized=*/false); + markReplicasActive(/*reinitialized=*/false, /*should_create_dirs=*/true); cleanup_event->set(); try { - markReplicasActive(reinitialized); + /// On per-tick invocations (i.e. not reinitialized), do NOT recreate + /// the host_id dirs - that races with external recursive cleanup. + markReplicasActive(reinitialized, /*should_create_dirs=*/reinitialized); } catch (...) { @@ -1314,7 +1316,7 @@ void DDLWorker::createReplicaDirs(const ZooKeeperPtr & zookeeper, const NameSet } } -void DDLWorker::markReplicasActive(bool reinitialized) +void DDLWorker::markReplicasActive(bool reinitialized, bool should_create_dirs) { auto zookeeper = getZooKeeper(); @@ -1356,7 +1358,13 @@ void DDLWorker::markReplicasActive(bool reinitialized) LOG_INFO(log, "Unable to get interserver IO address, error {}", e.what()); } - createReplicaDirs(zookeeper, all_host_ids); + /// Only (re)create the host_id dirs on real events (initialization, reconnect, + /// or a freshly-observed host ID). Doing it on every main-loop tick races + /// against external recursive cleanup of /clickhouse and breaks + /// test_replication_without_zookeeper::test_startup_without_zookeeper. + /// See Altinity/ClickHouse#1711. + if (should_create_dirs) + createReplicaDirs(zookeeper, all_host_ids); if (reinitialized) { diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 46289ee92f89..c1b86ed4b8ff 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -173,7 +173,11 @@ class DDLWorker virtual void initializeReplication(); virtual void createReplicaDirs(const ZooKeeperPtr & zookeeper, const NameSet & host_ids); - virtual void markReplicasActive(bool reinitialized); + /// `should_create_dirs` controls whether `createReplicaDirs` is called. + /// We only create dirs on (re)initialization or when host IDs were just updated - + /// not on every main-loop tick, otherwise it races with external recursive deletion + /// of the DDL queue path (see Altinity/ClickHouse#1711). + virtual void markReplicasActive(bool reinitialized, bool should_create_dirs); void runMainThread(); void runCleanupThread();