diff --git a/Cargo.lock b/Cargo.lock index ca2554e99d88c..b8457cdbc5cf9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1311,9 +1311,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.6.0" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" dependencies = [ "clap_builder", "clap_derive", @@ -1333,9 +1333,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.6.0" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" dependencies = [ "heck", "proc-macro2", @@ -1808,6 +1808,7 @@ dependencies = [ "async-trait", "bytes", "clap", + "criterion", "datafusion", "datafusion-common", "datafusion-proto", @@ -1823,6 +1824,7 @@ dependencies = [ "serde", "serde_json", "snmalloc-rs", + "tempfile", "tokio", "tokio-util", ] @@ -2770,7 +2772,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -2914,7 +2916,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -4179,7 +4181,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -4928,7 +4930,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.59.0", ] [[package]] @@ -5318,7 +5320,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 
0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -6010,7 +6012,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -6955,7 +6957,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -7071,7 +7073,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.6", + "windows-targets", ] [[package]] @@ -7080,16 +7082,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-sys" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" -dependencies = [ - "windows-targets 0.53.5", + "windows-targets", ] [[package]] @@ -7107,31 +7100,14 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", -] - -[[package]] -name = "windows-targets" -version = "0.53.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" -dependencies = [ - "windows-link", - "windows_aarch64_gnullvm 0.53.1", - "windows_aarch64_msvc 
0.53.1", - "windows_i686_gnu 0.53.1", - "windows_i686_gnullvm 0.53.1", - "windows_i686_msvc 0.53.1", - "windows_x86_64_gnu 0.53.1", - "windows_x86_64_gnullvm 0.53.1", - "windows_x86_64_msvc 0.53.1", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] @@ -7149,96 +7125,48 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" - [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" - [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" -[[package]] -name = "windows_i686_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" - [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" - [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_i686_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" - [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" - [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" - [[package]] name = "winnow" version = "1.0.0" diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore index c35b1a7c1944f..e3de51f76e477 100644 --- a/benchmarks/.gitignore +++ b/benchmarks/.gitignore @@ -1,3 +1,5 @@ data -results +data_csv +./results/ venv 
+!sql_benchmarks/**/results/ diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index f82f1c0a03e3d..1e96da498f3d6 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -42,7 +42,8 @@ mimalloc_extended = ["libmimalloc-sys/extended"] arrow = { workspace = true } async-trait = "0.1" bytes = { workspace = true } -clap = { version = "4.5.60", features = ["derive"] } +clap = { version = "4.6.0", features = ["derive", "env"] } +criterion = { workspace = true, features = ["html_reports"] } datafusion = { workspace = true, default-features = true } datafusion-common = { workspace = true, default-features = true } env_logger = { workspace = true } @@ -62,3 +63,8 @@ tokio-util = { version = "0.7.17" } [dev-dependencies] datafusion-proto = { workspace = true } +tempfile = { workspace = true } + +[[bench]] +harness = false +name = "sql" diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index aa1ec477345c6..a331191850eaa 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -41,6 +41,7 @@ BENCHMARK=all DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..} DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data} CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"} +SQL_CARGO_COMMAND=${SQL_CARGO_COMMAND:-"cargo bench --bench sql"} PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true} SIMULATE_LATENCY=${SIMULATE_LATENCY:-false} @@ -685,14 +686,16 @@ run_tpch() { echo "Internal error: Scale factor not specified" exit 1 fi - TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" - - RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json" - echo "RESULTS_FILE: ${RESULTS_FILE}" + FORMAT=$2 echo "Running tpch benchmark..." 
- FORMAT=$2 - debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --scale-factor "${SCALE_FACTOR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} + debug_run env BENCH_NAME=tpch \ + BENCH_SIZE="${SCALE_FACTOR}" \ + PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \ + TPCH_FILE_TYPE="${FORMAT}" \ + SIMULATE_LATENCY="${SIMULATE_LATENCY}" \ + ${QUERY_ARG} \ + bash -c "$SQL_CARGO_COMMAND" } # Runs the tpch in memory (needs tpch parquet data) @@ -702,13 +705,15 @@ run_tpch_mem() { echo "Internal error: Scale factor not specified" exit 1 fi - TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}" - - RESULTS_FILE="${RESULTS_DIR}/tpch_mem_sf${SCALE_FACTOR}.json" - echo "RESULTS_FILE: ${RESULTS_FILE}" echo "Running tpch_mem benchmark..." - # -m means in memory - debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --scale-factor "${SCALE_FACTOR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG} + + debug_run env BENCH_NAME=tpch \ + BENCH_SIZE="${SCALE_FACTOR}" \ + TPCH_FILE_TYPE="mem" \ + PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \ + SIMULATE_LATENCY="${SIMULATE_LATENCY}" \ + ${QUERY_ARG} \ + bash -c "$SQL_CARGO_COMMAND" } # Runs the tpcds benchmark diff --git a/benchmarks/benches/sql.rs b/benchmarks/benches/sql.rs new file mode 100644 index 0000000000000..b055d619f1a7d --- /dev/null +++ b/benchmarks/benches/sql.rs @@ -0,0 +1,338 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use clap::Parser; +use criterion::{Criterion, SamplingMode, criterion_group, criterion_main}; +use datafusion::error::Result; +use datafusion::prelude::SessionContext; +use datafusion_benchmarks::sql_benchmark::SqlBenchmark; +use datafusion_benchmarks::util::{CommonOpt, print_memory_stats}; +use datafusion_common::instant::Instant; +use log::{debug, info}; +use std::collections::BTreeMap; +use std::fs; +use std::sync::Mutex; +use tokio::runtime::Runtime; + +static SQL_BENCHMARK_DIRECTORY: &str = "sql_benchmarks"; + +#[cfg(all(feature = "snmalloc", feature = "mimalloc"))] +compile_error!( + "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time" +); + +#[cfg(feature = "snmalloc")] +#[global_allocator] +static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; + +#[derive(Debug, Parser)] +#[command(ignore_errors = true)] +struct EnvParser { + #[command(flatten)] + options: CommonOpt, + + #[arg( + env = "BENCH_PERSIST_RESULTS", + long = "persist_results", + default_value = "false", + action = clap::ArgAction::SetTrue + )] + persist_results: bool, + + #[arg( + env = "BENCH_VALIDATE", + long = "validate_results", + default_value = "false", + action = clap::ArgAction::SetTrue + )] + validate: bool, + + #[arg(env = "BENCH_NAME")] + name: Option<String>, + + #[arg(env = "BENCH_SUBGROUP")] + subgroup: Option<String>, + + #[arg(env = "BENCH_QUERY")] + query: Option<usize>, +} + +#[cfg(feature = "mimalloc")] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +pub fn sql(c: &mut Criterion) { + env_logger::init(); + + let
start = Instant::now(); + let args = EnvParser::parse(); + let rt = make_tokio_runtime(); + + println!("Loading benchmarks..."); + + let benchmarks = rt.block_on(async { + let ctx = make_ctx(&args).expect("SessionContext creation failed"); + + load_benchmarks(&args, &ctx, SQL_BENCHMARK_DIRECTORY) + .await + .unwrap_or_else(|err| panic!("failed load benchmarks: {err:?}")) + }); + + println!( + "Loaded benchmarks in {} ms ...", + start.elapsed().as_millis() + ); + + for (group, benchmarks) in benchmarks { + let mut group = c.benchmark_group(group); + group.sample_size(10); + group.sampling_mode(SamplingMode::Flat); + + for mut benchmark in benchmarks { + // create a context + let ctx = make_ctx(&args).expect("SessionContext creation failed"); + + // initialize the benchmark. This parses the benchmark file and does any pre-execution + // work such as loading data into tables + rt.block_on(async { + benchmark + .initialize(&ctx) + .await + .expect("initialization failed"); + + // run assertions + benchmark.assert(&ctx).await.expect("assertion failed"); + }); + + let mut name = benchmark.name().to_string(); + if !benchmark.subgroup().is_empty() { + name.push('_'); + name.push_str(benchmark.subgroup()); + } + + let mut benchmark = benchmark.clone(); + + if args.persist_results { + handle_persist(&rt, &ctx, &name, &mut benchmark); + } else if args.validate { + handle_verify(&rt, &ctx, &name, &mut benchmark); + } else { + info!("Running benchmark {name} ..."); + + let name = name.clone(); + group.bench_function(name.clone(), |b| { + b.iter(|| handle_run(&rt, &ctx, &args, &mut benchmark, &name)) + }); + + print_memory_stats(); + + info!("Benchmark {name} completed"); + } + + // run cleanup + rt.block_on(async { + benchmark.cleanup(&ctx).await.expect("Cleanup failed"); + }); + } + + group.finish(); + } +} + +fn handle_run( + rt: &Runtime, + ctx: &SessionContext, + args: &EnvParser, + benchmark: &mut SqlBenchmark, + name: &str, +) { + rt.block_on(async { + benchmark + 
.run(ctx, args.validate) + .await + .unwrap_or_else(|err| panic!("Failed to run benchmark {name}: {err:?}")) + }); +} + +fn handle_persist( + rt: &Runtime, + ctx: &SessionContext, + name: &str, + benchmark: &mut SqlBenchmark, +) { + info!("Running benchmark {name} prior to persisting results ..."); + + rt.block_on(async { + info!("Persisting benchmark {name} ..."); + + benchmark + .persist(ctx) + .await + .expect("Failed to persist results"); + }); + + info!("Persisted benchmark {name} successfully"); +} + +fn handle_verify( + rt: &Runtime, + ctx: &SessionContext, + name: &str, + benchmark: &mut SqlBenchmark, +) { + info!("Verifying benchmark {name} results ..."); + + rt.block_on(async { + benchmark + .run(ctx, true) + .await + .unwrap_or_else(|err| panic!("Failed to run benchmark {name}: {err:?}")); + benchmark + .verify(ctx) + .await + .unwrap_or_else(|err| panic!("Verification failed: {err:?}")); + }); + + info!("Verified benchmark {name} results successfully"); +} + +criterion_group!(benches, sql); +criterion_main!(benches); + +fn make_tokio_runtime() -> Runtime { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() +} + +fn make_ctx(args: &EnvParser) -> Result<SessionContext> { + let config = args.options.config()?; + let rt = args.options.build_runtime()?; + + Ok(SessionContext::new_with_config_rt(config, rt)) +} + +/// Recursively walks the directory tree starting at `path` and +/// calls the callback function for every file encountered. 
+pub fn list_files<F>(path: &str, callback: &mut F) +where + F: FnMut(&str), +{ + let mut entries: Vec<fs::DirEntry> = + fs::read_dir(path).unwrap().filter_map(Result::ok).collect(); + entries.sort_by_key(|entry| entry.path()); + + for dir_entry in entries { + let path = dir_entry.path(); + if path.is_dir() { + // Recurse into the sub‑directory + list_files(&path.to_string_lossy(), callback); + } else { + // For files, invoke the callback with the full path as a string + let full_str = path.to_string_lossy(); + callback(&full_str); + } + } +} + +/// Loads all benchmark files in the `sql_benchmarks` directory. +/// For each file ending with `.benchmark` it creates a new +/// `SqlBenchmark` instance. +async fn load_benchmarks( + args: &EnvParser, + ctx: &SessionContext, + path: &str, +) -> Result<BTreeMap<String, Vec<SqlBenchmark>>> { + let benches = Mutex::new(BTreeMap::new()); + let mut paths = Vec::new(); + + list_files(path, &mut |path: &str| { + if path.ends_with(".benchmark") { + paths.push(path.to_string()); + } + }); + + for path in paths { + debug!("Loading benchmark from {path}"); + + let benchmark = SqlBenchmark::new(ctx, &path, SQL_BENCHMARK_DIRECTORY).await?; + let mut map = benches.lock().unwrap(); + let entries = map.entry(benchmark.group().to_string()).or_insert(vec![]); + + entries.push(benchmark); + } + + let mut benches = benches.into_inner().unwrap(); + + benches = filter_benchmarks(args, benches); + benches.iter_mut().for_each(|(_, benchmarks)| { + benchmarks.sort_by(|b1, b2| b1.name().cmp(b2.name())) + }); + + Ok(benches) +} + +fn filter_benchmarks( + args: &EnvParser, + benchmarks: BTreeMap<String, Vec<SqlBenchmark>>, +) -> BTreeMap<String, Vec<SqlBenchmark>> { + let benchmarks_to_run: BTreeMap<String, Vec<SqlBenchmark>> = match &args.name { + Some(bench_name) => benchmarks + .into_iter() + // first filter to the benchmark we wish to run (corresponds to SqlBenchmark::group()) + .filter(|(key, _val)| key.eq_ignore_ascii_case(bench_name)) + // if provided filter to just the subgroup we wish to run (corresponds to SqlBenchmark::subgroup()) + .map(|(key, val)| { + if let
Some(subgroup) = &args.subgroup { + let mut benches = vec![]; + for bench in val { + if bench.subgroup().eq_ignore_ascii_case(subgroup) { + benches.push(bench.clone()); + } + } + (key, benches) + } else { + (key, val) + } + }) + // if provided filter to just the query number we wish to run (corresponds loosely to SqlBenchmark::name()) + .map(|(key, val)| { + if let Some(query_number) = &args.query { + let padded = if query_number < &10 { + format!("Q{query_number:0>2}") + } else { + format! {"Q{query_number}"} + }; + let mut benches = vec![]; + for bench in val { + if bench.name().eq_ignore_ascii_case(&padded) { + benches.push(bench.clone()); + } + } + (key, benches) + } else { + (key, val) + } + }) + .map(|(key, val)| (key, val.clone())) + .collect(), + None => benchmarks, + }; + + benchmarks_to_run +} diff --git a/benchmarks/sql_benchmarks/README.md b/benchmarks/sql_benchmarks/README.md new file mode 100644 index 0000000000000..5c2cf29c9c93f --- /dev/null +++ b/benchmarks/sql_benchmarks/README.md @@ -0,0 +1,354 @@ + + +# SQL Benchmarks + +This directory contains a collection of benchmarks each driven by a simple '.benchmark' text file and sql queries +that exercise the DataFusion execution engine against a variety of benchmark suites. The sql benchmark framework +is intentionally simple so that new benchmarks and queries can be added without touching the core engine. + +The sql benchmarks are organized in sub‑directories that correspond to the benchmark suites that are commonly used +in the community: + +| Benchmark Suite | Description | +|-----------------------|--------------------------------------------------------------------| +| `clickbench` | ClickBench benchmark | +| `clickbench extended` | 12 additional, more complex queries against the Clickbench dataset | +| `clickbench_sorted` | ClickBench benchmark using a pre-sorted hits file. 
| +| `h2o` | The `h2o` benchmark | +| `hj` | Hash join benchmark | +| `imdb` | IMDb benchmark | +| `nlj` | Nested‑loop join benchmark | +| `smj` | Sort‑merge join benchmark | +| `sort tpch` | Sorting benchmarks against the TPC-H lineitem table | +| `taxi` | NYC taxi dataset benchmark | +| `tpcds` | TPC‑DS queries | +| `tpch` | TPC‑H queries | + +## How it works + +SQL benchmarks are run via cargo's bench command using [criterion](https://docs.rs/criterion/latest/criterion/) +for running and gathering statistics of each sql being benchmarked. For simplicity the benchmarks/bench.sh can +be used to execute the supported benchmarks. + +Each individual benchmark is represented by a `.benchmark` file that contains a number of directives instructing +the tool on how to load data, run initializations, run assertions, run the benchmark, optionally persist and +validate results, and finally run any cleanup if required. + +Variables are supported in two forms: + +* string substitution based on environment variables (with default values if unset): \${ENV_VAR} and + \${ENV_VAR:-default}. +* if / else based on whether an environment variable is true or not (\${ENV_VAR|true value|false value}). In this + form only the value `true` (case-insensitive) selects the true branch; any other set value selects the false branch. + If ENV_VAR is unset the benchmark will return an error. + +Comments in files are supported with lines starting with # or --. + +Many if not most of the benchmarks are set up using templates to reduce duplication across the .benchmark files. 
For +example here is one of the benchmark files for the h2o benchmark suite: + +``` +subgroup groupby + +template sql_benchmarks/h2o/h2o.benchmark.template +QUERY_NUMBER=1 +QUERY_NUMBER_PADDED=01 +``` + +The template directive above defines the subgroup the benchmark is part of, sets two variables (`QUERY_NUMBER` and +`QUERY_NUMBER_PADDED`) and points to a file containing more directives that are shared across the benchmark suite. + +``` +load sql_benchmarks/h2o/init/load_${BENCH_SUBGROUP:-groupby}_${BENCH_SIZE:-small}_${BENCH_FILE_TYPE:-csv}.sql + +name Q${QUERY_NUMBER_PADDED} +group h2o + +run sql_benchmarks/h2o/queries/${BENCH_SUBGROUP:-groupby}/q${QUERY_NUMBER_PADDED}.sql + +result sql_benchmarks/h2o/results/${BENCH_SUBGROUP:-groupby}/${BENCH_SIZE:-small}/q${QUERY_NUMBER_PADDED}.csv +``` + +The above showcases the use of defaults for variables: `${NAME:-default}` + +# Directives + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DirectiveDescription
name + +The name of the benchmark. This will be used as part of the display name used by criterion.

Example:
+
name Q${QUERY_NUMBER_PADDED}
+ +The `name` directive also makes the value available to benchmark-file replacements as `BENCH_NAME`. This is separate +from the `BENCH_NAME` environment variable used to select which benchmark group to run. + +
group + +The group name of the benchmark used for grouping benchmarks together.

Example:
+
group imdb
+ +
subgroup + +The sub group name of the benchmark used for filtering to a specific sub group.

Example:
+
subgroup window
+ +
load + +The load directive called during initialization of the benchmark. If a path to a file is provided on the same +line as the load directive that path will be parsed and any sql statements in that file will be executed during +initialization. If no path is specified the next line is required to be the sql statement to execute.

The +load directive (including any following sql statement) must be followed by a blank line.

Example:
+
load sql_benchmarks/h2o/init/load_${BENCH_SUBGROUP:-groupby}_${BENCH_SIZE:-small}_${BENCH_FILE_TYPE:-csv}.sql
+or +
+load
+CREATE TABLE test AS (SELECT value as key FROM range(1000000) ORDER BY value); +
+ +
init + +The init directive is called after the load directive prior to benchmark execution. If a path to a file is +provided on the same line as the init directive that path will be parsed and any sql statements in that file will be +executed during the benchmark initialization. If no path is specified the next line is required to be the sql statement +to execute.

The init directive (including any following sql statement) must be followed by a blank +line.

Example:
+
+init
+set datafusion.execution.parquet.binary_as_string = true; +
+ +
run + +The run directive called during execution of the benchmark. If a path to a file is provided on the same line as +the run directive that path will be parsed and any sql statements in that file will be executed during the benchmark +run. If no path is specified the next line is required to be the sql statement to execute.

Multiple +statements are allowed within a single run directive, however a benchmark file may contain only one run directive. When +running with `BENCH_PERSIST_RESULTS` or `BENCH_VALIDATE`, only the last `SELECT` or `WITH` statement from that run +directive will be used for comparison.

The run directive (including any following sql statement) must be +followed by a blank line.

Example:
+
run sql_benchmarks/imdb/queries/${QUERY_NUMBER_PADDED}.sql
+ +
cleanup + +The cleanup directive is called after all other directives and can be used to cleanup after the benchmark - +e.g. to drop tables. If a path to a file is provided on the same line as the cleanup directive that path will be parsed +and any sql statements in that file will be executed during cleanup. If no path is specified the next line is +required to be the sql statement to execute.

The cleanup directive (including any following sql statement) +must be followed by a blank line.

Example:
+
+cleanup
+DROP TABLE test; +
+ +
expect_plan + +The expect_plan directive will check the physical plan for the string provided on the same line. This +can be used to validate that a particular join was used.

Example:
+
expect_plan NestedLoopJoinExec
+ +
assert + +The assert directive is run between the init and run directives and can be used to validate system state correctness +prior to running the benchmark sql. The format is +
+assert II
+SELECT name, value = 3 FROM information_schema.df_settings WHERE name IN ('datafusion.execution.target_partitions', 'datafusion.execution.planning_concurrency');
+----
+datafusion.execution.planning_concurrency true
+datafusion.execution.target_partitions true
+
+ +The number of I's corresponds to the number of columns in the result. The expected results can be either tab delimited +or pipe delimited. + +
result_query + +The result_query directive is run during the verify phase and can be used to verify a different set of results than any +that might come from queries executed from the `run` directive. The format is the same as the `assert` directive +above.

Example: +
+result_query III
+SELECT COUNT(DISTINCT id2), SUM(r4), COUNT(*) FROM answer;
+----
+123 345 45 +
+ + +Note that the results of the run query are not automatically stored into a table in datafusion. If you want to +verify a result from queries executed from the `run` directive those queries will have to be saved to a table directly +using `CREATE TABLE AS (..)` or similar. + +
result + +The result directive declares the expected result file used during verification. A path to a file is required on the +same line as the result directive. The file is parsed when the benchmark file is loaded, and must be a pipe-delimited +CSV file with a header row. During verification, these expected rows are compared with the rows produced by the last +saved `SELECT` or `WITH` statement from the `run` directive.

Example:
+
+result sql_benchmarks/imdb/results/${QUERY_NUMBER_PADDED}.csv +
+ +
template + +The template directive allows for inclusion of another file in a benchmark file. A path to a file is +required on the same line as the template directive which will be parsed as a benchmark file. Parameters can be passed +to the template file using the format `KEY=value`, one per line after the template directive followed by a blank line. +

Example:
+
+template sql_benchmarks/smj/smj.benchmark.template
+QUERY_NUMBER=1
+QUERY_NUMBER_PADDED=01 +
+ +
includeThe include directive is similar to the template directive except that it does not support parameters.
echo + +The echo directive allows for echoing a string to stdout during the execution of the benchmark and may be useful for +debugging.

Example:
+
+echo The value for batch size is ${BATCH_SIZE:-8192} +
+ +
+ +# Run a single benchmark suite + +```shell +BENCH_NAME=tpch cargo bench --bench sql +``` + +As you can see above the actual benchmark suite to run is set via an environment variable. Cargo's bench command and +criterion have an unfortunate limitation in that custom command arguments cannot be passed +into a benchmark. The alternative is to use environment variables to pass in arguments which is what is used here. +The SQL benchmarking tool uses the following environment variables: + +| Environment Variable | Description | +|---------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| BENCH_NAME | The name of the benchmark suite to run. For example 'imdb'. This should correspond to a directory name in the `sql_benchmarks` directory. | +| BENCH_SUBGROUP | The subgroup with the benchmark suite to run. For example 'window' to run the window subgroup of the h2o benchmark. | +| BENCH_QUERY | A query number to run. | +| BENCH_PERSIST_RESULTS | true/false to persist benchmark results. Results will be persisted in csv format so be cognizant of the size of the results. | +| BENCH_VALIDATE | true/false to validate benchmark results against persisted results or result_query's. If both `BENCH_PERSIST_RESULTS` and `BENCH_VALIDATE` are true, persist mode runs and validation is skipped. | +| SIMULATE_LATENCY | Simulate object store latency to mimic remote storage (e.g. S3). Adds random latency in the range 20-200ms to each object store operation. | +| PARTITIONS | Number of partitions to process in parallel. Defaults to number of available cores. | +| BATCH_SIZE | Batch size when reading CSV or Parquet files. | +| MEM_POOL_TYPE | The memory pool type to use, should be one of "fair" or "greedy". | +| MEMORY_LIMIT | Memory limit (e.g. '100M', '1.5G'). 
If not specified, run all pre-defined memory limits for given query if there's any, otherwise run with no memory limit. | | DATAFUSION_RUNTIME_MEMORY_LIMIT | Used if MEMORY_LIMIT is not set. | | SORT_SPILL_RESERVATION_BYTES | The amount of memory to reserve for sort spill operations. DataFusion's default value will be used if not specified. | + +Example – Run the H2O window benchmarks on the 'small' sized CSV data files: + +``` bash +export BENCH_NAME=h2o +export BENCH_SUBGROUP=window +export H2O_BENCH_SIZE=small +export H2O_FILE_TYPE=csv +cargo bench --bench sql +``` + +Some benchmarks use custom environment variables as outlined below: + +| Name | Description | Default value | +|------------------------------|--------------------------------------------------------------------------------------------------------------------------|---------------| +| BENCH_SIZE | Used in the tpch, sort-tpch and tpcds benchmarks. The size corresponds to the scale factor. | `1` | +| TPCH_FILE_TYPE | Used in the tpch benchmark to specify which file type to query against. The valid options are `csv`, `parquet` and `mem` | `parquet` | +| H2O_FILE_TYPE | Used in the h2o benchmark to specify which file type to query against. The valid options are `csv` and `parquet` | `csv` | +| CLICKBENCH_TYPE | The type of partitioning for the clickbench benchmark. Valid options are `single` and `partitioned` | `single` | +| H2O_BENCH_SIZE | Used in the h2o benchmark. The valid options are `small`, `medium` and `big` | `small` | +| PREFER_HASH_JOIN | Control datafusion's config option `datafusion.optimizer.prefer_hash_join` | true | +| HASH_JOIN_BUFFERING_CAPACITY | Control datafusion's config option `datafusion.execution.hash_join_buffering_capacity` | 0 | +| BENCH_SORTED | Used in the sort_tpch benchmark to indicate whether the lineitem table should be sorted. | false | +| SORTED_BY | Used in the clickbench_sorted benchmark to indicate the column to sort by. 
| `EventTime` | +| SORTED_ORDER | Used in the clickbench_sorted benchmark to indicate the sort order of the column. | `ASC` | + +# Extending an existing benchmark suite + +If you want to add a new query: + +* Create a new qXX.sql in the corresponding queries folder of the benchmark. +* Add a new qXX.benchmark that references the appropriate template (clickbench.benchmark.template, + h2o.benchmark.template, + etc.). +* (Optional) Add a new entry to the suite’s load script if the data set is different. +* (Optional) Manually create a result csv to be compared against benchmark results during verification. + +# Adding a new benchmark suite + +* Create a new directory named for the new benchmark suite. +* Within there create a `.benchmark` for each individual benchmark. +* Populate the benchmark with directives as described above. Use the other benchmarks as examples for standardization. +* No rust files need to be updated to run the new benchmark suite. diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q01.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q01.benchmark new file mode 100644 index 0000000000000..32d2764ec85c9 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q01.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=1 +QUERY_NUMBER_PADDED=01 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q02.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q02.benchmark new file mode 100644 index 0000000000000..7948d62edc458 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q02.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=2 +QUERY_NUMBER_PADDED=02 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q03.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q03.benchmark new file mode 100644 index 0000000000000..9b12b64558e84 --- /dev/null +++ 
b/benchmarks/sql_benchmarks/tpch/benchmarks/q03.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=3 +QUERY_NUMBER_PADDED=03 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q04.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q04.benchmark new file mode 100644 index 0000000000000..01f425479542e --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q04.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=4 +QUERY_NUMBER_PADDED=04 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q05.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q05.benchmark new file mode 100644 index 0000000000000..32a9a1ddfcffc --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q05.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=5 +QUERY_NUMBER_PADDED=05 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q06.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q06.benchmark new file mode 100644 index 0000000000000..dc5dd1d9c81f0 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q06.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=6 +QUERY_NUMBER_PADDED=06 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q07.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q07.benchmark new file mode 100644 index 0000000000000..29488724defc0 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q07.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=7 +QUERY_NUMBER_PADDED=07 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q08.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q08.benchmark new file mode 100644 index 0000000000000..3e452c0519aff --- /dev/null +++ 
b/benchmarks/sql_benchmarks/tpch/benchmarks/q08.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=8 +QUERY_NUMBER_PADDED=08 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q09.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q09.benchmark new file mode 100644 index 0000000000000..f826789f04322 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q09.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=9 +QUERY_NUMBER_PADDED=09 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q10.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q10.benchmark new file mode 100644 index 0000000000000..9c3befcd5abb2 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q10.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=10 +QUERY_NUMBER_PADDED=10 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q11.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q11.benchmark new file mode 100644 index 0000000000000..bbade5b976bee --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q11.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=11 +QUERY_NUMBER_PADDED=11 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q12.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q12.benchmark new file mode 100644 index 0000000000000..a02a325679cb3 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q12.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=12 +QUERY_NUMBER_PADDED=12 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q13.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q13.benchmark new file mode 100644 index 0000000000000..a9842ac40cb34 --- /dev/null +++ 
b/benchmarks/sql_benchmarks/tpch/benchmarks/q13.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=13 +QUERY_NUMBER_PADDED=13 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q14.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q14.benchmark new file mode 100644 index 0000000000000..a88b88e0cc6ff --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q14.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=14 +QUERY_NUMBER_PADDED=14 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q15.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q15.benchmark new file mode 100644 index 0000000000000..85a2883738e66 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q15.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=15 +QUERY_NUMBER_PADDED=15 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q16.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q16.benchmark new file mode 100644 index 0000000000000..0404224cd426c --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q16.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=16 +QUERY_NUMBER_PADDED=16 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q17.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q17.benchmark new file mode 100644 index 0000000000000..ca31566a4c830 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q17.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=17 +QUERY_NUMBER_PADDED=17 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q18.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q18.benchmark new file mode 100644 index 0000000000000..e36015ee8c71b --- /dev/null +++ 
b/benchmarks/sql_benchmarks/tpch/benchmarks/q18.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=18 +QUERY_NUMBER_PADDED=18 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q19.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q19.benchmark new file mode 100644 index 0000000000000..375fc70f577ca --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q19.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=19 +QUERY_NUMBER_PADDED=19 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q20.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q20.benchmark new file mode 100644 index 0000000000000..b2787fcc231a5 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q20.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=20 +QUERY_NUMBER_PADDED=20 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q21.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q21.benchmark new file mode 100644 index 0000000000000..f8a17dc54c9bc --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q21.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=21 +QUERY_NUMBER_PADDED=21 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/benchmarks/q22.benchmark b/benchmarks/sql_benchmarks/tpch/benchmarks/q22.benchmark new file mode 100644 index 0000000000000..927bb8e1bb145 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/benchmarks/q22.benchmark @@ -0,0 +1,4 @@ + +template sql_benchmarks/tpch/tpch.benchmark.template +QUERY_NUMBER=22 +QUERY_NUMBER_PADDED=22 \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/init/load_csv.sql b/benchmarks/sql_benchmarks/tpch/init/load_csv.sql new file mode 100644 index 0000000000000..390de12c64881 --- /dev/null +++ 
b/benchmarks/sql_benchmarks/tpch/init/load_csv.sql @@ -0,0 +1,99 @@ +CREATE EXTERNAL TABLE nation +( + n_nationkey INT, + n_name CHAR(25), + n_regionkey INT, + n_comment VARCHAR(152), + PRIMARY KEY (n_nationkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/nation/nation.1.csv'; + +CREATE EXTERNAL TABLE region +( + r_regionkey INT, + r_name CHAR(25), + r_comment VARCHAR(152), + PRIMARY KEY (r_regionkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/region/region.1.csv'; + +CREATE EXTERNAL TABLE supplier +( + s_suppkey INT, + s_name CHAR(25), + s_address VARCHAR(40), + s_nationkey INT, + s_phone CHAR(15), + s_acctbal DECIMAL(15, 2), + s_comment VARCHAR(101), + PRIMARY KEY (s_suppkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/supplier/supplier.1.csv'; + +CREATE EXTERNAL TABLE customer +( + c_custkey INT, + c_name VARCHAR(25), + c_address VARCHAR(40), + c_nationkey INT, + c_phone CHAR(15), + c_acctbal DECIMAL(15, 2), + c_mktsegment CHAR(10), + c_comment VARCHAR(117), + PRIMARY KEY (c_custkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/customer/customer.1.csv'; + +CREATE EXTERNAL TABLE part +( + p_partkey INT, + p_name VARCHAR(55), + p_mfgr CHAR(25), + p_brand CHAR(10), + p_type VARCHAR(25), + p_size INT, + p_container CHAR(10), + p_retailprice DECIMAL(15, 2), + p_comment VARCHAR(23), + PRIMARY KEY (p_partkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/part/part.1.csv'; + +CREATE EXTERNAL TABLE partsupp +( + ps_partkey INT, + ps_suppkey INT, + ps_availqty INT, + ps_supplycost DECIMAL(15, 2), + ps_comment VARCHAR(199), + PRIMARY KEY (ps_partkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/partsupp/partsupp.1.csv'; + +CREATE EXTERNAL TABLE orders +( + o_orderkey INT, + o_custkey INT, + o_orderstatus CHAR(1), + o_totalprice DECIMAL(15, 2), + o_orderdate DATE, + o_orderpriority CHAR(15), + o_clerk CHAR(15), + o_shippriority INT, + o_comment VARCHAR(79), + PRIMARY KEY 
(o_orderkey) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/order/orders.1.csv'; + +CREATE EXTERNAL TABLE lineitem +( + l_orderkey INT, + l_partkey INT, + l_suppkey INT, + l_linenumber INT, + l_quantity DECIMAL(15, 2), + l_extendedprice DECIMAL(15, 2), + l_discount DECIMAL(15, 2), + l_tax DECIMAL(15, 2), + l_returnflag CHAR(1), + l_linestatus CHAR(1), + l_shipdate DATE, + l_commitdate DATE, + l_receiptdate DATE, + l_shipinstruct CHAR(25), + l_shipmode CHAR(10), + l_comment VARCHAR(44) +) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/lineitem/lineitem.1.csv'; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/init/load_mem.sql b/benchmarks/sql_benchmarks/tpch/init/load_mem.sql new file mode 100644 index 0000000000000..57d12c22f0c52 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/init/load_mem.sql @@ -0,0 +1,31 @@ +CREATE EXTERNAL TABLE nation_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet'; + +CREATE EXTERNAL TABLE region_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet'; + +CREATE EXTERNAL TABLE supplier_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet'; + +CREATE EXTERNAL TABLE customer_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet'; + +CREATE EXTERNAL TABLE part_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet'; + +CREATE EXTERNAL TABLE partsupp_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet'; + +CREATE EXTERNAL TABLE orders_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet'; + +CREATE EXTERNAL TABLE lineitem_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet'; + +CREATE TABLE nation as SELECT * FROM nation_raw; + +CREATE TABLE region as SELECT * FROM region_raw; + +CREATE TABLE supplier as SELECT * FROM supplier_raw; 
+ +CREATE TABLE customer as SELECT * FROM customer_raw; + +CREATE TABLE part as SELECT * FROM part_raw; + +CREATE TABLE partsupp as SELECT * FROM partsupp_raw; + +CREATE TABLE orders as SELECT * FROM orders_raw; + +CREATE TABLE lineitem as SELECT * FROM lineitem_raw; diff --git a/benchmarks/sql_benchmarks/tpch/init/load_parquet.sql b/benchmarks/sql_benchmarks/tpch/init/load_parquet.sql new file mode 100644 index 0000000000000..172a03d82a2cf --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/init/load_parquet.sql @@ -0,0 +1,15 @@ +CREATE EXTERNAL TABLE nation STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet'; + +CREATE EXTERNAL TABLE region STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet'; + +CREATE EXTERNAL TABLE supplier STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet'; + +CREATE EXTERNAL TABLE customer STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet'; + +CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet'; + +CREATE EXTERNAL TABLE partsupp STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet'; + +CREATE EXTERNAL TABLE orders STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet'; + +CREATE EXTERNAL TABLE lineitem STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet'; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/init/set_config.sql b/benchmarks/sql_benchmarks/tpch/init/set_config.sql new file mode 100644 index 0000000000000..00457e2bca1ef --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/init/set_config.sql @@ -0,0 +1,3 @@ +set datafusion.optimizer.prefer_hash_join=${PREFER_HASH_JOIN:-true}; + +set datafusion.execution.hash_join_buffering_capacity=${HASH_JOIN_BUFFERING_CAPACITY:-0}; diff --git a/benchmarks/sql_benchmarks/tpch/queries/q01.sql 
b/benchmarks/sql_benchmarks/tpch/queries/q01.sql new file mode 100644 index 0000000000000..b63b5c6b0ce47 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q01.sql @@ -0,0 +1,16 @@ +select l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from lineitem +where l_shipdate <= date '1998-12-01' - interval '90' day +group by l_returnflag, + l_linestatus +order by l_returnflag, + l_linestatus; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q02.sql b/benchmarks/sql_benchmarks/tpch/queries/q02.sql new file mode 100644 index 0000000000000..68e478f65d3f9 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q02.sql @@ -0,0 +1,44 @@ +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 15 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' +) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey +limit 100; diff --git a/benchmarks/sql_benchmarks/tpch/queries/q03.sql b/benchmarks/sql_benchmarks/tpch/queries/q03.sql new file mode 100644 index 0000000000000..e5fa9e38664c3 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q03.sql @@ -0,0 +1,23 @@ +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + 
o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-15' + and l_shipdate > date '1995-03-15' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate +limit 10; diff --git a/benchmarks/sql_benchmarks/tpch/queries/q04.sql b/benchmarks/sql_benchmarks/tpch/queries/q04.sql new file mode 100644 index 0000000000000..74a620dbc8a6d --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q04.sql @@ -0,0 +1,21 @@ +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= '1993-07-01' + and o_orderdate < date '1993-07-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q05.sql b/benchmarks/sql_benchmarks/tpch/queries/q05.sql new file mode 100644 index 0000000000000..fce03f08affc9 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q05.sql @@ -0,0 +1,19 @@ +select n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from customer, + orders, + lineitem, + supplier, + nation, + region +where c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= date '1994-01-01' + and o_orderdate < date '1994-01-01' + interval '1' year +group by n_name +order by revenue desc; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q06.sql b/benchmarks/sql_benchmarks/tpch/queries/q06.sql new file mode 100644 index 0000000000000..59267e041fdec --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q06.sql @@ -0,0 +1,6 @@ +select sum(l_extendedprice * 
l_discount) as revenue +from lineitem +where l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + and l_discount between 0.06 - 0.01 and 0.06 + 0.01 + and l_quantity < 24; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q07.sql b/benchmarks/sql_benchmarks/tpch/queries/q07.sql new file mode 100644 index 0000000000000..512e5be55a2d9 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q07.sql @@ -0,0 +1,39 @@ +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') + or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year; diff --git a/benchmarks/sql_benchmarks/tpch/queries/q08.sql b/benchmarks/sql_benchmarks/tpch/queries/q08.sql new file mode 100644 index 0000000000000..6ddb2a6747589 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q08.sql @@ -0,0 +1,37 @@ +select + o_year, + sum(case + when nation = 'BRAZIL' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey 
+ and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'ECONOMY ANODIZED STEEL' + ) as all_nations +group by + o_year +order by + o_year; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q09.sql b/benchmarks/sql_benchmarks/tpch/queries/q09.sql new file mode 100644 index 0000000000000..587bbc8a207ff --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q09.sql @@ -0,0 +1,32 @@ +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%green%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q10.sql b/benchmarks/sql_benchmarks/tpch/queries/q10.sql new file mode 100644 index 0000000000000..18ebcf127bed5 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q10.sql @@ -0,0 +1,33 @@ +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-10-01' + and o_orderdate < date '1993-10-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc +limit 20; diff --git a/benchmarks/sql_benchmarks/tpch/queries/q11.sql 
b/benchmarks/sql_benchmarks/tpch/queries/q11.sql new file mode 100644 index 0000000000000..f6e4980269a9c --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q11.sql @@ -0,0 +1,27 @@ +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * (0.0001/${BENCH_SIZE:-1}) + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + ) +order by + value desc; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q12.sql b/benchmarks/sql_benchmarks/tpch/queries/q12.sql new file mode 100644 index 0000000000000..1f58bda94fbad --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q12.sql @@ -0,0 +1,25 @@ +select l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from lineitem + join + orders + on + l_orderkey = o_orderkey +where l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and l_receiptdate < date '1994-01-01' + interval '1' year +group by l_shipmode +order by l_shipmode; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q13.sql b/benchmarks/sql_benchmarks/tpch/queries/q13.sql new file mode 100644 index 0000000000000..4bfe8c35553cb --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q13.sql @@ -0,0 +1,20 @@ +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not 
like '%special%requests%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q14.sql b/benchmarks/sql_benchmarks/tpch/queries/q14.sql new file mode 100644 index 0000000000000..15db66388bf43 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q14.sql @@ -0,0 +1,10 @@ +select 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from lineitem, + part +where l_partkey = p_partkey + and l_shipdate >= date '1995-09-01' + and l_shipdate < date '1995-09-01' + interval '1' month;; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q15.sql b/benchmarks/sql_benchmarks/tpch/queries/q15.sql new file mode 100644 index 0000000000000..a4d73d5e8d65a --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q15.sql @@ -0,0 +1,33 @@ +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1996-01-01' + and l_shipdate < date '1996-01-01' + interval '3' month + group by + l_suppkey; + +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey; + +drop view revenue0; diff --git a/benchmarks/sql_benchmarks/tpch/queries/q16.sql b/benchmarks/sql_benchmarks/tpch/queries/q16.sql new file mode 100644 index 0000000000000..36b7c07c164a2 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q16.sql @@ -0,0 +1,30 @@ +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM POLISHED%' 
+ and p_size in (49, 14, 23, 45, 19, 3, 36, 9) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' +) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q17.sql b/benchmarks/sql_benchmarks/tpch/queries/q17.sql new file mode 100644 index 0000000000000..1e65550634fa2 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q17.sql @@ -0,0 +1,17 @@ +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container = 'MED BOX' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey +); \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q18.sql b/benchmarks/sql_benchmarks/tpch/queries/q18.sql new file mode 100644 index 0000000000000..ba7ee7f716cf1 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q18.sql @@ -0,0 +1,33 @@ +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 300 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate +limit 100; diff --git a/benchmarks/sql_benchmarks/tpch/queries/q19.sql b/benchmarks/sql_benchmarks/tpch/queries/q19.sql new file mode 100644 index 0000000000000..56668e73f86f6 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q19.sql @@ -0,0 +1,35 @@ +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#12' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM 
PKG') + and l_quantity >= 1 and l_quantity <= 1 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 10 and l_quantity <= 10 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ); \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q20.sql b/benchmarks/sql_benchmarks/tpch/queries/q20.sql new file mode 100644 index 0000000000000..dd61a7d8e6ea1 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q20.sql @@ -0,0 +1,37 @@ +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'forest%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'CANADA' +order by + s_name; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/queries/q21.sql b/benchmarks/sql_benchmarks/tpch/queries/q21.sql new file mode 100644 index 0000000000000..b95e7b0dfca02 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q21.sql @@ -0,0 +1,40 @@ +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + 
and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name +limit 100; diff --git a/benchmarks/sql_benchmarks/tpch/queries/q22.sql b/benchmarks/sql_benchmarks/tpch/queries/q22.sql new file mode 100644 index 0000000000000..90aea6fd74f5c --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/queries/q22.sql @@ -0,0 +1,37 @@ +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('13', '31', '23', '29', '30', '18', '17') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode; \ No newline at end of file diff --git a/benchmarks/sql_benchmarks/tpch/tpch.benchmark.template b/benchmarks/sql_benchmarks/tpch/tpch.benchmark.template new file mode 100644 index 0000000000000..b48d212f84534 --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/tpch.benchmark.template @@ -0,0 +1,18 @@ +name Q${QUERY_NUMBER_PADDED} +group tpch +subgroup sf${BENCH_SIZE:-1} + +init sql_benchmarks/tpch/init/set_config.sql + +echo Loading tpch sf${BENCH_SIZE:-1} ${TPCH_FILE_TYPE:-parquet} data + +load sql_benchmarks/tpch/init/load_${TPCH_FILE_TYPE:-parquet}.sql + +assert I +SELECT COUNT(*) > 0 from lineitem; +---- +true + +run 
sql_benchmarks/tpch/queries/q${QUERY_NUMBER_PADDED}.sql + +result sql_benchmarks/tpch/results/sf${BENCH_SIZE:-1}/q${QUERY_NUMBER_PADDED}.csv diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs index 7e2196e89e592..7c598e65d824c 100644 --- a/benchmarks/src/lib.rs +++ b/benchmarks/src/lib.rs @@ -25,6 +25,7 @@ pub mod nlj; pub mod smj; pub mod sort_pushdown; pub mod sort_tpch; +pub mod sql_benchmark; pub mod tpcds; pub mod tpch; pub mod util; diff --git a/benchmarks/src/sql_benchmark.rs b/benchmarks/src/sql_benchmark.rs new file mode 100644 index 0000000000000..96058cd08a9a4 --- /dev/null +++ b/benchmarks/src/sql_benchmark.rs @@ -0,0 +1,3365 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::{Array, RecordBatch}; +use arrow::datatypes::*; +use arrow::error::ArrowError; +use arrow::util::display::{ArrayFormatter, FormatOptions}; +use datafusion::dataframe::DataFrameWriteOptions; +use datafusion::datasource::MemTable; +use datafusion::physical_plan::execute_stream; +use datafusion::prelude::{CsvReadOptions, DataFrame, SessionContext}; +use datafusion_common::config::CsvOptions; +use datafusion_common::{DataFusionError, Result, exec_datafusion_err}; +use futures::StreamExt; +use log::{debug, info, trace}; +use regex::Regex; +use std::collections::HashMap; +use std::fmt::Debug; +use std::fs::{self, File, OpenOptions}; +use std::io::{BufRead, BufReader}; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, LazyLock}; + +/// A collection of benchmark configurations and state used by the DataFusion +/// sql test harness. Each benchmark is defined by a file that can contain +/// directives such as `load`, `run`, `assert`, `result`, etc. The +/// `SqlBenchmark` struct holds the parsed data from that file and +/// the impl provides methods to run, assert, persist, verify and cleanup benchmark +/// results. +#[derive(Debug, Clone)] +pub struct SqlBenchmark { + /// Human‑readable name of the benchmark. + name: String, + /// Top‑level group name (derived from the file path or defined in a benchmark). + group: String, + /// Subgroup name, often a logical grouping. + subgroup: String, + /// Full path to the benchmark file. + benchmark_path: PathBuf, + /// Mapping of placeholder keys to concrete values (e.g. `"BENCHMARK_DIR"`). + replacement_mapping: HashMap, + /// Expected string that must appear in the physical plan of the queries. + expect: Vec, + /// All SQL queries grouped by directive (`load`, `run`, etc.). + queries: HashMap>, + /// Queries whose results are persisted to disk for later comparison. + result_queries: Vec, + /// Queries whose results are asserted against an expected table. 
+ assert_queries: Vec, + /// Flag indicating whether the benchmark has been fully loaded + is_loaded: bool, + /// Stores the last run results if needed so they can be compared or persisted. + last_results: Option>, + /// echo statements + echo: Vec, +} + +impl SqlBenchmark { + pub async fn new( + ctx: &SessionContext, + full_path: impl AsRef, + benchmark_directory: impl AsRef, + ) -> Result { + let full_path = full_path.as_ref(); + let benchmark_directory = benchmark_directory.as_ref(); + let group_name = parse_group_from_path(full_path, benchmark_directory); + let mut bm = Self { + name: String::new(), + group: group_name, + subgroup: String::new(), + benchmark_path: full_path.to_path_buf(), + replacement_mapping: HashMap::new(), + expect: vec![], + queries: HashMap::new(), + result_queries: vec![], + assert_queries: vec![], + is_loaded: false, + last_results: None, + echo: vec![], + }; + bm.replacement_mapping.insert( + "BENCHMARK_DIR".to_string(), + benchmark_directory.to_string_lossy().into_owned(), + ); + + let path = bm.benchmark_path.clone(); + bm.process_file(ctx, &path).await?; + + Ok(bm) + } + + /// Initializes the benchmark by executing `load` and `init` queries. + /// + /// Registers any required tables or sets up state in the provided + /// `SessionContext` before running queries. This method is idempotent: + /// calling it multiple times on the same instance returns + /// immediately after the first successful initialization. + /// + /// # Errors + /// Returns an error if any `load` or `init` query fails, or if the + /// benchmark file does not contain a `run` query. 
+ pub async fn initialize(&mut self, ctx: &SessionContext) -> Result<()> { + if self.is_loaded { + return Ok(()); + } + + let path = self.benchmark_path.to_string_lossy().into_owned(); + + // validate there was a run query + if !self.queries.contains_key(&QueryDirective::Run) { + return Err(exec_datafusion_err!( + "Invalid benchmark file: no \"run\" query specified: {path}" + )); + } + + // display any echo's + self.echo.iter().for_each(|txt| println!("{txt}")); + + let load_queries = self.queries.get(&QueryDirective::Load); + + if let Some(queries) = load_queries { + for query in queries { + debug!("Executing load query {query}"); + ctx.sql(query).await?.collect().await?; + } + } + + let init_queries = self.queries.get(&QueryDirective::Init); + + if let Some(queries) = init_queries { + for query in queries { + debug!("Executing init query {query}"); + ctx.sql(query).await?.collect().await?; + } + } + + self.is_loaded = true; + + Ok(()) + } + + /// Executes the `assert` queries and compares actual results against + /// expected values. + /// + /// Each `assert` query must be followed by a result table (separated by + /// `----`) in the benchmark file. The assertion passes only if the + /// returned record batches exactly match the expected rows. + /// + /// # Errors + /// Returns an error if any `assert` query fails, or if the actual and + /// expected results differ in row count or cell values. + pub async fn assert(&mut self, ctx: &SessionContext) -> Result<()> { + info!("Running assertions..."); + + for assert_query in &self.assert_queries { + let query = &assert_query.query; + + info!("Executing assert query {query}"); + + let result = ctx.sql(query).await?.collect().await?; + let formatted_actual_results = format_record_batches(&result)?; + + Self::compare_results( + assert_query, + &formatted_actual_results, + &assert_query.expected_result, + )?; + } + + Ok(()) + } + + /// Executes the `run` queries, optionally saving results for later + /// verification. 
If there are multiple queries only the results for + /// the last query are saved. + /// + /// When `save_results` is `true`, it collects `SELECT`/`WITH` query + /// results and stores them in `last_results`. + /// + /// When `save_results` is `false`, it streams results and counts rows + /// without buffering them. + /// + /// If an 'expect' string is defined this method also validates that + /// the physical plan contains that string. + /// + /// # Errors + /// Returns an error if a `run` query fails or if expected plan strings + /// are not found. + pub async fn run(&mut self, ctx: &SessionContext, save_results: bool) -> Result<()> { + let run_queries = self + .queries + .get(&QueryDirective::Run) + .ok_or_else(|| exec_datafusion_err!("Run query should be loaded by now"))?; + + let mut result_count = 0; + + let result: Vec = { + let mut local_result = vec![]; + + for query in run_queries { + match save_results { + true => { + debug!( + "Running query (saving results) {}-{}: {query}", + self.group, self.subgroup + ); + + let df = ctx.sql(query).await?; + if !self.expect.is_empty() { + let physical_plan = df.create_physical_plan().await?; + self.validate_expected_plan(&physical_plan)?; + } + + let result_schema = Arc::new(df.schema().as_arrow().clone()); + let mut batches = df.collect().await?; + let trimmed = query.trim_start(); + + // save the output for select/with queries + if starts_with_ignore_ascii_case(trimmed, "select") + || starts_with_ignore_ascii_case(trimmed, "with") + { + if batches.is_empty() { + batches.push(RecordBatch::new_empty(result_schema)); + } + let row_count_for_query = + batches.iter().map(RecordBatch::num_rows).sum::(); + debug!( + "Persisting {} batches ({} rows)...", + batches.len(), + row_count_for_query + ); + + result_count = row_count_for_query; + local_result = batches; + } + } + false => { + debug!( + "Running query (ignoring results) {}-{}: {query}", + self.group, self.subgroup + ); + + result_count = self + 
.execute_sql_without_result_buffering(query, ctx) + .await?; + } + } + } + + Ok::, DataFusionError>(local_result) + }?; + + debug!("Results have {result_count} rows"); + + // Store results for verification + self.last_results = Some(result); + + Ok(()) + } + + /// Calls run and persists results to disk as a CSV file. + /// + /// Requires that the benchmark defines a `result` or `result_query`. + /// Registers the results in a memory table and writes them to disk with + /// pipe delimiters and a header row. + /// + /// # Errors + /// Returns an error if no results are available or if writing to the + /// target path fails. + pub async fn persist(&mut self, ctx: &SessionContext) -> Result<()> { + self.run(ctx, true).await?; + + // Check if we have result queries to persist for + if self.result_queries.is_empty() { + info!("No result paths to persist"); + return Ok(()); + } + + let results = self + .last_results + .as_ref() + .expect("run should store last_results after successful execution"); + + let query = &self.result_queries[0]; + let path = query.path.as_ref().ok_or_else(|| { + exec_datafusion_err!( + "Unable to persist results from query '{}', no result specified", + query.query + ) + })?; + + info!("Persisting results for query to {path}"); + + let first_batch = results + .first() + .ok_or_else(|| exec_datafusion_err!("Results should be loaded"))?; + + let schema = first_batch.schema(); + let provider = MemTable::try_new(schema, vec![results.clone()])?; + + ctx.register_table("persist_data", Arc::new(provider))?; + + let df = ctx.table("persist_data").await?; + df.write_csv( + path, + DataFrameWriteOptions::new(), + Some( + CsvOptions::default() + .with_delimiter(b'|') + .with_has_header(true), + ), + ) + .await?; + + Ok(()) + } + + /// Verifies persisted results against expected values. + /// + /// Executes the `result_query` or uses the stored last run results, then + /// compares actual output rows to the expected values defined in the + /// benchmark file. 
+ /// + /// # Errors + /// Returns an error if no results are available or if the actual and + /// expected results differ in count or content. + pub async fn verify(&mut self, ctx: &SessionContext) -> Result<()> { + // Check if we have result queries to verify + if self.result_queries.is_empty() { + return Ok(()); + } + + // Get the stored results from the last run + let Some(actual_results) = self.last_results.as_ref() else { + return Err(exec_datafusion_err!( + "No results available for verification. Run the benchmark first." + )); + }; + + info!("Verifying results..."); + + // Get the first result query (assuming only one for now) + let query = &self.result_queries[0]; + let formatted_actual_results = if !query.query.trim().is_empty() { + let results = ctx.sql(&query.query).await?.collect().await?; + format_record_batches(&results) + } else { + format_record_batches(actual_results) + }?; + + Self::compare_results(query, &formatted_actual_results, &query.expected_result) + } + + /// Runs `cleanup` queries to reset state after the benchmark run. 
+ pub async fn cleanup(&mut self, ctx: &SessionContext) -> Result<()> { + info!("Running cleanup..."); + + let cleanup_queries = self.queries.get(&QueryDirective::Cleanup); + + if let Some(queries) = cleanup_queries { + for query in queries { + let _ = ctx.sql(query).await?.collect().await?; + } + } + + Ok(()) + } + + fn compare_results( + query: &BenchmarkQuery, + actual_results: &[Vec], + expected_results: &[Vec], + ) -> Result<()> { + if actual_results.is_empty() && expected_results.is_empty() { + return Ok(()); + } + + // Compare row count + if actual_results.len() != expected_results.len() { + return Err(exec_datafusion_err!( + "Error in result: expected {} rows but got {} for query {}", + expected_results.len(), + actual_results.len(), + query.query + )); + } + + // Compare values + let zipped = actual_results + .iter() + .enumerate() + .zip(expected_results.iter()); + + for ((row_idx, actual), expected) in zipped { + trace!( + "row {}\nactual: {actual:?}\nexpected: {expected:?}", + row_idx + 1 + ); + + // Compare column count + if actual.len() != expected.len() { + return Err(exec_datafusion_err!( + "Error in result: expected {} columns but got {} for query {}", + expected.len(), + actual.len(), + query.query + )); + } + + for (col_idx, expected_val) in + expected.iter().enumerate().take(query.column_count) + { + // The row-width check above guarantees this index exists. 
+ let actual_val = &actual[col_idx]; + + trace!("actual_val = {actual_val:?}\nexpected_val = {expected_val:?}"); + + if (expected_val == "NULL" && actual_val.is_empty()) + || (expected_val == actual_val) + || (expected_val == "(empty)" + && (actual_val.is_empty() || actual_val == "NULL")) + { + continue; + } + + return Err(exec_datafusion_err!( + "Error in result on row {}, column {} running query \"{}\": expected value \ + \"{expected_val}\" but got value \"{actual_val}\" in row: {actual:?}", + row_idx + 1, + col_idx + 1, + query.query + )); + } + } + + Ok(()) + } + + async fn process_file(&mut self, ctx: &SessionContext, path: &Path) -> Result<()> { + debug!("Processing file {}", path.display()); + + let mut replacement_mapping = self.replacement_mapping.clone(); + replacement_mapping + .insert("FILE_PATH".to_string(), path.to_string_lossy().into_owned()); + + let mut reader = BenchmarkFileReader::new(path, replacement_mapping)?; + let mut line = String::with_capacity(1024); + let mut reader_result = reader.read_line(&mut line); + + while let Some(result) = reader_result { + match result { + Ok(_) => { + if !is_blank_or_comment_line(&line) { + // boxing required because of recursion + Box::pin(self.process_line(ctx, &mut reader, &mut line)).await?; + } + } + Err(e) => return Err(e), + } + + // Clear the line buffer for the next iteration. + line.clear(); + reader_result = reader.read_line(&mut line); + } + + Ok(()) + } + + async fn process_line( + &mut self, + ctx: &SessionContext, + reader: &mut BenchmarkFileReader, + line: &mut String, + ) -> Result<()> { + // Split the line into directive and arguments. + let cloned_line = line.trim_start().to_string(); + let splits: Vec<&str> = cloned_line.split_whitespace().collect(); + + BenchmarkDirective::select(reader, splits[0])? 
+ .process(ctx, self, reader, line, &splits) + .await + } + + fn process_query(&mut self, splits: &[&str], mut query: String) -> Result<()> { + debug!("Processing query {query}"); + + // Trim and validate. + query = query.trim().to_string(); + if query.is_empty() { + return Ok(()); + } + + // remove comments + query = query + .lines() + .filter(|line| !is_comment_line(line)) + .collect::>() + .join("\n"); + + if query.trim().is_empty() { + return Ok(()); + } + + query = process_replacements(&query, self.replacement_mapping())?; + + let directive = QueryDirective::parse(splits[0]).ok_or_else(|| { + exec_datafusion_err!("Invalid query directive: {}", splits[0]) + })?; + + self.queries.entry(directive).or_default().push(query); + + Ok(()) + } + + fn validate_expected_plan(&self, physical_plan: &impl Debug) -> Result<()> { + if self.expect.is_empty() { + return Ok(()); + } + + let plan_string = format!("{physical_plan:#?}"); + + for exp_str in &self.expect { + if !plan_string.contains(exp_str) { + return Err(exec_datafusion_err!( + "The query physical plan does not contain the expected string '{exp_str}'. 
Physical plan: {plan_string}" + )); + } + } + + Ok(()) + } + + async fn execute_sql_without_result_buffering( + &self, + sql: &str, + ctx: &SessionContext, + ) -> Result { + let mut row_count = 0; + + let df = ctx.sql(sql).await?; + let physical_plan = df.create_physical_plan().await?; + + self.validate_expected_plan(&physical_plan)?; + let mut stream = execute_stream(physical_plan, ctx.task_ctx())?; + + while let Some(batch) = stream.next().await { + row_count += batch?.num_rows(); + + // Evaluate the result and do nothing, the result will be dropped + // to reduce memory pressure + } + + Ok(row_count) + } + + pub fn name(&self) -> &str { + &self.name + } + + pub fn group(&self) -> &str { + &self.group + } + + pub fn subgroup(&self) -> &str { + &self.subgroup + } + + pub fn benchmark_path(&self) -> &Path { + &self.benchmark_path + } + + pub fn replacement_mapping(&self) -> &HashMap { + &self.replacement_mapping + } + + pub fn queries(&self) -> &HashMap> { + &self.queries + } + + pub fn result_queries(&self) -> &[BenchmarkQuery] { + &self.result_queries + } + + pub fn assert_queries(&self) -> &[BenchmarkQuery] { + &self.assert_queries + } + + pub fn is_loaded(&self) -> bool { + self.is_loaded + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum QueryDirective { + Load, + Run, + Init, + Cleanup, +} + +impl QueryDirective { + fn parse(value: &str) -> Option { + if value.eq_ignore_ascii_case("load") { + Some(Self::Load) + } else if value.eq_ignore_ascii_case("init") { + Some(Self::Init) + } else if value.eq_ignore_ascii_case("run") { + Some(Self::Run) + } else if value.eq_ignore_ascii_case("cleanup") { + Some(Self::Cleanup) + } else { + None + } + } + + fn as_str(self) -> &'static str { + match self { + Self::Load => "load", + Self::Run => "run", + Self::Init => "init", + Self::Cleanup => "cleanup", + } + } +} + +enum BenchmarkDirective { + Load, + Run, + Init, + Cleanup, + Name, + Group, + Subgroup, + Expect, + Assert, + ResultQuery, + Results, + 
Template, + Include, + Echo, +} + +impl BenchmarkDirective { + fn select( + reader: &BenchmarkFileReader, + directive: &str, + ) -> Result { + if directive.eq_ignore_ascii_case("load") { + Ok(BenchmarkDirective::Load) + } else if directive.eq_ignore_ascii_case("run") { + Ok(BenchmarkDirective::Run) + } else if directive.eq_ignore_ascii_case("init") { + Ok(BenchmarkDirective::Init) + } else if directive.eq_ignore_ascii_case("cleanup") { + Ok(BenchmarkDirective::Cleanup) + } else if directive.eq_ignore_ascii_case("name") { + Ok(BenchmarkDirective::Name) + } else if directive.eq_ignore_ascii_case("group") { + Ok(BenchmarkDirective::Group) + } else if directive.eq_ignore_ascii_case("subgroup") { + Ok(BenchmarkDirective::Subgroup) + } else if directive.eq_ignore_ascii_case("expect_plan") { + Ok(BenchmarkDirective::Expect) + } else if directive.eq_ignore_ascii_case("assert") { + Ok(BenchmarkDirective::Assert) + } else if directive.eq_ignore_ascii_case("result_query") { + Ok(BenchmarkDirective::ResultQuery) + } else if directive.eq_ignore_ascii_case("result") { + Ok(BenchmarkDirective::Results) + } else if directive.eq_ignore_ascii_case("template") { + Ok(BenchmarkDirective::Template) + } else if directive.eq_ignore_ascii_case("include") { + Ok(BenchmarkDirective::Include) + } else if directive.eq_ignore_ascii_case("echo") { + Ok(BenchmarkDirective::Echo) + } else { + Err(exec_datafusion_err!( + "{}", + reader.format_exception(&format!("Unrecognized command: {directive}")) + )) + } + } + + async fn process( + &self, + ctx: &SessionContext, + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + trace!("-- handling {}", splits[0]); + + match self { + BenchmarkDirective::Load + | BenchmarkDirective::Run + | BenchmarkDirective::Init + | BenchmarkDirective::Cleanup => { + Self::process_query_directive(bench, reader, line, splits) + } + BenchmarkDirective::Name => Self::process_metadata_value( + bench, + 
reader, + line, + "name", + "BENCH_NAME", + "name must be followed by a value", + ), + BenchmarkDirective::Group => Self::process_metadata_value( + bench, + reader, + line, + "group", + "BENCH_GROUP", + "group must be followed by a value", + ), + BenchmarkDirective::Subgroup => Self::process_metadata_value( + bench, + reader, + line, + "subgroup", + "BENCH_SUBGROUP", + "subgroup must be followed by a value", + ), + BenchmarkDirective::Expect => Self::process_expect(bench, reader, splits), + BenchmarkDirective::Assert => { + Self::process_assert(bench, reader, line, splits) + } + BenchmarkDirective::ResultQuery => { + Self::process_result_query(bench, reader, line, splits) + } + BenchmarkDirective::Results => { + Self::process_results(ctx, bench, reader, splits).await + } + BenchmarkDirective::Template => { + Self::process_template(ctx, bench, reader, line, splits).await + } + BenchmarkDirective::Include => { + Self::process_include(ctx, bench, reader, splits).await + } + BenchmarkDirective::Echo => Self::process_echo(bench, reader, splits), + } + } + + fn process_query_directive( + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + let directive = QueryDirective::parse(splits[0]).ok_or_else(|| { + exec_datafusion_err!("Invalid query directive: {}", splits[0]) + })?; + + if directive == QueryDirective::Run && bench.queries.contains_key(&directive) { + return Err(exec_datafusion_err!( + "Multiple calls to run in the same benchmark file" + )); + } + + line.clear(); + + // Read the query body until a blank line or EOF. 
+ let mut query = String::new(); + let mut reader_result = reader.read_line(line); + + loop { + match reader_result { + Some(Ok(_)) => { + if is_comment_line(line) { + // comment, ignore + } else if is_blank_line(line) { + break; + } else { + query.push_str(line); + query.push(' '); + } + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. + line.clear(); + reader_result = reader.read_line(line); + } + + // Optional file parameter. + if splits.len() > 1 && !splits[1].is_empty() { + if !query.trim().is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception(&format!( + "{} directive must use either a query file or inline SQL, not both", + directive.as_str() + )) + )); + } + + debug!("Processing {} file: {}", splits[0], splits[1]); + + let query_file = fs::read_to_string(splits[1]).map_err(|e| { + exec_datafusion_err!("Failed to read query file {}: {e}", splits[1]) + })?; + let query_file = query_file.replace("\r\n", "\n"); + + // some files have multiple queries, split apart + for query in query_file + .split("\n\n") + .flat_map(|query| query.split(";\n")) + { + bench.process_query(splits, query.to_string())?; + } + } else { + bench.process_query(splits, query)?; + } + + Ok(()) + } + + fn process_metadata_value( + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &str, + directive: &str, + replacement_key: &str, + message: &str, + ) -> Result<()> { + let value = + directive_value(reader, line.trim_start(), directive, message)?.to_string(); + + match directive { + "name" => bench.name.clone_from(&value), + "group" => bench.group.clone_from(&value), + "subgroup" => bench.subgroup.clone_from(&value), + _ => unreachable!("unsupported metadata directive: {directive}"), + } + + bench + .replacement_mapping + .insert(replacement_key.to_string(), value.clone()); + reader + .replacements + .insert(replacement_key.to_string(), value); + + Ok(()) + } + + fn process_expect( + 
bench: &mut SqlBenchmark, + reader: &BenchmarkFileReader, + splits: &[&str], + ) -> Result<()> { + trace!("-- handling {}", splits[0]); + + if splits.len() <= 1 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "expect_plan must be followed by a string to search in the physical plan" + ) + )); + } + + bench.expect.push(splits[1..].join(" ").to_string()); + + Ok(()) + } + + fn process_assert( + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + // count the amount of columns based on character count. The actual + // character used is irrelevant. + if splits.len() <= 1 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "assert must be followed by a column count (e.g. assert III)" + ) + )); + } + + line.clear(); + + // read the actual query + let mut found_break = false; + let mut sql = String::new(); + let mut reader_result = reader.read_line(line); + + loop { + match reader_result { + Some(Ok(_)) => { + if line.trim() == "----" { + found_break = true; + break; + } + sql.push('\n'); + sql.push_str(line); + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. 
+ line.clear(); + reader_result = reader.read_line(line); + } + + if !found_break { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "assert must be followed by a query and a result (separated by ----)" + ) + )); + } + + bench + .assert_queries + .push(read_query_from_reader(reader, &sql, splits[1])?); + + Ok(()) + } + + async fn process_results( + ctx: &SessionContext, + bench: &mut SqlBenchmark, + reader: &BenchmarkFileReader, + splits: &[&str], + ) -> Result<()> { + if splits.len() <= 1 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "result must be followed by a path to a result file" + ) + )); + } + + let bq = read_query_from_file(ctx, splits[1], &bench.replacement_mapping).await?; + + if !bench.result_queries.is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception("multiple results found") + )); + } + + bench.result_queries.push(bq); + + Ok(()) + } + + fn process_result_query( + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + if splits.len() <= 1 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "result_query must be followed by a column count (e.g. result_query III)" + ) + )); + } + + line.clear(); + + let mut sql = String::new(); + let mut found_break = false; + let mut reader_result = reader.read_line(line); + + loop { + match reader_result { + Some(Ok(_)) => { + if line.trim() == "----" { + found_break = true; + break; + } + sql.push_str(line); + sql.push('\n'); + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. 
+ line.clear(); + reader_result = reader.read_line(line); + } + + if !found_break { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "result_query must be followed by a query and a result (separated by ----)" + ) + )); + } + + let result_check = read_query_from_reader(reader, &sql, splits[1])?; + + if !bench.result_queries.is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception("multiple results found") + )); + } + bench.result_queries.push(result_check); + + Ok(()) + } + + async fn process_template( + ctx: &SessionContext, + bench: &mut SqlBenchmark, + reader: &mut BenchmarkFileReader, + line: &mut String, + splits: &[&str], + ) -> Result<()> { + if splits.len() != 2 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception("template requires a single template path") + )); + } + + // template: update the path to read + bench.benchmark_path = PathBuf::from(splits[1]); + + line.clear(); + + // now read parameters + let mut reader_result = reader.read_line(line); + + loop { + match reader_result { + Some(Ok(_)) => { + if is_comment_line(line) { + // Clear the line buffer for the next iteration. + line.clear(); + reader_result = reader.read_line(line); + continue; + } + if is_blank_line(line) { + break; + } + + let Some((key, value)) = line.trim_start().split_once('=') else { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception( + "Expected a template parameter in the form of X=Y" + ) + )); + }; + bench + .replacement_mapping + .insert(key.trim().to_string(), value.trim().to_string()); + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. 
+ line.clear(); + reader_result = reader.read_line(line); + } + + // restart the load from the template file + Box::pin(bench.process_file(ctx, Path::new(splits[1]))).await + } + + async fn process_include( + ctx: &SessionContext, + bench: &mut SqlBenchmark, + reader: &BenchmarkFileReader, + splits: &[&str], + ) -> Result<()> { + if splits.len() != 2 || splits[1].is_empty() { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception("include requires a single argument") + )); + } + + Box::pin(bench.process_file(ctx, Path::new(splits[1]))).await + } + + fn process_echo( + bench: &mut SqlBenchmark, + reader: &BenchmarkFileReader, + splits: &[&str], + ) -> Result<()> { + if splits.len() < 2 { + return Err(exec_datafusion_err!( + "{}", + reader.format_exception("Echo requires an argument") + )); + } + + bench.echo.push(splits[1..].join(" ")); + + Ok(()) + } +} + +struct BenchmarkFileReader { + path: PathBuf, + reader: BufReader, + line_nr: usize, + replacements: HashMap, +} + +impl BenchmarkFileReader { + fn new>( + path: P, + replacements: HashMap, + ) -> Result { + let path = path.into(); + let file = OpenOptions::new().read(true).open(&path)?; + + Ok(Self { + path, + reader: BufReader::new(file), + line_nr: 0, + replacements, + }) + } + + /// Read the next line, applying replacements and removing line terminators. + fn read_line(&mut self, line: &mut String) -> Option> { + match self.reader.read_line(line) { + Ok(0) => None, + Ok(_) => { + self.line_nr += 1; + + // Trim newline and carriage return without changing other content. 
+ let trimmed_len = line.trim_end_matches(['\n', '\r']).len(); + line.truncate(trimmed_len); + + match process_replacements(line, &self.replacements) { + Ok(l) => { + *line = l; + Some(Ok(())) + } + Err(error) => Some(Err(error)), + } + } + Err(e) => Some(Err(e.into())), + } + } + + fn format_exception(&self, msg: &str) -> String { + format!("{}:{} - {}", self.path.display(), self.line_nr, msg) + } +} + +#[derive(Debug, Clone)] +pub struct BenchmarkQuery { + path: Option, + query: String, + column_count: usize, + expected_result: Vec>, +} + +impl BenchmarkQuery {} + +// ---- utility function below + +fn directive_value<'a>( + reader: &BenchmarkFileReader, + line: &'a str, + directive: &str, + message: &str, +) -> Result<&'a str> { + let value = line + .get(..directive.len()) + .filter(|prefix| prefix.eq_ignore_ascii_case(directive)) + .and_then(|_| line.get(directive.len()..)) + .map(str::trim) + .filter(|s| !s.is_empty()) + .ok_or_else(|| exec_datafusion_err!("{}", reader.format_exception(message)))?; + + Ok(value) +} + +fn parse_group_from_path(path: &Path, benchmark_directory: &Path) -> String { + let mut group_name = String::new(); + let mut parent = path.parent(); + + while let Some(p) = parent { + if path_ends_with_ignore_ascii_case(p, benchmark_directory) { + break; + } + + if let Some(dir_name) = p.file_name() { + group_name = dir_name.to_string_lossy().into_owned(); + } + + parent = p.parent(); + } + + group_name +} + +fn path_ends_with_ignore_ascii_case(path: &Path, suffix: &Path) -> bool { + let mut path_components = path.components().rev(); + + for suffix_component in suffix.components().rev() { + let Some(path_component) = path_components.next() else { + return false; + }; + + if !path_component + .as_os_str() + .to_string_lossy() + .eq_ignore_ascii_case(&suffix_component.as_os_str().to_string_lossy()) + { + return false; + } + } + + true +} + +fn starts_with_ignore_ascii_case(input: &str, prefix: &str) -> bool { + input + .get(..prefix.len()) + 
.is_some_and(|value| value.eq_ignore_ascii_case(prefix)) +} + +fn is_blank_line(line: &str) -> bool { + line.trim().is_empty() +} + +fn is_comment_line(line: &str) -> bool { + let line = line.trim_start(); + line.starts_with('#') || line.starts_with("--") +} + +fn is_blank_or_comment_line(line: &str) -> bool { + is_blank_line(line) || is_comment_line(line) +} + +fn replace_all( + re: &Regex, + haystack: &str, + replacement: impl Fn(®ex::Captures) -> Result, +) -> Result { + let mut new = String::with_capacity(haystack.len()); + let mut last_match = 0; + + for caps in re.captures_iter(haystack) { + let m = caps.get(0).unwrap(); + + new.push_str(&haystack[last_match..m.start()]); + new.push_str(&replacement(&caps)?); + + last_match = m.end(); + } + + new.push_str(&haystack[last_match..]); + + Ok(new) +} + +static TRUE_FALSE_REPLACEMENT_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"\$\{(\w+)\|([^|]+)\|([^}]+)}").expect("Regex failed to compile") +}); + +static VARIABLE_REPLACEMENT_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"\$\{(\w+)(?::-([^}]+))?}").expect("Regex failed to compile") +}); + +/// Replace all `${KEY}` or `${KEY:-default}` placeholders in a string according to the mapping. +/// Also handles `${KEY|True value|false value}` syntax. 
+fn process_replacements( + input: &str, + replacement_map: &HashMap, +) -> Result { + process_replacements_with_env(input, replacement_map, |key| std::env::var(key).ok()) +} + +fn process_replacements_with_env( + input: &str, + replacement_map: &HashMap, + get_env: impl Fn(&str) -> Option, +) -> Result { + debug!("processing replacements for line '{input}'"); + + // handle ${VAR|true value|false value} syntax + let replacement = |caps: ®ex::Captures| -> Result { + let key = &caps[1]; + let true_val = &caps[2]; + let false_val = &caps[3]; + + let value = lookup_replacement_value(key, replacement_map, &get_env); + + match value { + Some(v) if v.eq_ignore_ascii_case("true") => Ok(true_val.to_string()), + Some(_) => Ok(false_val.to_string()), + None => Err(exec_datafusion_err!("Missing value for key '{key}'")), + } + }; + let input = replace_all(&TRUE_FALSE_REPLACEMENT_RE, input, replacement)?; + + // handle ${KEY} and ${KEY:-default}` + let replacement = |caps: ®ex::Captures| -> Result { + let key = &caps[1]; + let default = caps.get(2); + + if let Some(v) = lookup_replacement_value(key, replacement_map, &get_env) { + return Ok(v.to_string()); + } + + // use default if it was set + if let Some(def) = default { + Ok(def.as_str().to_string()) + } else { + Err(exec_datafusion_err!("Missing value for key '{key}'")) + } + }; + + replace_all(&VARIABLE_REPLACEMENT_RE, &input, replacement) +} + +fn lookup_replacement_value( + key: &str, + replacement_map: &HashMap, + get_env: &impl Fn(&str) -> Option, +) -> Option { + // search replacement map for key + for (k, v) in replacement_map { + if key.eq_ignore_ascii_case(k) { + return Some(v.to_string()); + } + } + + // look in env variables + get_env(&key.to_uppercase()) +} + +fn read_query_from_reader( + reader: &mut BenchmarkFileReader, + sql: &str, + header: &str, +) -> Result { + let column_count = header.len(); + let mut expected_result = vec![]; + let mut line = String::new(); + let mut reader_result = reader.read_line(&mut 
line); + + loop { + match reader_result { + Some(Ok(_)) => { + if is_comment_line(&line) { + // comment, ignore + } else if is_blank_line(&line) { + break; + } else { + let result_splits: Vec<&str> = line.split(['\t', '|']).collect(); + + if result_splits.len() != column_count { + return Err(exec_datafusion_err!( + "{} {line}", + reader.format_exception(&format!( + "expected {} values but got {}", + column_count, + result_splits.len(), + )) + )); + } + + expected_result + .push(result_splits.into_iter().map(|s| s.to_string()).collect()); + } + } + Some(Err(e)) => return Err(e), + None => break, + } + + // Clear the line buffer for the next iteration. + line.clear(); + reader_result = reader.read_line(&mut line); + } + + Ok(BenchmarkQuery { + path: None, + query: sql.to_string(), + column_count, + expected_result, + }) +} + +async fn read_query_from_file( + ctx: &SessionContext, + path: impl AsRef, + replacement_mapping: &HashMap, +) -> Result { + // Process replacements in file path + let path = path.as_ref().to_string_lossy(); + let path = process_replacements(&path, replacement_mapping)?; + let df: DataFrame = ctx + .read_csv( + path.clone(), + CsvReadOptions::new() + .has_header(true) + .delimiter(b'|') + .null_regex(Some("NULL".to_string())) + // we only want string values, we do not want to infer the schema + .schema_infer_max_records(0), + ) + .await?; + + // Get schema to determine column count + let schema = df.schema(); + let column_count = schema.fields().len(); + + if column_count == 0 { + return Err(exec_datafusion_err!( + "Result file {path} did not contain any columns" + )); + } + + // Execute and collect results + let batches = df.collect().await?; + // Convert record batches to string vectors + let expected_result = format_record_batches(&batches)?; + + Ok(BenchmarkQuery { + path: Some(path), + query: String::new(), + column_count, + expected_result, + }) +} + +fn format_record_batches( + batches: &[RecordBatch], +) -> Result>, DataFusionError> { + 
let mut expected_result = vec![]; + let arrow_format_options = FormatOptions::default() + .with_null("NULL") + .with_display_error(true); + + for batch in batches { + let schema = batch.schema_ref(); + + let formatters = batch + .columns() + .iter() + .zip(schema.fields().iter()) + .map(|(c, field)| make_array_formatter(c, &arrow_format_options, Some(field))) + .collect::, ArrowError>>()?; + + for row in 0..batch.num_rows() { + let mut cells = vec![]; + for formatter in &formatters { + cells.push(formatter.value(row).to_string()); + } + expected_result.push(cells); + } + } + + Ok(expected_result) +} + +fn make_array_formatter<'a>( + array: &'a dyn Array, + options: &FormatOptions<'a>, + field: Option<&'a Field>, +) -> Result, ArrowError> { + match options.formatter_factory() { + None => ArrayFormatter::try_new(array, options), + Some(formatters) => formatters + .create_array_formatter(array, options, field) + .transpose() + .unwrap_or_else(|| ArrayFormatter::try_new(array, options)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::prelude::SessionContext; + use std::fs; + use std::path::{Path, PathBuf}; + use tempfile::{TempDir, tempdir}; + + fn write_test_file(temp_dir: &TempDir, name: &str, contents: &str) -> PathBuf { + let path = temp_dir.path().join(name); + fs::write(&path, contents).expect("failed to write benchmark test file"); + path + } + + async fn parse_benchmark_file(path: &Path) -> Result { + let ctx = SessionContext::new(); + let path_string = path.to_string_lossy().into_owned(); + SqlBenchmark::new(&ctx, &path_string, "/tmp").await + } + + async fn parse_benchmark(contents: &str) -> Result { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let path = write_test_file(&temp_dir, "parser.benchmark", contents); + + parse_benchmark_file(&path).await + } + + async fn assert_parse_error(contents: &str, expected_message: &str) { + let error = parse_benchmark(contents) + .await + .expect_err("benchmark 
parsing should fail"); + + let message = error.to_string(); + assert!( + message.contains(expected_message), + "expected error containing {expected_message:?}, got {message:?}" + ); + } + + fn assert_result_error_contains(result: Result, expected_message: &str) { + let error = result.expect_err("operation should fail"); + let message = error.to_string(); + assert!( + message.contains(expected_message), + "expected error containing {expected_message:?}, got {message:?}" + ); + } + + fn formatted_last_results(benchmark: &SqlBenchmark) -> Vec> { + format_record_batches( + benchmark + .last_results + .as_ref() + .expect("last results should be set"), + ) + .expect("results should format") + } + + fn read_all_files_in_dir(path: &Path) -> String { + let mut entries = fs::read_dir(path) + .expect("directory should be readable") + .filter_map(Result::ok) + .map(|entry| entry.path()) + .filter(|path| path.is_file()) + .collect::>(); + entries.sort(); + + let mut contents = String::new(); + for path in entries { + contents + .push_str(&fs::read_to_string(path).expect("file should be readable")); + } + contents + } + + fn replacement_map(entries: &[(&str, &str)]) -> HashMap { + entries + .iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect() + } + + // Replacement tests cover benchmark variable expansion syntax. 
+ + #[test] + fn process_replacements_replaces_map_values_case_insensitively() { + let replacements = replacement_map(&[ + ("BENCH_NAME", "tpch"), + ("QUERY_NUMBER_PADDED", "01"), + ("format_1", "parquet"), + ]); + + let actual = process_replacements_with_env( + "${bench_name}/q${query_number_padded}.${FORMAT_1}", + &replacements, + |_| None, + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "tpch/q01.parquet"); + } + + #[test] + fn process_replacements_uses_env_when_map_value_is_missing() { + let replacements = HashMap::new(); + let env = replacement_map(&[("DATA_DIR", "/tmp/data")]); + + let actual = process_replacements_with_env( + "${data_dir}/lineitem.parquet", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "/tmp/data/lineitem.parquet"); + } + + #[test] + fn process_replacements_prefers_map_over_env() { + let replacements = replacement_map(&[("BENCH_SIZE", "10")]); + let env = replacement_map(&[("BENCH_SIZE", "100")]); + + let actual = + process_replacements_with_env("sf${BENCH_SIZE}", &replacements, |key| { + env.get(key).cloned() + }) + .expect("replacement should succeed"); + + assert_eq!(actual, "sf10"); + } + + #[test] + fn process_replacements_uses_default_for_missing_variable() { + let replacements = HashMap::new(); + + let actual = process_replacements_with_env( + "load_${BENCH_SUBGROUP:-groupby}_${FILE_TYPE:-csv}.sql", + &replacements, + |_| None, + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_groupby_csv.sql"); + } + + #[test] + fn process_replacements_reports_missing_variable_without_default() { + let replacements = HashMap::new(); + + let error = process_replacements_with_env("${MISSING}", &replacements, |_| None) + .expect_err("replacement should fail"); + + assert!( + error + .to_string() + .contains("Missing value for key 'MISSING'"), + "unexpected error: {error}" + ); + } + + #[test] + fn 
process_replacements_applies_true_false_true_branch() { + let replacements = HashMap::new(); + let env = replacement_map(&[("USE_PARQUET", "TrUe")]); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET|parquet|csv}.sql", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_parquet.sql"); + } + + #[test] + fn process_replacements_applies_true_false_false_branch() { + let replacements = HashMap::new(); + let env = replacement_map(&[("USE_PARQUET", "false")]); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET|parquet|csv}.sql", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_csv.sql"); + } + + #[test] + fn process_replacements_uses_map_for_true_false_branch() { + let replacements = replacement_map(&[("USE_PARQUET", "true")]); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET|parquet|csv}.sql", + &replacements, + |_| None, + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_parquet.sql"); + } + + #[test] + fn process_replacements_prefers_map_over_env_for_true_false_branch() { + let replacements = replacement_map(&[("USE_PARQUET", "false")]); + let env = replacement_map(&[("USE_PARQUET", "true")]); + + let actual = process_replacements_with_env( + "load_${USE_PARQUET|parquet|csv}.sql", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "load_csv.sql"); + } + + #[test] + fn process_replacements_reports_missing_true_false_env_variable() { + let replacements = HashMap::new(); + + let error = process_replacements_with_env( + "load_${USE_PARQUET|parquet|csv}.sql", + &replacements, + |_| None, + ) + .expect_err("replacement should fail"); + + assert!( + error + .to_string() + .contains("Missing value for key 'USE_PARQUET'"), + "unexpected error: {error}" + ); + } + + #[test] + fn 
process_replacements_resolves_variables_after_true_false_replacement() { + let replacements = replacement_map(&[("FILE_TYPE", "parquet")]); + let env = replacement_map(&[("USE_TYPED_PATH", "true")]); + + let actual = process_replacements_with_env( + "${USE_TYPED_PATH|data.${FILE_TYPE}|data.csv}", + &replacements, + |key| env.get(key).cloned(), + ) + .expect("replacement should succeed"); + + assert_eq!(actual, "data.parquet"); + } + + #[test] + fn process_replacements_leaves_unsupported_placeholder_syntax_unchanged() { + let replacements = HashMap::new(); + + let actual = + process_replacements_with_env("${BAD-KEY:-fallback}", &replacements, |_| { + None + }) + .expect("unsupported placeholder should not match replacement regex"); + + assert_eq!(actual, "${BAD-KEY:-fallback}"); + } + + // Parser tests cover benchmark directives and parse-time validation. + + #[tokio::test] + async fn parser_accepts_metadata_expect_echo_and_sql_sections() { + let benchmark = parse_benchmark( + r#" +# top-level comments are ignored +name Parser Success +group Parser Group +subgroup Parser Subgroup +expect_plan ProjectionExec with details +echo hello from parser + +load +-- query comments are ignored +CREATE TABLE t AS VALUES (1); + +init +CREATE VIEW v AS SELECT * FROM t; + +run +SELECT * FROM v; + +cleanup +DROP VIEW v; +"#, + ) + .await + .expect("benchmark should parse"); + + assert_eq!(benchmark.name(), "Parser Success"); + assert_eq!(benchmark.group(), "Parser Group"); + assert_eq!(benchmark.subgroup(), "Parser Subgroup"); + assert_eq!(benchmark.expect, vec!["ProjectionExec with details"]); + assert_eq!(benchmark.echo, vec!["hello from parser"]); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Load) + .expect("load query"), + &vec!["CREATE TABLE t AS VALUES (1);".to_string()] + ); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Init) + .expect("init query"), + &vec!["CREATE VIEW v AS SELECT * FROM t;".to_string()] + ); + assert_eq!( + benchmark + 
.queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["SELECT * FROM v;".to_string()] + ); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Cleanup) + .expect("cleanup query"), + &vec!["DROP VIEW v;".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_assert_with_expected_rows() { + let benchmark = parse_benchmark( + r#" +assert II +select 1, 'one' +---- +1|one +2 two +"#, + ) + .await + .expect("benchmark should parse"); + + let query = benchmark + .assert_queries() + .first() + .expect("assert query should be parsed"); + + assert_eq!(query.column_count, 2); + assert!(query.query.contains("select 1, 'one'")); + assert_eq!( + query.expected_result, + vec![ + vec!["1".to_string(), "one".to_string()], + vec!["2".to_string(), "two".to_string()] + ] + ); + } + + #[tokio::test] + async fn parser_accepts_result_query_with_expected_rows() { + let benchmark = parse_benchmark( + r#" +result_query II +select 1, 'one' +---- +1|one +NULL|(empty) +"#, + ) + .await + .expect("benchmark should parse"); + + let query = benchmark + .result_queries() + .first() + .expect("result query should be parsed"); + + assert_eq!(query.path, None); + assert_eq!(query.column_count, 2); + assert!(query.query.contains("select 1, 'one'")); + assert_eq!( + query.expected_result, + vec![ + vec!["1".to_string(), "one".to_string()], + vec!["NULL".to_string(), "(empty)".to_string()] + ] + ); + } + + #[tokio::test] + async fn parser_accepts_result_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = + write_test_file(&temp_dir, "result.csv", "col_a|col_b\n1|one\nNULL|two\n"); + let benchmark_path = write_test_file( + &temp_dir, + "result.benchmark", + &format!("result {}\n", result_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + let query = benchmark + .result_queries() + .first() + .expect("result file should be 
parsed"); + + assert_eq!(query.path, Some(result_path.to_string_lossy().into_owned())); + assert_eq!(query.column_count, 2); + assert_eq!( + query.expected_result, + vec![ + vec!["1".to_string(), "one".to_string()], + vec!["NULL".to_string(), "two".to_string()] + ] + ); + } + + #[tokio::test] + async fn parser_accepts_include_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let include_path = + write_test_file(&temp_dir, "include.benchmark", "run\nselect 1\n"); + + let benchmark_path = write_test_file( + &temp_dir, + "include_driver.benchmark", + &format!("include {}\n", include_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + let benchmark = result.expect("benchmark should parse"); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 1".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_template_file_with_parameters() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let template_path = write_test_file( + &temp_dir, + "template_success.benchmark", + "# template comments are ignored\nrun\n-- query comments are ignored\nselect '${TABLE_NAME}', '${BENCHMARK_DIR}'\n", + ); + + let benchmark_path = write_test_file( + &temp_dir, + "template_success_driver.benchmark", + &format!( + "template {}\n# parameter comments are ignored\nTABLE_NAME=orders\n", + template_path.display() + ), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + let benchmark = result.expect("benchmark should parse"); + assert_eq!(benchmark.benchmark_path(), template_path.as_path()); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 'orders', '/tmp'".to_string()] + ); + } + + #[tokio::test] + async fn parser_trims_template_parameter_keys_and_values() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let 
template_path = write_test_file( + &temp_dir, + "template_trim.benchmark", + "run\nselect '${TABLE_NAME}'\n", + ); + + let benchmark_path = write_test_file( + &temp_dir, + "template_trim_driver.benchmark", + &format!( + "template {}\n TABLE_NAME = orders \n", + template_path.display() + ), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 'orders'".to_string()] + ); + assert_eq!( + benchmark.replacement_mapping().get("TABLE_NAME"), + Some(&"orders".to_string()) + ); + } + + #[tokio::test] + async fn parser_preserves_expected_result_cell_whitespace() { + let benchmark = parse_benchmark("assert I\nselect ' x '\n----\n x \n") + .await + .expect("benchmark should parse"); + + let query = benchmark + .assert_queries() + .first() + .expect("assert query should be parsed"); + + assert_eq!(query.expected_result, vec![vec![" x ".to_string()]]); + } + + #[tokio::test] + async fn parser_accepts_indented_comments_and_blank_lines() { + let benchmark = + parse_benchmark(" # comment\n -- comment\n run\n select 1\n \n") + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 1".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_case_insensitive_query_directives() { + let benchmark = parse_benchmark("RUN\nselect 1\n") + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["select 1".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_query_file_and_splits_statements() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "queries.sql", + "-- leading comment\nSELECT 1 AS value;\nSELECT 2 AS value;\n\n# another 
comment\nWITH t AS (SELECT 3 AS value) SELECT * FROM t;\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "query_file.benchmark", + &format!("run {}\n", query_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run queries"), + &vec![ + "SELECT 1 AS value".to_string(), + "SELECT 2 AS value;".to_string(), + "WITH t AS (SELECT 3 AS value) SELECT * FROM t".to_string(), + ] + ); + } + + #[tokio::test] + async fn parser_accepts_replacements_in_query_file_path() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = + write_test_file(&temp_dir, "queries.sql", "SELECT 5 AS value;\n"); + let template_path = write_test_file( + &temp_dir, + "query_file_path_template.benchmark", + "run ${QUERY_PATH}\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "query_file_path_driver.benchmark", + &format!( + "template {}\nQUERY_PATH={}\n", + template_path.display(), + query_path.display() + ), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["SELECT 5 AS value".to_string()] + ); + } + + #[tokio::test] + async fn parser_rejects_inline_sql_when_query_file_is_provided() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = + write_test_file(&temp_dir, "queries.sql", "SELECT 1 AS value;\n"); + let benchmark_path = write_test_file( + &temp_dir, + "query_file_with_inline_body.benchmark", + &format!("run {}\nSELECT 999 AS value;\n", query_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains( + result, + "run directive must use either a query file or inline SQL, not both", + ); + } + + #[tokio::test] + 
async fn parser_rejects_inline_sql_when_load_file_is_provided() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "load.sql", + "CREATE TABLE t AS SELECT 1 AS value;\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "load_file_with_inline_body.benchmark", + &format!( + "load {}\nCREATE TABLE u AS SELECT 2 AS value;\n", + query_path.display() + ), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains( + result, + "load directive must use either a query file or inline SQL, not both", + ); + } + + #[tokio::test] + async fn parser_rejects_inline_sql_when_init_file_is_provided() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "init.sql", + "CREATE VIEW v AS SELECT 1 AS value;\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "init_file_with_inline_body.benchmark", + &format!( + "init {}\nCREATE VIEW w AS SELECT 2 AS value;\n", + query_path.display() + ), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains( + result, + "init directive must use either a query file or inline SQL, not both", + ); + } + + #[tokio::test] + async fn parser_rejects_inline_sql_when_cleanup_file_is_provided() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file(&temp_dir, "cleanup.sql", "DROP TABLE t;\n"); + let benchmark_path = write_test_file( + &temp_dir, + "cleanup_file_with_inline_body.benchmark", + &format!("cleanup {}\nDROP TABLE u;\n", query_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains( + result, + "cleanup directive must use either a query file or inline SQL, not both", + ); + } + + #[tokio::test] + async fn parser_ignores_query_file_with_only_comments_and_blank_lines() 
{ + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "queries.sql", + "# comment\n\n-- another comment\n\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "empty_query_file.benchmark", + &format!("run {}\n", query_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert!(!benchmark.queries().contains_key(&QueryDirective::Run)); + } + + #[tokio::test] + async fn parser_splits_query_file_with_windows_line_endings() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "queries.sql", + "SELECT 1 AS value;\r\nSELECT 2 AS value;\r\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "windows_query_file.benchmark", + &format!("run {}\n", query_path.display()), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run queries"), + &vec![ + "SELECT 1 AS value".to_string(), + "SELECT 2 AS value".to_string() + ] + ); + } + + #[tokio::test] + async fn parser_rejects_unknown_command() { + assert_parse_error("wat\n", "Unrecognized command: wat").await; + } + + #[tokio::test] + async fn parser_rejects_assert_without_column_count() { + assert_parse_error( + "assert\nselect 1\n----\n1\n", + "assert must be followed by a column count", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_assert_without_result_separator() { + assert_parse_error( + "assert I\nselect 1\n1\n", + "assert must be followed by a query and a result (separated by ----)", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_result_query_without_separator() { + assert_parse_error( + "result_query I\nselect 1\n1\n", + "result_query must be followed by a query and a result (separated by ----)", + ) + 
.await; + } + + #[tokio::test] + async fn parser_rejects_result_query_with_wrong_column_count() { + assert_parse_error( + "result_query II\nselect 1\n----\n1\n", + "expected 2 values but got 1", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_multiple_result_queries() { + assert_parse_error( + "result_query I\nselect 1\n----\n1\n\nresult_query I\nselect 2\n----\n2\n", + "multiple results found", + ) + .await; + } + + #[tokio::test] + async fn parser_rejects_duplicate_run_directives() { + assert_parse_error("run\nselect 1\n\nrun\nselect 2\n", "Multiple calls to run") + .await; + } + + #[tokio::test] + async fn parser_accepts_multiple_load_directives() { + let benchmark = parse_benchmark( + "load\nCREATE TABLE t AS SELECT 1;\n\nload\nCREATE TABLE u AS SELECT 2;\n", + ) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Load) + .expect("load queries"), + &vec![ + "CREATE TABLE t AS SELECT 1;".to_string(), + "CREATE TABLE u AS SELECT 2;".to_string(), + ] + ); + } + + #[tokio::test] + async fn parser_accepts_multiple_init_directives() { + let benchmark = parse_benchmark( + "init\nCREATE VIEW v AS SELECT 1;\n\ninit\nCREATE VIEW w AS SELECT 2;\n", + ) + .await; + + let benchmark = benchmark.expect("benchmark should parse"); + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Init) + .expect("init queries"), + &vec![ + "CREATE VIEW v AS SELECT 1;".to_string(), + "CREATE VIEW w AS SELECT 2;".to_string(), + ] + ); + } + + #[tokio::test] + async fn parser_accepts_multiple_cleanup_directives() { + let benchmark = + parse_benchmark("cleanup\nDROP TABLE t;\n\ncleanup\nDROP TABLE u;\n") + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Cleanup) + .expect("cleanup queries"), + &vec!["DROP TABLE t;".to_string(), "DROP TABLE u;".to_string(),] + ); + } + + #[tokio::test] + async fn parser_rejects_missing_query_file() { + let temp_dir = 
tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing.sql"); + let benchmark_path = write_test_file( + &temp_dir, + "missing_query_file.benchmark", + &format!("run {}\n", missing_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains(result, "Failed to read query file"); + } + + #[tokio::test] + async fn parser_rejects_template_with_invalid_parameter_assignment() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let template_path = + write_test_file(&temp_dir, "template.benchmark", "run\nselect 1\n"); + + let benchmark_path = write_test_file( + &temp_dir, + "template_driver.benchmark", + &format!("template {}\nINVALID\n", template_path.display()), + ); + + let ctx = SessionContext::new(); + let benchmark_path_string = benchmark_path.to_string_lossy().into_owned(); + let result = SqlBenchmark::new(&ctx, &benchmark_path_string, "/tmp").await; + + let error = result.expect_err("benchmark parsing should fail"); + let message = error.to_string(); + assert!( + message.contains("Expected a template parameter in the form of X=Y"), + "expected template parameter error, got {message:?}" + ); + } + + #[tokio::test] + async fn parser_rejects_metadata_and_result_directives_without_values() { + assert_parse_error("name\n", "name must be followed by a value").await; + assert_parse_error("group\n", "group must be followed by a value").await; + assert_parse_error("subgroup\n", "subgroup must be followed by a value").await; + assert_parse_error( + "expect_plan\n", + "expect_plan must be followed by a string to search in the physical plan", + ) + .await; + assert_parse_error("echo\n", "Echo requires an argument").await; + assert_parse_error( + "result\n", + "result must be followed by a path to a result file", + ) + .await; + assert_parse_error("include\n", "include requires a single argument").await; + 
assert_parse_error("template\n", "template requires a single template path") + .await; + } + + #[tokio::test] + async fn parser_rejects_include_and_template_with_too_many_arguments() { + assert_parse_error("include a b\n", "include requires a single argument").await; + assert_parse_error("template a b\n", "template requires a single template path") + .await; + } + + #[tokio::test] + async fn parser_rejects_missing_include_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing_include.benchmark"); + let benchmark_path = write_test_file( + &temp_dir, + "missing_include_driver.benchmark", + &format!("include {}\n", missing_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains(result, "No such file"); + } + + #[tokio::test] + async fn parser_rejects_missing_template_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing_template.benchmark"); + let benchmark_path = write_test_file( + &temp_dir, + "missing_template_driver.benchmark", + &format!("template {}\n", missing_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains(result, "No such file"); + } + + #[tokio::test] + async fn parser_uses_metadata_values_as_replacements() { + let benchmark = parse_benchmark( + r#" +name Q01 +group tpch +subgroup sf1 + +run +SELECT '${BENCH_NAME}', '${BENCH_GROUP}', '${BENCH_SUBGROUP}' +"#, + ) + .await + .expect("benchmark should parse"); + + assert_eq!( + benchmark + .queries() + .get(&QueryDirective::Run) + .expect("run query"), + &vec!["SELECT 'Q01', 'tpch', 'sf1'".to_string()] + ); + } + + #[tokio::test] + async fn parser_accepts_replacement_in_result_file_path() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = write_test_file(&temp_dir, 
"result.csv", "value\n1\n"); + let template_path = write_test_file( + &temp_dir, + "result_path_template.benchmark", + "result ${RESULT_PATH}\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "result_path_driver.benchmark", + &format!( + "template {}\nRESULT_PATH={}\n", + template_path.display(), + result_path.display() + ), + ); + + let benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + + let query = benchmark + .result_queries() + .first() + .expect("result query should be parsed"); + assert_eq!(query.path, Some(result_path.to_string_lossy().into_owned())); + assert_eq!(query.expected_result, vec![vec!["1".to_string()]]); + } + + #[tokio::test] + async fn parser_rejects_missing_replacement_in_result_file_path() { + assert_parse_error("result ${MISSING_RESULT_PATH}\n", "Missing value for key") + .await; + } + + #[tokio::test] + async fn parser_rejects_missing_result_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let missing_path = temp_dir.path().join("missing_result.csv"); + let benchmark_path = write_test_file( + &temp_dir, + "missing_result_file.benchmark", + &format!("result {}\n", missing_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains(result, "missing_result.csv"); + } + + #[tokio::test] + async fn parser_rejects_malformed_result_file() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let result_path = temp_dir.path().join("malformed_result.csv"); + fs::write(&result_path, [0xff]).expect("failed to write malformed result file"); + let benchmark_path = write_test_file( + &temp_dir, + "malformed_result_file.benchmark", + &format!("result {}\n", result_path.display()), + ); + + let result = parse_benchmark_file(&benchmark_path).await; + + assert_result_error_contains(result, "CSV"); + } + + // Lifecycle tests cover initialization, assertions, and cleanup 
execution. + + #[tokio::test] + async fn initialize_executes_load_before_init_and_is_idempotent() { + let mut benchmark = parse_benchmark( + r#" +load +CREATE TABLE t AS SELECT 1 AS value; + +load +CREATE TABLE u AS SELECT value + 1 AS value FROM t; + +init +CREATE TABLE v AS SELECT value + 1 AS value FROM u; + +init +CREATE TABLE initialized AS SELECT value + 1 AS value FROM v; + +run +SELECT value FROM initialized; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark + .initialize(&ctx) + .await + .expect("initialize should succeed"); + benchmark + .initialize(&ctx) + .await + .expect("second initialize should be a no-op"); + + assert!(benchmark.is_loaded()); + + let rows = ctx + .sql("SELECT value FROM initialized") + .await + .expect("query should plan") + .collect() + .await + .expect("query should run"); + + assert_eq!(format_record_batches(&rows).unwrap(), vec![vec!["4"]]); + } + + #[tokio::test] + async fn initialize_rejects_benchmark_without_run_query() { + let mut benchmark = parse_benchmark( + r#" +load +CREATE TABLE t AS SELECT 1 AS value; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.initialize(&ctx).await, + "Invalid benchmark file: no \"run\" query specified", + ); + } + + #[tokio::test] + async fn initialize_propagates_load_query_failures() { + let mut benchmark = parse_benchmark( + r#" +load +CREATE TABLE t AS SELECT * FROM missing_load_table; + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.initialize(&ctx).await, + "missing_load_table", + ); + } + + #[tokio::test] + async fn initialize_propagates_init_query_failures() { + let mut benchmark = parse_benchmark( + r#" +init +CREATE TABLE t AS SELECT * FROM missing_init_table; + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = 
SessionContext::new(); + + assert_result_error_contains( + benchmark.initialize(&ctx).await, + "missing_init_table", + ); + } + + #[tokio::test] + async fn cleanup_executes_cleanup_queries() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1; + +cleanup +CREATE TABLE cleanup_marker_a AS SELECT 7 AS value; + +cleanup +CREATE TABLE cleanup_marker_b AS SELECT value + 1 AS value FROM cleanup_marker_a; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.cleanup(&ctx).await.expect("cleanup should run"); + + let rows = ctx + .sql("SELECT value FROM cleanup_marker_b") + .await + .expect("query should plan") + .collect() + .await + .expect("query should run"); + assert_eq!(format_record_batches(&rows).unwrap(), vec![vec!["8"]]); + } + + #[tokio::test] + async fn cleanup_propagates_query_failures() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1; + +cleanup +SELECT * FROM missing_cleanup_table; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.cleanup(&ctx).await, + "missing_cleanup_table", + ); + } + + #[tokio::test] + async fn assert_executes_assert_queries_successfully() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS value +---- +1 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_accepts_null_expected_for_empty_actual() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT '' AS value +---- +NULL + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_accepts_empty_marker_for_empty_actual() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT '' AS value 
+---- +(empty) + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_accepts_empty_marker_for_null_actual() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT CAST(NULL AS VARCHAR) AS value +---- +(empty) + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_succeeds_with_zero_actual_and_expected_rows() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS value WHERE false +---- + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.assert(&ctx).await.expect("assert should pass"); + } + + #[tokio::test] + async fn assert_propagates_query_failures() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT * FROM missing_assert_table +---- +1 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.assert(&ctx).await, + "missing_assert_table", + ); + } + + #[tokio::test] + async fn assert_reports_row_count_mismatch() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS value +---- +1 +2 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.assert(&ctx).await, + "expected 2 rows but got 1", + ); + } + + #[tokio::test] + async fn assert_reports_column_count_mismatch() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS a, 2 AS b +---- +1 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.assert(&ctx).await, + "expected 1 
columns but got 2", + ); + } + + #[tokio::test] + async fn assert_reports_value_mismatch() { + let mut benchmark = parse_benchmark( + r#" +assert I +SELECT 1 AS value +---- +2 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.assert(&ctx).await, + "expected value \"2\" but got value \"1\"", + ); + } + + // Run tests cover result buffering and physical-plan expectations. + + #[tokio::test] + async fn run_saves_uppercase_select_results() { + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_eq!(formatted_last_results(&benchmark), vec![vec!["1"]]); + } + + #[tokio::test] + async fn run_saves_with_query_results() { + let mut benchmark = + parse_benchmark("run\nWITH t AS (SELECT 3 AS value) SELECT value FROM t\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_eq!(formatted_last_results(&benchmark), vec![vec!["3"]]); + } + + #[tokio::test] + async fn run_only_keeps_last_select_or_with_result() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let query_path = write_test_file( + &temp_dir, + "queries.sql", + "SELECT 1 AS value;\nSELECT 2 AS value;\nWITH t AS (SELECT 3 AS value) SELECT value FROM t;\n", + ); + let benchmark_path = write_test_file( + &temp_dir, + "run_file.benchmark", + &format!("run {}\n", query_path.display()), + ); + let mut benchmark = parse_benchmark_file(&benchmark_path) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_eq!(formatted_last_results(&benchmark), vec![vec!["3"]]); + } + + #[tokio::test] + async fn 
run_does_not_save_results_for_non_select_statement() { + let mut benchmark = + parse_benchmark("run\nCREATE TABLE run_created AS SELECT 1 AS value;\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert!( + benchmark + .last_results + .as_ref() + .expect("last results should be set") + .is_empty() + ); + } + + #[tokio::test] + async fn run_propagates_query_failures_when_buffering_results() { + let mut benchmark = parse_benchmark("run\nSELECT * FROM missing_run_table\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.run(&ctx, true).await, + "missing_run_table", + ); + } + + #[tokio::test] + async fn run_propagates_query_failures_when_streaming_results() { + let mut benchmark = parse_benchmark("run\nSELECT * FROM missing_stream_table\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.run(&ctx, false).await, + "missing_stream_table", + ); + } + + #[tokio::test] + async fn run_rejects_missing_expect_plan_for_buffered_and_streaming_modes() { + let ctx = SessionContext::new(); + let benchmark_text = "expect_plan definitely_not_in_plan\nrun\nSELECT 1\n"; + + let mut buffered = parse_benchmark(benchmark_text) + .await + .expect("benchmark should parse"); + assert_result_error_contains( + buffered.run(&ctx, true).await, + "does not contain the expected string 'definitely_not_in_plan'", + ); + + let mut streaming = parse_benchmark(benchmark_text) + .await + .expect("benchmark should parse"); + assert_result_error_contains( + streaming.run(&ctx, false).await, + "does not contain the expected string 'definitely_not_in_plan'", + ); + } + + #[tokio::test] + async fn run_accepts_matching_expect_plan_for_buffered_and_streaming_modes() { + let ctx = SessionContext::new(); + let benchmark_text = "expect_plan 
PlaceholderRowExec\nrun\nSELECT 1\n"; + + let mut buffered = parse_benchmark(benchmark_text) + .await + .expect("benchmark should parse"); + buffered + .run(&ctx, true) + .await + .expect("buffered run should accept matching plan"); + assert_eq!(formatted_last_results(&buffered), vec![vec!["1"]]); + + let mut streaming = parse_benchmark(benchmark_text) + .await + .expect("benchmark should parse"); + streaming + .run(&ctx, false) + .await + .expect("streaming run should accept matching plan"); + } + + // Verification tests cover result_query and persisted-result comparison paths. + + #[tokio::test] + async fn verify_without_result_query_returns_ok() { + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.verify(&ctx).await.expect("verify should pass"); + } + + #[tokio::test] + async fn verify_errors_when_benchmark_has_not_run() { + let mut benchmark = parse_benchmark( + r#" +result_query I +SELECT 1 AS value +---- +1 + +run +SELECT 1; +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.verify(&ctx).await, + "No results available for verification. 
Run the benchmark first.", + ); + } + + #[tokio::test] + async fn verify_uses_last_results_for_result_file_entries() { + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value\n") + .await + .expect("benchmark should parse"); + benchmark.result_queries.push(BenchmarkQuery { + path: Some("unused.csv".to_string()), + query: String::new(), + column_count: 1, + expected_result: vec![vec!["1".to_string()]], + }); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + benchmark.verify(&ctx).await.expect("verify should pass"); + } + + #[tokio::test] + async fn verify_uses_last_results_for_zero_row_result_file_entries() { + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value WHERE false\n") + .await + .expect("benchmark should parse"); + benchmark.result_queries.push(BenchmarkQuery { + path: Some("unused.csv".to_string()), + query: String::new(), + column_count: 1, + expected_result: vec![], + }); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + benchmark.verify(&ctx).await.expect("verify should pass"); + } + + #[tokio::test] + async fn verify_executes_result_query_instead_of_last_results() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 100 AS value + +result_query I +SELECT 1 AS value +---- +1 +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + benchmark.verify(&ctx).await.expect("verify should pass"); + } + + #[tokio::test] + async fn verify_propagates_result_query_failures() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1 AS value + +result_query I +SELECT * FROM missing_verify_table +---- +1 +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + assert_result_error_contains( + benchmark.verify(&ctx).await, + 
"missing_verify_table", + ); + } + + #[tokio::test] + async fn verify_reports_result_mismatch_context() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1 AS value + +result_query I +SELECT 1 AS value +---- +2 +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.run(&ctx, true).await.expect("run should succeed"); + + let error = benchmark + .verify(&ctx) + .await + .expect_err("verify should fail"); + let message = error.to_string(); + assert!( + message.contains("row 1, column 1") + && message.contains("expected value \"2\"") + && message.contains("got value \"1\""), + "unexpected error: {message}" + ); + } + + // Persistence tests cover CSV writing and persist-time error paths. + + #[tokio::test] + async fn persist_without_result_query_returns_ok() { + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value\n") + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + benchmark.persist(&ctx).await.expect("persist should pass"); + } + + #[tokio::test] + async fn persist_rejects_result_query_without_file_path() { + let mut benchmark = parse_benchmark( + r#" +run +SELECT 1 AS value + +result_query I +SELECT 1 AS value +---- +1 +"#, + ) + .await + .expect("benchmark should parse"); + let ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.persist(&ctx).await, + "Unable to persist results from query", + ); + } + + #[tokio::test] + async fn persist_rejects_run_without_saved_result_batches() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let output_path = temp_dir.path().join("persisted"); + let mut benchmark = + parse_benchmark("run\nCREATE TABLE persist_source AS SELECT 1 AS value;\n") + .await + .expect("benchmark should parse"); + benchmark.result_queries.push(BenchmarkQuery { + path: Some(output_path.to_string_lossy().into_owned()), + query: String::new(), + column_count: 1, + expected_result: vec![], + }); + let 
ctx = SessionContext::new(); + + assert_result_error_contains( + benchmark.persist(&ctx).await, + "Results should be loaded", + ); + } + + #[tokio::test] + async fn persist_writes_header_and_pipe_delimited_rows() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let output_path = temp_dir.path().join("persisted"); + let mut benchmark = parse_benchmark("run\nSELECT 1 AS a, 'one' AS b\n") + .await + .expect("benchmark should parse"); + benchmark.result_queries.push(BenchmarkQuery { + path: Some(output_path.to_string_lossy().into_owned()), + query: String::new(), + column_count: 2, + expected_result: vec![], + }); + let ctx = SessionContext::new(); + + benchmark.persist(&ctx).await.expect("persist should pass"); + + let contents = read_all_files_in_dir(&output_path); + assert!( + contents.contains("a|b\n") && contents.contains("1|one\n"), + "unexpected persisted contents: {contents:?}" + ); + } + + #[tokio::test] + async fn persist_writes_header_for_zero_row_select_results() { + let temp_dir = tempdir().expect("failed to create benchmark test directory"); + let output_path = temp_dir.path().join("persisted_empty"); + let mut benchmark = parse_benchmark("run\nSELECT 1 AS value WHERE false\n") + .await + .expect("benchmark should parse"); + benchmark.result_queries.push(BenchmarkQuery { + path: Some(output_path.to_string_lossy().into_owned()), + query: String::new(), + column_count: 1, + expected_result: vec![], + }); + let ctx = SessionContext::new(); + + benchmark.persist(&ctx).await.expect("persist should pass"); + + let contents = read_all_files_in_dir(&output_path); + assert!( + contents.contains("value\n"), + "unexpected persisted contents: {contents:?}" + ); + } + + // Path helper tests cover group derivation from benchmark file paths. 
+ + #[test] + fn parse_group_from_path_returns_group_under_benchmark_directory() { + let group = parse_group_from_path( + Path::new("sql_benchmarks/tpch/benchmarks/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + + assert_eq!(group, "tpch"); + } + + #[test] + fn parse_group_from_path_matches_benchmark_directory_case_insensitively() { + let group = parse_group_from_path( + Path::new("/tmp/SQL_BENCHMARKS/Tpch/benchmarks/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + + assert_eq!(group, "Tpch"); + } + + #[test] + fn parse_group_from_path_handles_relative_and_absolute_paths() { + let relative = parse_group_from_path( + Path::new("sql_benchmarks/h2o/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + let absolute = parse_group_from_path( + Path::new("/tmp/sql_benchmarks/imdb/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + + assert_eq!(relative, "h2o"); + assert_eq!(absolute, "imdb"); + } + + #[test] + fn parse_group_from_path_pins_fallback_for_paths_outside_benchmark_directory() { + let group = parse_group_from_path( + Path::new("outside/group/q01.benchmark"), + Path::new("sql_benchmarks"), + ); + + assert_eq!(group, "outside"); + } + + #[test] + fn path_ends_with_ignore_ascii_case_matches_component_suffixes() { + assert!(path_ends_with_ignore_ascii_case( + Path::new("/tmp/SQL_BENCHMARKS"), + Path::new("sql_benchmarks") + )); + assert!(!path_ends_with_ignore_ascii_case( + Path::new("/tmp/sql_benchmarks_extra"), + Path::new("sql_benchmarks") + )); + } +}