From 75e60d9100ab982fc7be9203b9a9bd6d07ab7006 Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Sat, 13 Jun 2026 09:26:27 +0200 Subject: [PATCH 1/2] feat: auto-detect input format from file extension with -I override - Add input_format_explicit flag to track when -I is explicitly set - When -I is set, it overrides file extension auto-detection for all files - When -I is not set, auto-detect from .csv/.tsv/.json/.ndjson/.xml extensions - Ambiguous extensions (.txt, .dat) default to CSV - Stdin always uses -I value (no filename to inspect) - Add 8 integration tests (157a-157h) covering auto-detection and override - Update fixture test 14 to use file auto-detection instead of stdin + -I - Document auto-detection and -I override in README, man page, and --help Closes #158 --- README.md | 13 +++++- build.zig | 100 ++++++++++++++++++++++++++++++++++++++++++-- docs/sql-pipe.1.scd | 24 ++++++++++- src/args.zig | 11 ++++- 4 files changed, 139 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index e6a72b5..4f85a97 100644 --- a/README.md +++ b/README.md @@ -250,12 +250,21 @@ Pass files as positional arguments instead of piping through stdin. Each file be # Single file — no more cat $ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100' +# JSON file — extension tells sql-pipe the format, no -I needed +$ sql-pipe data.json 'SELECT * FROM data WHERE score > 80' + # Multi-file join — the #1 reason people reach for DuckDB $ sql-pipe orders.csv customers.csv \ 'SELECT c.name, SUM(o.amount) FROM orders o JOIN customers c ON o.cust_id = c.id GROUP BY c.name' ``` +Use `-I` to override auto-detection when the extension is wrong or ambiguous (`.txt`, `.dat`): + +```sh +$ sql-pipe -I tsv data.txt 'SELECT * FROM data' +``` + Stdin still works and is always available as table `t`. 
Mix stdin with file arguments: ```sh @@ -286,7 +295,7 @@ $ cat events.csv \ |------|-------------| | `-d`, `--delimiter ` | Input field delimiter (single character, default `,`) | | `--tsv` | Alias for `--delimiter '\t'` | -| `-I`, `--input-format ` | Input format: `csv` (default), `tsv`, `json`, `ndjson`, `xml` | +| `-I`, `--input-format ` | Input format: `csv` (default), `tsv`, `json`, `ndjson`, `xml`. Overrides file extension auto-detection. | | `-O`, `--output-format ` | Output format: `csv` (default), `tsv`, `json`, `ndjson`, `xml` | | `--no-type-inference` | Treat all columns as TEXT (skip auto-detection) | | `-H`, `--header` | Print column names as the first output row | @@ -563,7 +572,7 @@ The database never touches disk and vanishes when the process exits. No state, n ## Limitations -- **File format auto-detection** is based on file extension. Files without a recognized extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`) fall back to the `-I` flag value (default: CSV). +- **File format auto-detection** is based on file extension. Files without a recognized extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`) default to CSV. Use `-I` to override. 
## Related diff --git a/build.zig b/build.zig index 174e702..d4ae472 100644 --- a/build.zig +++ b/build.zig @@ -1782,6 +1782,100 @@ pub fn build(b: *std.Build) void { test_table_output_file.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_table_output_file.step); + // Integration test 157a: Auto-detect .json extension without -I flag + const test_autodetect_json = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/products.json 'SELECT name FROM products WHERE CAST(stock AS INTEGER) > 0 ORDER BY name') + \\expected=$(printf 'Doohickey\nGadget\nWidget') + \\[ "$result" = "$expected" ] + }); + test_autodetect_json.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_autodetect_json.step); + + // Integration test 157b: Auto-detect .ndjson extension without -I flag + const test_autodetect_ndjson = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson 'SELECT event, COUNT(*) FROM events GROUP BY event ORDER BY event') + \\expected=$(printf 'login,2\nlogout,1\npurchase,2') + \\[ "$result" = "$expected" ] + }); + test_autodetect_ndjson.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_autodetect_ndjson.step); + + // Integration test 157c: -I flag overrides file extension (JSON file forced to CSV) + const test_override_json_to_csv = b.addSystemCommand(&.{ + "bash", "-c", + \\tmp=/tmp/sqlpipe_test_override.json + \\printf 'name,age\nAlice,30\nBob,25' > "$tmp" + \\result=$(./zig-out/bin/sql-pipe -I csv "$tmp" 'SELECT name FROM sqlpipe_test_override ORDER BY name') + \\rm -f "$tmp" + \\expected=$(printf 'Alice\nBob') + \\[ "$result" = "$expected" ] + }); + test_override_json_to_csv.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_override_json_to_csv.step); + + // Integration test 157d: Ambiguous .txt extension defaults to CSV + const test_ambiguous_txt_csv = b.addSystemCommand(&.{ + "bash", "-c", + \\tmp=/tmp/sqlpipe_test_ambiguous.txt + 
\\printf 'name,age\nAlice,30\nBob,25' > "$tmp" + \\result=$(./zig-out/bin/sql-pipe "$tmp" 'SELECT name FROM sqlpipe_test_ambiguous ORDER BY name') + \\rm -f "$tmp" + \\expected=$(printf 'Alice\nBob') + \\[ "$result" = "$expected" ] + }); + test_ambiguous_txt_csv.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_ambiguous_txt_csv.step); + + // Integration test 157e: -I override with --input-format= syntax + const test_override_long_flag = b.addSystemCommand(&.{ + "bash", "-c", + \\tmp=/tmp/sqlpipe_test_long.json + \\printf 'name,age\nAlice,30\nBob,25' > "$tmp" + \\result=$(./zig-out/bin/sql-pipe --input-format=csv "$tmp" 'SELECT name FROM sqlpipe_test_long ORDER BY name') + \\rm -f "$tmp" + \\expected=$(printf 'Alice\nBob') + \\[ "$result" = "$expected" ] + }); + test_override_long_flag.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_override_long_flag.step); + + // Integration test 157f: -I override with -I= syntax + const test_override_short_eq = b.addSystemCommand(&.{ + "bash", "-c", + \\tmp=/tmp/sqlpipe_test_short.json + \\printf 'name,age\nAlice,30\nBob,25' > "$tmp" + \\result=$(./zig-out/bin/sql-pipe -I=csv "$tmp" 'SELECT name FROM sqlpipe_test_short ORDER BY name') + \\rm -f "$tmp" + \\expected=$(printf 'Alice\nBob') + \\[ "$result" = "$expected" ] + }); + test_override_short_eq.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_override_short_eq.step); + + // Integration test 157g: Ambiguous .txt extension with -I tsv override + const test_ambiguous_txt_tsv = b.addSystemCommand(&.{ + "bash", "-c", + \\tmp=/tmp/sqlpipe_test_tsv.txt + \\printf 'name\tage\nAlice\t30\nBob\t25' > "$tmp" + \\result=$(./zig-out/bin/sql-pipe -I tsv "$tmp" 'SELECT name FROM sqlpipe_test_tsv ORDER BY name') + \\rm -f "$tmp" + \\expected=$(printf 'Alice\nBob') + \\[ "$result" = "$expected" ] + }); + test_ambiguous_txt_tsv.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_ambiguous_txt_tsv.step); + + // Integration test 157h: Auto-detect 
.xml extension without -I flag + const test_autodetect_xml = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/feed.xml --xml-root channel --xml-row item 'SELECT COUNT(*) FROM feed') + \\[ "$result" = "3" ] + }); + test_autodetect_xml.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_autodetect_xml.step); + // ─── Fixture-based integration tests ───────────────────────────────────── // These tests use sample files committed in tests/fixtures/ to exercise // the binary end-to-end with realistic data across all supported formats. @@ -1917,11 +2011,11 @@ pub fn build(b: *std.Build) void { }); fixture_test_step.dependOn(&fixture_mixed_join.step); - // Fixture test 14: CSV file + NDJSON stdin mix + // Fixture test 14: CSV file + NDJSON file mix (auto-detected from extensions) const fixture_csv_ndjson_mix = b.addSystemCommand(&.{ "bash", "-c", - \\result=$(cat tests/fixtures/events.ndjson | ./zig-out/bin/sql-pipe -I ndjson tests/fixtures/customers.csv \ - \\ 'SELECT c.name, e.event FROM t e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event') + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson tests/fixtures/customers.csv \ + \\ 'SELECT c.name, e.event FROM events e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event') \\expected=$(printf 'Alice,login\nAlice,logout\nAlice,purchase\nBob,purchase\nCarol,login') \\[ "$result" = "$expected" ] }); diff --git a/docs/sql-pipe.1.scd b/docs/sql-pipe.1.scd index 82458e3..6d3c990 100644 --- a/docs/sql-pipe.1.scd +++ b/docs/sql-pipe.1.scd @@ -21,8 +21,9 @@ DESCRIPTION Stdin and file arguments can be combined — stdin is always table *t*. Input format for files is auto-detected from the file extension (*.csv*, - *.tsv*, *.json*, *.ndjson*, *.xml*). Files without a recognized extension - fall back to the *-I* flag value (default: CSV). + *.tsv*, *.json*, *.ndjson*, *.xml*). Unrecognized or missing extensions + default to CSV. 
Use *-I* to override auto-detection (e.g., when a + TSV file has a *.txt* extension). This tool is useful for quick data transformations, filtering, grouping, aggregations, and multi-file joins without manual SQL database setup. @@ -52,6 +53,17 @@ OPTIONS *--tsv* Alias for *--delimiter '\\t'*. Parses tab-separated input. + *-I, --input-format* + Set the input format explicitly: *csv* (default), *tsv*, *json*, + *ndjson*, or *xml*. When set, overrides file extension auto-detection + for all file arguments. Stdin always uses this value (no filename + to inspect). Useful when a file has an ambiguous extension (*.txt*, + *.dat*) or no extension at all. + + *-O, --output-format* + Set the output format: *csv* (default), *tsv*, *json*, *ndjson*, + or *xml*. + *--no-type-inference* Treat all columns as TEXT. Skips automatic type detection and uses plain TEXT affinity for all columns in the SQLite table. This can improve @@ -162,6 +174,14 @@ EXAMPLES 'SELECT c.name, SUM(o.amount) FROM orders o ++ JOIN customers c ON o.cust_id = c.id GROUP BY c.name' + Query a JSON file (format auto-detected from extension): + + $ sql-pipe data.json 'SELECT * FROM data WHERE score > 80' + + Override auto-detection when the extension is wrong: + + $ sql-pipe -I tsv data.txt 'SELECT * FROM data' + Mix stdin (as table t) with a file argument: $ cat events.csv | sql-pipe users.csv ++ diff --git a/src/args.zig b/src/args.zig index b67717d..2fdddb7 100644 --- a/src/args.zig +++ b/src/args.zig @@ -193,6 +193,7 @@ pub fn printUsage(writer: *std.Io.Writer) !void { \\ -d, --delimiter Input field delimiter for CSV: 1–8 chars (default: ,) \\ --tsv Alias for --delimiter '\t' \\ -I, --input-format Input format: csv (default), tsv, json, ndjson, xml + \\ Overrides file extension auto-detection; stdin always uses this value \\ -O, --output-format Output format: csv (default), tsv, json, ndjson, xml \\ --json Alias for --output-format json \\ --no-type-inference Treat all columns as TEXT (CSV input only) @@ 
-237,6 +238,8 @@ pub fn printUsage(writer: *std.Io.Writer) !void { \\ cat data.psv | sql-pipe -d '|' 'SELECT * FROM t' \\ cat data.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region' \\ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100' + \\ sql-pipe data.json 'SELECT * FROM data WHERE score > 80' + \\ sql-pipe -I tsv data.txt 'SELECT * FROM data' \\ sql-pipe orders.csv customers.csv 'SELECT c.name, SUM(o.amount) FROM orders o JOIN customers c ON o.cust_id = c.id GROUP BY c.name' \\ cat events.csv | sql-pipe users.csv 'SELECT * FROM t JOIN users ON t.uid = users.id' \\ cat data.csv | sql-pipe --output-format json 'SELECT * FROM t' @@ -276,6 +279,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP var delimiter: []const u8 = ","; var header = false; var input_format: InputFormat = .csv; + var input_format_explicit = false; var output_format: OutputFormat = .csv; var max_rows: ?usize = null; @@ -341,10 +345,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP i += 1; if (i >= args.len) return error.InvalidInputFormat; input_format = InputFormat.parse(args[i]) catch return error.InvalidInputFormat; + input_format_explicit = true; } else if (std.mem.startsWith(u8, arg, "--input-format=")) { input_format = InputFormat.parse(arg["--input-format=".len..]) catch return error.InvalidInputFormat; + input_format_explicit = true; } else if (std.mem.startsWith(u8, arg, "-I=")) { input_format = InputFormat.parse(arg["-I=".len..]) catch return error.InvalidInputFormat; + input_format_explicit = true; } else if (std.mem.eql(u8, arg, "-O") or std.mem.eql(u8, arg, "--output-format")) { i += 1; if (i >= args.len) return error.InvalidOutputFormat; @@ -445,7 +452,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP // Special modes: every positional arg is a file input for (pos) |p| { const name = try tableNameFromPath(allocator, p); - const fmt = 
InputFormat.fromExtension(p) orelse input_format; + const fmt = if (input_format_explicit) input_format else (InputFormat.fromExtension(p) orelse input_format); try files.append(allocator, .{ .path = p, .table_name = name, @@ -457,7 +464,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP query = pos[pos.len - 1]; for (pos[0 .. pos.len - 1]) |p| { const name = try tableNameFromPath(allocator, p); - const fmt = InputFormat.fromExtension(p) orelse input_format; + const fmt = if (input_format_explicit) input_format else (InputFormat.fromExtension(p) orelse input_format); try files.append(allocator, .{ .path = p, .table_name = name, From ea8f87345d46758ccb29717ee9ecf71748dc77c5 Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Sat, 13 Jun 2026 09:46:21 +0200 Subject: [PATCH 2/2] fix: propagate auto-detected format to special modes and validation checks Compute effective_input_format from per-file auto-detection when a file argument is present, and use it for: - --columns, --validate, --sample mode dispatch (Issue A) - --json-path validation (Issue B) - --xml-root/--xml-row name validation (Issue C) Previously these paths used the global input_format (default CSV or explicit -I value), causing auto-detected .tsv/.json/.xml files to be parsed as CSV in special modes and valid --json-path invocations to be rejected. --- src/args.zig | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/args.zig b/src/args.zig index 2fdddb7..0c88a28 100644 --- a/src/args.zig +++ b/src/args.zig @@ -474,6 +474,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP } } + // Effective input format: the format of the FIRST file argument (explicit -I value, or auto-detected) + // when any file is present; otherwise the -I value (default CSV). Later files are not consulted. 
+ const effective_input_format: InputFormat = if (files.items.len > 0) + (if (input_format_explicit) input_format else files.items[0].format) + else + input_format; + // Check for duplicate table names (would cause conflicting table definitions) { var seen = std.StringHashMap(void).init(allocator); @@ -533,13 +540,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP return error.SilentVerboseConflict; // --xml-root and --xml-row must be valid XML element names (only validated in XML mode) - if (input_format == .xml or output_format == .xml) { + if (effective_input_format == .xml or output_format == .xml) { if (!isValidXmlName(xml_root) or !isValidXmlName(xml_row)) return error.InvalidXmlName; } - // --json-path requires -I json (the flag only applies to JSON object navigation) - if (json_path != null and input_format != .json) + // --json-path requires JSON input (the flag only applies to JSON object navigation) + if (json_path != null and effective_input_format != .json) return error.JsonPathRequiresJson; // --table requires CSV or TSV output format (table formatting is visual only) @@ -552,7 +559,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP .files = files.items, .delimiter = delimiter, .verbose = verbose, - .input_format = input_format, + .input_format = effective_input_format, .xml_root_input = xml_root_input, .xml_row_input = xml_row_input, .json_path = json_path, @@ -564,7 +571,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP .files = files.items, .delimiter = delimiter, .type_inference = type_inference, - .input_format = input_format, + .input_format = effective_input_format, .xml_root_input = xml_root_input, .xml_row_input = xml_row_input, .json_path = json_path, @@ -575,7 +582,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP return .{ .sample = SampleArgs{ .files = files.items, .delimiter = delimiter, - 
.input_format = input_format, + .input_format = effective_input_format, .n = sample_n, .type_inference = type_inference, } };