From 75e60d9100ab982fc7be9203b9a9bd6d07ab7006 Mon Sep 17 00:00:00 2001
From: "Victor M. Varela" <vmvarela@gmail.com>
Date: Sat, 13 Jun 2026 09:26:27 +0200
Subject: [PATCH 1/2] feat: auto-detect input format from file extension with
 -I override

- Add input_format_explicit flag to track when -I is explicitly set
- When -I is set, it overrides file extension auto-detection for all files
- When -I is not set, auto-detect from .csv/.tsv/.json/.ndjson/.xml extensions
- Ambiguous extensions (.txt, .dat) default to CSV
- Stdin always uses -I value (no filename to inspect)
- Add 8 integration tests (157a-157h) covering auto-detection and override
- Update fixture test 14 to use file auto-detection instead of stdin + -I
- Document auto-detection and -I override in README, man page, and --help

Closes #158
---
 README.md           |  13 +++++-
 build.zig           | 100 ++++++++++++++++++++++++++++++++++++++++++--
 docs/sql-pipe.1.scd |  24 ++++++++++-
 src/args.zig        |  11 ++++-
 4 files changed, 139 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index e6a72b5..4f85a97 100644
--- a/README.md
+++ b/README.md
@@ -250,12 +250,21 @@ Pass files as positional arguments instead of piping through stdin. Each file be
 # Single file — no more cat
 $ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100'
 
+# JSON file — extension tells sql-pipe the format, no -I needed
+$ sql-pipe data.json 'SELECT * FROM data WHERE score > 80'
+
 # Multi-file join — the #1 reason people reach for DuckDB
 $ sql-pipe orders.csv customers.csv \
     'SELECT c.name, SUM(o.amount) FROM orders o
      JOIN customers c ON o.cust_id = c.id GROUP BY c.name'
 ```
 
+Use `-I` to override auto-detection when the extension is wrong or ambiguous (`.txt`, `.dat`):
+
+```sh
+$ sql-pipe -I tsv data.txt 'SELECT * FROM data'
+```
+
 Stdin still works and is always available as table `t`. Mix stdin with file arguments:
 
 ```sh
@@ -286,7 +295,7 @@ $ cat events.csv \
 |------|-------------|
 | `-d`, `--delimiter <char>` | Input field delimiter (single character, default `,`) |
 | `--tsv` | Alias for `--delimiter '\t'` |
-| `-I`, `--input-format <fmt>` | Input format: `csv` (default), `tsv`, `json`, `ndjson`, `xml` |
+| `-I`, `--input-format <fmt>` | Input format: `csv` (default), `tsv`, `json`, `ndjson`, `xml`. Overrides file extension auto-detection. |
 | `-O`, `--output-format <fmt>` | Output format: `csv` (default), `tsv`, `json`, `ndjson`, `xml` |
 | `--no-type-inference` | Treat all columns as TEXT (skip auto-detection) |
 | `-H`, `--header` | Print column names as the first output row |
@@ -563,7 +572,7 @@ The database never touches disk and vanishes when the process exits. No state, n
 
 ## Limitations
 
-- **File format auto-detection** is based on file extension. Files without a recognized extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`) fall back to the `-I` flag value (default: CSV).
+- **File format auto-detection** is based on file extension. Files without a recognized extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`) default to CSV. Use `-I` to override.
 
 ## Related
 
diff --git a/build.zig b/build.zig
index 174e702..d4ae472 100644
--- a/build.zig
+++ b/build.zig
@@ -1782,6 +1782,100 @@ pub fn build(b: *std.Build) void {
     test_table_output_file.step.dependOn(b.getInstallStep());
     test_step.dependOn(&test_table_output_file.step);
 
+    // Integration test 157a: Auto-detect .json extension without -I flag
+    const test_autodetect_json = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\result=$(./zig-out/bin/sql-pipe tests/fixtures/products.json 'SELECT name FROM products WHERE CAST(stock AS INTEGER) > 0 ORDER BY name')
+        \\expected=$(printf 'Doohickey\nGadget\nWidget')
+        \\[ "$result" = "$expected" ]
+    });
+    test_autodetect_json.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_autodetect_json.step);
+
+    // Integration test 157b: Auto-detect .ndjson extension without -I flag
+    const test_autodetect_ndjson = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson 'SELECT event, COUNT(*) FROM events GROUP BY event ORDER BY event')
+        \\expected=$(printf 'login,2\nlogout,1\npurchase,2')
+        \\[ "$result" = "$expected" ]
+    });
+    test_autodetect_ndjson.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_autodetect_ndjson.step);
+
+    // Integration test 157c: -I flag overrides file extension (.json-named file with CSV content parsed as CSV)
+    const test_override_json_to_csv = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_override.json
+        \\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe -I csv "$tmp" 'SELECT name FROM sqlpipe_test_override ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_override_json_to_csv.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_override_json_to_csv.step);
+
+    // Integration test 157d: Ambiguous .txt extension defaults to CSV
+    const test_ambiguous_txt_csv = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_ambiguous.txt
+        \\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe "$tmp" 'SELECT name FROM sqlpipe_test_ambiguous ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_ambiguous_txt_csv.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_ambiguous_txt_csv.step);
+
+    // Integration test 157e: -I override with --input-format= syntax
+    const test_override_long_flag = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_long.json
+        \\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe --input-format=csv "$tmp" 'SELECT name FROM sqlpipe_test_long ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_override_long_flag.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_override_long_flag.step);
+
+    // Integration test 157f: -I override with -I= syntax
+    const test_override_short_eq = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_short.json
+        \\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe -I=csv "$tmp" 'SELECT name FROM sqlpipe_test_short ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_override_short_eq.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_override_short_eq.step);
+
+    // Integration test 157g: Ambiguous .txt extension with -I tsv override
+    const test_ambiguous_txt_tsv = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_tsv.txt
+        \\printf 'name\tage\nAlice\t30\nBob\t25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe -I tsv "$tmp" 'SELECT name FROM sqlpipe_test_tsv ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_ambiguous_txt_tsv.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_ambiguous_txt_tsv.step);
+
+    // Integration test 157h: Auto-detect .xml extension without -I flag
+    const test_autodetect_xml = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\result=$(./zig-out/bin/sql-pipe tests/fixtures/feed.xml --xml-root channel --xml-row item 'SELECT COUNT(*) FROM feed')
+        \\[ "$result" = "3" ]
+    });
+    test_autodetect_xml.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_autodetect_xml.step);
+
     // ─── Fixture-based integration tests ─────────────────────────────────────
     // These tests use sample files committed in tests/fixtures/ to exercise
     // the binary end-to-end with realistic data across all supported formats.
@@ -1917,11 +2011,11 @@ pub fn build(b: *std.Build) void {
     });
     fixture_test_step.dependOn(&fixture_mixed_join.step);
 
-    // Fixture test 14: CSV file + NDJSON stdin mix
+    // Fixture test 14: CSV file + NDJSON file mix (auto-detected from extensions)
     const fixture_csv_ndjson_mix = b.addSystemCommand(&.{
         "bash", "-c",
-        \\result=$(cat tests/fixtures/events.ndjson | ./zig-out/bin/sql-pipe -I ndjson tests/fixtures/customers.csv \
-        \\    'SELECT c.name, e.event FROM t e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event')
+        \\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson tests/fixtures/customers.csv \
+        \\    'SELECT c.name, e.event FROM events e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event')
         \\expected=$(printf 'Alice,login\nAlice,logout\nAlice,purchase\nBob,purchase\nCarol,login')
         \\[ "$result" = "$expected" ]
     });
diff --git a/docs/sql-pipe.1.scd b/docs/sql-pipe.1.scd
index 82458e3..6d3c990 100644
--- a/docs/sql-pipe.1.scd
+++ b/docs/sql-pipe.1.scd
@@ -21,8 +21,9 @@ DESCRIPTION
 	Stdin and file arguments can be combined — stdin is always table *t*.
 
 	Input format for files is auto-detected from the file extension (*.csv*,
-	*.tsv*, *.json*, *.ndjson*, *.xml*). Files without a recognized extension
-	fall back to the *-I* flag value (default: CSV).
+	*.tsv*, *.json*, *.ndjson*, *.xml*). Unrecognized or missing extensions
+	default to CSV. Use *-I* to override auto-detection (e.g., when a
+	TSV file has a *.txt* extension).
 
 	This tool is useful for quick data transformations, filtering, grouping,
 	aggregations, and multi-file joins without manual SQL database setup.
@@ -52,6 +53,17 @@ OPTIONS
 	*--tsv*
 		Alias for *--delimiter '\\t'*. Parses tab-separated input.
 
+	*-I, --input-format* <fmt>
+		Set the input format explicitly: *csv* (default), *tsv*, *json*,
+		*ndjson*, or *xml*. When set, overrides file extension auto-detection
+		for all file arguments. Stdin always uses this value (no filename
+		to inspect). Useful when a file has an ambiguous extension (*.txt*,
+		*.dat*) or no extension at all.
+
+	*-O, --output-format* <fmt>
+		Set the output format: *csv* (default), *tsv*, *json*, *ndjson*,
+		or *xml*.
+
 	*--no-type-inference*
 		Treat all columns as TEXT. Skips automatic type detection and uses plain
 		TEXT affinity for all columns in the SQLite table. This can improve
@@ -162,6 +174,14 @@ EXAMPLES
 		    'SELECT c.name, SUM(o.amount) FROM orders o ++
 		     JOIN customers c ON o.cust_id = c.id GROUP BY c.name'
 
+	Query a JSON file (format auto-detected from extension):
+
+		$ sql-pipe data.json 'SELECT * FROM data WHERE score > 80'
+
+	Override auto-detection when the extension is wrong:
+
+		$ sql-pipe -I tsv data.txt 'SELECT * FROM data'
+
 	Mix stdin (as table t) with a file argument:
 
 		$ cat events.csv | sql-pipe users.csv ++
diff --git a/src/args.zig b/src/args.zig
index b67717d..2fdddb7 100644
--- a/src/args.zig
+++ b/src/args.zig
@@ -193,6 +193,7 @@ pub fn printUsage(writer: *std.Io.Writer) !void {
         \\  -d, --delimiter <string>     Input field delimiter for CSV: 1–8 chars (default: ,)
         \\  --tsv                        Alias for --delimiter '\t'
         \\  -I, --input-format <fmt>     Input format: csv (default), tsv, json, ndjson, xml
+        \\                               Overrides file extension auto-detection; stdin always uses this value
         \\  -O, --output-format <fmt>    Output format: csv (default), tsv, json, ndjson, xml
         \\  --json                       Alias for --output-format json
         \\  --no-type-inference          Treat all columns as TEXT (CSV input only)
@@ -237,6 +238,8 @@ pub fn printUsage(writer: *std.Io.Writer) !void {
         \\  cat data.psv | sql-pipe -d '|' 'SELECT * FROM t'
         \\  cat data.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region'
         \\  sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100'
+        \\  sql-pipe data.json 'SELECT * FROM data WHERE score > 80'
+        \\  sql-pipe -I tsv data.txt 'SELECT * FROM data'
         \\  sql-pipe orders.csv customers.csv 'SELECT c.name, SUM(o.amount) FROM orders o JOIN customers c ON o.cust_id = c.id GROUP BY c.name'
         \\  cat events.csv | sql-pipe users.csv 'SELECT * FROM t JOIN users ON t.uid = users.id'
         \\  cat data.csv | sql-pipe --output-format json 'SELECT * FROM t'
@@ -276,6 +279,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
     var delimiter: []const u8 = ",";
     var header = false;
     var input_format: InputFormat = .csv;
+    var input_format_explicit = false;
     var output_format: OutputFormat = .csv;
 
     var max_rows: ?usize = null;
@@ -341,10 +345,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             i += 1;
             if (i >= args.len) return error.InvalidInputFormat;
             input_format = InputFormat.parse(args[i]) catch return error.InvalidInputFormat;
+            input_format_explicit = true;
         } else if (std.mem.startsWith(u8, arg, "--input-format=")) {
             input_format = InputFormat.parse(arg["--input-format=".len..]) catch return error.InvalidInputFormat;
+            input_format_explicit = true;
         } else if (std.mem.startsWith(u8, arg, "-I=")) {
             input_format = InputFormat.parse(arg["-I=".len..]) catch return error.InvalidInputFormat;
+            input_format_explicit = true;
         } else if (std.mem.eql(u8, arg, "-O") or std.mem.eql(u8, arg, "--output-format")) {
             i += 1;
             if (i >= args.len) return error.InvalidOutputFormat;
@@ -445,7 +452,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             // Special modes: every positional arg is a file input
             for (pos) |p| {
                 const name = try tableNameFromPath(allocator, p);
-                const fmt = InputFormat.fromExtension(p) orelse input_format;
+                const fmt = if (input_format_explicit) input_format else (InputFormat.fromExtension(p) orelse input_format);
                 try files.append(allocator, .{
                     .path = p,
                     .table_name = name,
@@ -457,7 +464,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             query = pos[pos.len - 1];
             for (pos[0 .. pos.len - 1]) |p| {
                 const name = try tableNameFromPath(allocator, p);
-                const fmt = InputFormat.fromExtension(p) orelse input_format;
+                const fmt = if (input_format_explicit) input_format else (InputFormat.fromExtension(p) orelse input_format);
                 try files.append(allocator, .{
                     .path = p,
                     .table_name = name,

From ea8f87345d46758ccb29717ee9ecf71748dc77c5 Mon Sep 17 00:00:00 2001
From: "Victor M. Varela" <vmvarela@gmail.com>
Date: Sat, 13 Jun 2026 09:46:21 +0200
Subject: [PATCH 2/2] fix: propagate auto-detected format to special modes and
 validation checks

Compute effective_input_format as the explicit -I value when given, else the
first file argument's auto-detected format (the global default when there are
no file arguments), and use it for:
- --columns, --validate, --sample mode dispatch (Issue A)
- --json-path validation (Issue B)
- --xml-root/--xml-row name validation (Issue C)

Previously these paths used the global input_format (default CSV or
explicit -I value), causing auto-detected .tsv/.json/.xml files to be
parsed as CSV in special modes and valid --json-path invocations to be
rejected.
---
 src/args.zig | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/args.zig b/src/args.zig
index 2fdddb7..0c88a28 100644
--- a/src/args.zig
+++ b/src/args.zig
@@ -474,6 +474,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
         }
     }
 
+    // Effective input format: the explicit -I value when given; otherwise the
+    // first file's auto-detected format, or the global default (CSV) if no files.
+    const effective_input_format: InputFormat = if (files.items.len > 0)
+        (if (input_format_explicit) input_format else files.items[0].format)
+    else
+        input_format;
+
     // Check for duplicate table names (would cause conflicting table definitions)
     {
         var seen = std.StringHashMap(void).init(allocator);
@@ -533,13 +540,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
         return error.SilentVerboseConflict;
 
     // --xml-root and --xml-row must be valid XML element names (only validated in XML mode)
-    if (input_format == .xml or output_format == .xml) {
+    if (effective_input_format == .xml or output_format == .xml) {
         if (!isValidXmlName(xml_root) or !isValidXmlName(xml_row))
             return error.InvalidXmlName;
     }
 
-    // --json-path requires -I json (the flag only applies to JSON object navigation)
-    if (json_path != null and input_format != .json)
+    // --json-path requires JSON input (the flag only applies to JSON object navigation)
+    if (json_path != null and effective_input_format != .json)
         return error.JsonPathRequiresJson;
 
     // --table requires CSV or TSV output format (table formatting is visual only)
@@ -552,7 +559,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             .files = files.items,
             .delimiter = delimiter,
             .verbose = verbose,
-            .input_format = input_format,
+            .input_format = effective_input_format,
             .xml_root_input = xml_root_input,
             .xml_row_input = xml_row_input,
             .json_path = json_path,
@@ -564,7 +571,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             .files = files.items,
             .delimiter = delimiter,
             .type_inference = type_inference,
-            .input_format = input_format,
+            .input_format = effective_input_format,
             .xml_root_input = xml_root_input,
             .xml_row_input = xml_row_input,
             .json_path = json_path,
@@ -575,7 +582,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
         return .{ .sample = SampleArgs{
             .files = files.items,
             .delimiter = delimiter,
-            .input_format = input_format,
+            .input_format = effective_input_format,
             .n = sample_n,
             .type_inference = type_inference,
         } };