vmvarela · vmvarela · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/README.md b/README.md
@@ -250,12 +250,21 @@ Pass files as positional arguments instead of piping through stdin. Each file be
 # Single file — no more cat
 $ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100'
 
+# JSON file — extension tells sql-pipe the format, no -I needed
+$ sql-pipe data.json 'SELECT * FROM data WHERE score > 80'
+
 # Multi-file join — the #1 reason people reach for DuckDB
 $ sql-pipe orders.csv customers.csv \
     'SELECT c.name, SUM(o.amount) FROM orders o
      JOIN customers c ON o.cust_id = c.id GROUP BY c.name'
 ```
 
+Use `-I` to override auto-detection when the extension is wrong or ambiguous (`.txt`, `.dat`):
+
+```sh
+$ sql-pipe -I tsv data.txt 'SELECT * FROM data'
+```
+
 Stdin still works and is always available as table `t`. Mix stdin with file arguments:
 
 ```sh
@@ -286,7 +295,7 @@ $ cat events.csv \
 |------|-------------|
 | `-d`, `--delimiter <char>` | Input field delimiter (single character, default `,`) |
 | `--tsv` | Alias for `--delimiter '\t'` |
-| `-I`, `--input-format <fmt>` | Input format: `csv` (default), `tsv`, `json`, `ndjson`, `xml` |
+| `-I`, `--input-format <fmt>` | Input format: `csv` (default), `tsv`, `json`, `ndjson`, `xml`. Overrides file extension auto-detection. |
 | `-O`, `--output-format <fmt>` | Output format: `csv` (default), `tsv`, `json`, `ndjson`, `xml` |
 | `--no-type-inference` | Treat all columns as TEXT (skip auto-detection) |
 | `-H`, `--header` | Print column names as the first output row |
@@ -563,7 +572,7 @@ The database never touches disk and vanishes when the process exits. No state, n
 
 ## Limitations
 
-- **File format auto-detection** is based on file extension. Files without a recognized extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`) fall back to the `-I` flag value (default: CSV).
+- **File format auto-detection** is based on file extension. Files without a recognized extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`) default to CSV. Use `-I` to override.
 
 ## Related
 

diff --git a/build.zig b/build.zig
@@ -1782,6 +1782,100 @@ pub fn build(b: *std.Build) void {
     test_table_output_file.step.dependOn(b.getInstallStep());
     test_step.dependOn(&test_table_output_file.step);
 
+    // Integration test 157a: Auto-detect .json extension without -I flag
+    const test_autodetect_json = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\result=$(./zig-out/bin/sql-pipe tests/fixtures/products.json 'SELECT name FROM products WHERE CAST(stock AS INTEGER) > 0 ORDER BY name')
+        \\expected=$(printf 'Doohickey\nGadget\nWidget')
+        \\[ "$result" = "$expected" ]
+    });
+    test_autodetect_json.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_autodetect_json.step);
+
+    // Integration test 157b: Auto-detect .ndjson extension without -I flag
+    const test_autodetect_ndjson = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson 'SELECT event, COUNT(*) FROM events GROUP BY event ORDER BY event')
+        \\expected=$(printf 'login,2\nlogout,1\npurchase,2')
+        \\[ "$result" = "$expected" ]
+    });
+    test_autodetect_ndjson.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_autodetect_ndjson.step);
+
+    // Integration test 157c: -I flag overrides file extension (CSV data in a .json file, forced to CSV)
+    const test_override_json_to_csv = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_override.json
+        \\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe -I csv "$tmp" 'SELECT name FROM sqlpipe_test_override ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_override_json_to_csv.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_override_json_to_csv.step);
+
+    // Integration test 157d: Ambiguous .txt extension defaults to CSV
+    const test_ambiguous_txt_csv = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_ambiguous.txt
+        \\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe "$tmp" 'SELECT name FROM sqlpipe_test_ambiguous ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_ambiguous_txt_csv.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_ambiguous_txt_csv.step);
+
+    // Integration test 157e: -I override with --input-format= syntax
+    const test_override_long_flag = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_long.json
+        \\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe --input-format=csv "$tmp" 'SELECT name FROM sqlpipe_test_long ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_override_long_flag.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_override_long_flag.step);
+
+    // Integration test 157f: -I override with -I= syntax
+    const test_override_short_eq = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_short.json
+        \\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe -I=csv "$tmp" 'SELECT name FROM sqlpipe_test_short ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_override_short_eq.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_override_short_eq.step);
+
+    // Integration test 157g: Ambiguous .txt extension with -I tsv override
+    const test_ambiguous_txt_tsv = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\tmp=/tmp/sqlpipe_test_tsv.txt
+        \\printf 'name\tage\nAlice\t30\nBob\t25' > "$tmp"
+        \\result=$(./zig-out/bin/sql-pipe -I tsv "$tmp" 'SELECT name FROM sqlpipe_test_tsv ORDER BY name')
+        \\rm -f "$tmp"
+        \\expected=$(printf 'Alice\nBob')
+        \\[ "$result" = "$expected" ]
+    });
+    test_ambiguous_txt_tsv.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_ambiguous_txt_tsv.step);
+
+    // Integration test 157h: Auto-detect .xml extension without -I flag
+    const test_autodetect_xml = b.addSystemCommand(&.{
+        "bash", "-c",
+        \\result=$(./zig-out/bin/sql-pipe tests/fixtures/feed.xml --xml-root channel --xml-row item 'SELECT COUNT(*) FROM feed')
+        \\[ "$result" = "3" ]
+    });
+    test_autodetect_xml.step.dependOn(b.getInstallStep());
+    test_step.dependOn(&test_autodetect_xml.step);
+
     // ─── Fixture-based integration tests ─────────────────────────────────────
     // These tests use sample files committed in tests/fixtures/ to exercise
     // the binary end-to-end with realistic data across all supported formats.
@@ -1917,11 +2011,11 @@ pub fn build(b: *std.Build) void {
     });
     fixture_test_step.dependOn(&fixture_mixed_join.step);
 
-    // Fixture test 14: CSV file + NDJSON stdin mix
+    // Fixture test 14: CSV file + NDJSON file mix (auto-detected from extensions)
     const fixture_csv_ndjson_mix = b.addSystemCommand(&.{
         "bash", "-c",
-        \\result=$(cat tests/fixtures/events.ndjson | ./zig-out/bin/sql-pipe -I ndjson tests/fixtures/customers.csv \
-        \\    'SELECT c.name, e.event FROM t e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event')
+        \\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson tests/fixtures/customers.csv \
+        \\    'SELECT c.name, e.event FROM events e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event')
         \\expected=$(printf 'Alice,login\nAlice,logout\nAlice,purchase\nBob,purchase\nCarol,login')
         \\[ "$result" = "$expected" ]
     });

diff --git a/docs/sql-pipe.1.scd b/docs/sql-pipe.1.scd
@@ -21,8 +21,9 @@ DESCRIPTION
 	Stdin and file arguments can be combined — stdin is always table *t*.
 
 	Input format for files is auto-detected from the file extension (*.csv*,
-	*.tsv*, *.json*, *.ndjson*, *.xml*). Files without a recognized extension
-	fall back to the *-I* flag value (default: CSV).
+	*.tsv*, *.json*, *.ndjson*, *.xml*). Unrecognized or missing extensions
+	default to CSV. Use *-I* to override auto-detection (e.g., when a
+	TSV file has a *.txt* extension).
 
 	This tool is useful for quick data transformations, filtering, grouping,
 	aggregations, and multi-file joins without manual SQL database setup.
@@ -52,6 +53,17 @@ OPTIONS
 	*--tsv*
 		Alias for *--delimiter '\\t'*. Parses tab-separated input.
 
+	*-I, --input-format* <fmt>
+		Set the input format explicitly: *csv* (default), *tsv*, *json*,
+		*ndjson*, or *xml*. When set, overrides file extension auto-detection
+		for all file arguments. Stdin always uses this value (no filename
+		to inspect). Useful when a file has an ambiguous extension (*.txt*,
+		*.dat*) or no extension at all.
+
+	*-O, --output-format* <fmt>
+		Set the output format: *csv* (default), *tsv*, *json*, *ndjson*,
+		or *xml*.
+
 	*--no-type-inference*
 		Treat all columns as TEXT. Skips automatic type detection and uses plain
 		TEXT affinity for all columns in the SQLite table. This can improve
@@ -162,6 +174,14 @@ EXAMPLES
 		    'SELECT c.name, SUM(o.amount) FROM orders o ++
 		     JOIN customers c ON o.cust_id = c.id GROUP BY c.name'
 
+	Query a JSON file (format auto-detected from extension):
+
+		$ sql-pipe data.json 'SELECT * FROM data WHERE score > 80'
+
+	Override auto-detection when the extension is wrong:
+
+		$ sql-pipe -I tsv data.txt 'SELECT * FROM data'
+
 	Mix stdin (as table t) with a file argument:
 
 		$ cat events.csv | sql-pipe users.csv ++

diff --git a/src/args.zig b/src/args.zig
@@ -193,6 +193,7 @@ pub fn printUsage(writer: *std.Io.Writer) !void {
         \\  -d, --delimiter <string>     Input field delimiter for CSV: 1–8 chars (default: ,)
         \\  --tsv                        Alias for --delimiter '\t'
         \\  -I, --input-format <fmt>     Input format: csv (default), tsv, json, ndjson, xml
+        \\                               Overrides file extension auto-detection; stdin always uses this value
         \\  -O, --output-format <fmt>    Output format: csv (default), tsv, json, ndjson, xml
         \\  --json                       Alias for --output-format json
         \\  --no-type-inference          Treat all columns as TEXT (CSV input only)
@@ -237,6 +238,8 @@ pub fn printUsage(writer: *std.Io.Writer) !void {
         \\  cat data.psv | sql-pipe -d '|' 'SELECT * FROM t'
         \\  cat data.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region'
         \\  sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100'
+        \\  sql-pipe data.json 'SELECT * FROM data WHERE score > 80'
+        \\  sql-pipe -I tsv data.txt 'SELECT * FROM data'
         \\  sql-pipe orders.csv customers.csv 'SELECT c.name, SUM(o.amount) FROM orders o JOIN customers c ON o.cust_id = c.id GROUP BY c.name'
         \\  cat events.csv | sql-pipe users.csv 'SELECT * FROM t JOIN users ON t.uid = users.id'
         \\  cat data.csv | sql-pipe --output-format json 'SELECT * FROM t'
@@ -276,6 +279,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
     var delimiter: []const u8 = ",";
     var header = false;
     var input_format: InputFormat = .csv;
+    var input_format_explicit = false;
     var output_format: OutputFormat = .csv;
 
     var max_rows: ?usize = null;
@@ -341,10 +345,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             i += 1;
             if (i >= args.len) return error.InvalidInputFormat;
             input_format = InputFormat.parse(args[i]) catch return error.InvalidInputFormat;
+            input_format_explicit = true;
         } else if (std.mem.startsWith(u8, arg, "--input-format=")) {
             input_format = InputFormat.parse(arg["--input-format=".len..]) catch return error.InvalidInputFormat;
+            input_format_explicit = true;
         } else if (std.mem.startsWith(u8, arg, "-I=")) {
             input_format = InputFormat.parse(arg["-I=".len..]) catch return error.InvalidInputFormat;
+            input_format_explicit = true;
         } else if (std.mem.eql(u8, arg, "-O") or std.mem.eql(u8, arg, "--output-format")) {
             i += 1;
             if (i >= args.len) return error.InvalidOutputFormat;
@@ -445,7 +452,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             // Special modes: every positional arg is a file input
             for (pos) |p| {
                 const name = try tableNameFromPath(allocator, p);
-                const fmt = InputFormat.fromExtension(p) orelse input_format;
+                const fmt = if (input_format_explicit) input_format else (InputFormat.fromExtension(p) orelse input_format);
                 try files.append(allocator, .{
                     .path = p,
                     .table_name = name,
@@ -457,7 +464,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             query = pos[pos.len - 1];
             for (pos[0 .. pos.len - 1]) |p| {
                 const name = try tableNameFromPath(allocator, p);
-                const fmt = InputFormat.fromExtension(p) orelse input_format;
+                const fmt = if (input_format_explicit) input_format else (InputFormat.fromExtension(p) orelse input_format);
                 try files.append(allocator, .{
                     .path = p,
                     .table_name = name,
@@ -467,6 +474,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
         }
     }
 
+    // Effective input format: an explicit -I value always wins; otherwise the
+    // first file's detected format is used, or the default (CSV) with no files.
+    const effective_input_format: InputFormat = if (files.items.len > 0)
+        (if (input_format_explicit) input_format else files.items[0].format)
+    else
+        input_format;
+
     // Check for duplicate table names (would cause conflicting table definitions)
     {
         var seen = std.StringHashMap(void).init(allocator);
@@ -526,13 +540,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
         return error.SilentVerboseConflict;
 
     // --xml-root and --xml-row must be valid XML element names (only validated in XML mode)
-    if (input_format == .xml or output_format == .xml) {
+    if (effective_input_format == .xml or output_format == .xml) {
         if (!isValidXmlName(xml_root) or !isValidXmlName(xml_row))
             return error.InvalidXmlName;
     }
 
-    // --json-path requires -I json (the flag only applies to JSON object navigation)
-    if (json_path != null and input_format != .json)
+    // --json-path requires JSON input (the flag only applies to JSON object navigation)
+    if (json_path != null and effective_input_format != .json)
         return error.JsonPathRequiresJson;
 
     // --table requires CSV or TSV output format (table formatting is visual only)
@@ -545,7 +559,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             .files = files.items,
             .delimiter = delimiter,
             .verbose = verbose,
-            .input_format = input_format,
+            .input_format = effective_input_format,
             .xml_root_input = xml_root_input,
             .xml_row_input = xml_row_input,
             .json_path = json_path,
@@ -557,7 +571,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
             .files = files.items,
             .delimiter = delimiter,
             .type_inference = type_inference,
-            .input_format = input_format,
+            .input_format = effective_input_format,
             .xml_root_input = xml_root_input,
             .xml_row_input = xml_row_input,
             .json_path = json_path,
@@ -568,7 +582,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
         return .{ .sample = SampleArgs{
             .files = files.items,
             .delimiter = delimiter,
-            .input_format = input_format,
+            .input_format = effective_input_format,
             .n = sample_n,
             .type_inference = type_inference,
         } };