Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,12 +250,21 @@ Pass files as positional arguments instead of piping through stdin. Each file be
# Single file — no more cat
$ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100'

# JSON file — extension tells sql-pipe the format, no -I needed
$ sql-pipe data.json 'SELECT * FROM data WHERE score > 80'

# Multi-file join — the #1 reason people reach for DuckDB
$ sql-pipe orders.csv customers.csv \
'SELECT c.name, SUM(o.amount) FROM orders o
JOIN customers c ON o.cust_id = c.id GROUP BY c.name'
```

Use `-I` to override auto-detection when the extension is wrong or ambiguous (`.txt`, `.dat`):

```sh
$ sql-pipe -I tsv data.txt 'SELECT * FROM data'
```

Stdin still works and is always available as table `t`. Mix stdin with file arguments:

```sh
Expand Down Expand Up @@ -286,7 +295,7 @@ $ cat events.csv \
|------|-------------|
| `-d`, `--delimiter <char>` | Input field delimiter (single character, default `,`) |
| `--tsv` | Alias for `--delimiter '\t'` |
| `-I`, `--input-format <fmt>` | Input format: `csv` (default), `tsv`, `json`, `ndjson`, `xml` |
| `-I`, `--input-format <fmt>` | Input format: `csv` (default), `tsv`, `json`, `ndjson`, `xml`. Overrides file extension auto-detection. |
| `-O`, `--output-format <fmt>` | Output format: `csv` (default), `tsv`, `json`, `ndjson`, `xml` |
| `--no-type-inference` | Treat all columns as TEXT (skip auto-detection) |
| `-H`, `--header` | Print column names as the first output row |
Expand Down Expand Up @@ -563,7 +572,7 @@ The database never touches disk and vanishes when the process exits. No state, n

## Limitations

- **File format auto-detection** is based on file extension. Files without a recognized extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`) fall back to the `-I` flag value (default: CSV).
- **File format auto-detection** is based on file extension. The recognized extensions are `.csv`, `.tsv`, `.json`, `.ndjson`, and `.xml`; files with any other extension, or with no extension, are parsed as CSV. Use `-I` to override.

## Related

Expand Down
100 changes: 97 additions & 3 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -1782,6 +1782,100 @@ pub fn build(b: *std.Build) void {
test_table_output_file.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_table_output_file.step);

// Integration test 157a: Auto-detect .json extension without -I flag
const test_autodetect_json = b.addSystemCommand(&.{
"bash", "-c",
\\result=$(./zig-out/bin/sql-pipe tests/fixtures/products.json 'SELECT name FROM products WHERE CAST(stock AS INTEGER) > 0 ORDER BY name')
\\expected=$(printf 'Doohickey\nGadget\nWidget')
\\[ "$result" = "$expected" ]
});
test_autodetect_json.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_autodetect_json.step);

// Integration test 157b: Auto-detect .ndjson extension without -I flag
const test_autodetect_ndjson = b.addSystemCommand(&.{
"bash", "-c",
\\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson 'SELECT event, COUNT(*) FROM events GROUP BY event ORDER BY event')
\\expected=$(printf 'login,2\nlogout,1\npurchase,2')
\\[ "$result" = "$expected" ]
});
test_autodetect_ndjson.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_autodetect_ndjson.step);

// Integration test 157c: -I flag overrides file extension (CSV data in a file named .json is parsed as CSV)
const test_override_json_to_csv = b.addSystemCommand(&.{
"bash", "-c",
\\tmp=/tmp/sqlpipe_test_override.json
\\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
\\result=$(./zig-out/bin/sql-pipe -I csv "$tmp" 'SELECT name FROM sqlpipe_test_override ORDER BY name')
\\rm -f "$tmp"
\\expected=$(printf 'Alice\nBob')
\\[ "$result" = "$expected" ]
});
test_override_json_to_csv.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_override_json_to_csv.step);

// Integration test 157d: Ambiguous .txt extension defaults to CSV
const test_ambiguous_txt_csv = b.addSystemCommand(&.{
"bash", "-c",
\\tmp=/tmp/sqlpipe_test_ambiguous.txt
\\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
\\result=$(./zig-out/bin/sql-pipe "$tmp" 'SELECT name FROM sqlpipe_test_ambiguous ORDER BY name')
\\rm -f "$tmp"
\\expected=$(printf 'Alice\nBob')
\\[ "$result" = "$expected" ]
});
test_ambiguous_txt_csv.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_ambiguous_txt_csv.step);

// Integration test 157e: -I override with --input-format= syntax
const test_override_long_flag = b.addSystemCommand(&.{
"bash", "-c",
\\tmp=/tmp/sqlpipe_test_long.json
\\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
\\result=$(./zig-out/bin/sql-pipe --input-format=csv "$tmp" 'SELECT name FROM sqlpipe_test_long ORDER BY name')
\\rm -f "$tmp"
\\expected=$(printf 'Alice\nBob')
\\[ "$result" = "$expected" ]
});
test_override_long_flag.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_override_long_flag.step);

// Integration test 157f: -I override with -I= syntax
const test_override_short_eq = b.addSystemCommand(&.{
"bash", "-c",
\\tmp=/tmp/sqlpipe_test_short.json
\\printf 'name,age\nAlice,30\nBob,25' > "$tmp"
\\result=$(./zig-out/bin/sql-pipe -I=csv "$tmp" 'SELECT name FROM sqlpipe_test_short ORDER BY name')
\\rm -f "$tmp"
\\expected=$(printf 'Alice\nBob')
\\[ "$result" = "$expected" ]
});
test_override_short_eq.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_override_short_eq.step);

// Integration test 157g: Ambiguous .txt extension with -I tsv override
const test_ambiguous_txt_tsv = b.addSystemCommand(&.{
"bash", "-c",
\\tmp=/tmp/sqlpipe_test_tsv.txt
\\printf 'name\tage\nAlice\t30\nBob\t25' > "$tmp"
\\result=$(./zig-out/bin/sql-pipe -I tsv "$tmp" 'SELECT name FROM sqlpipe_test_tsv ORDER BY name')
\\rm -f "$tmp"
\\expected=$(printf 'Alice\nBob')
\\[ "$result" = "$expected" ]
});
test_ambiguous_txt_tsv.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_ambiguous_txt_tsv.step);

// Integration test 157h: Auto-detect .xml extension without -I flag
const test_autodetect_xml = b.addSystemCommand(&.{
"bash", "-c",
\\result=$(./zig-out/bin/sql-pipe tests/fixtures/feed.xml --xml-root channel --xml-row item 'SELECT COUNT(*) FROM feed')
\\[ "$result" = "3" ]
});
test_autodetect_xml.step.dependOn(b.getInstallStep());
test_step.dependOn(&test_autodetect_xml.step);

// ─── Fixture-based integration tests ─────────────────────────────────────
// These tests use sample files committed in tests/fixtures/ to exercise
// the binary end-to-end with realistic data across all supported formats.
Expand Down Expand Up @@ -1917,11 +2011,11 @@ pub fn build(b: *std.Build) void {
});
fixture_test_step.dependOn(&fixture_mixed_join.step);

// Fixture test 14: CSV file + NDJSON stdin mix
// Fixture test 14: CSV file + NDJSON file mix (auto-detected from extensions)
const fixture_csv_ndjson_mix = b.addSystemCommand(&.{
"bash", "-c",
\\result=$(cat tests/fixtures/events.ndjson | ./zig-out/bin/sql-pipe -I ndjson tests/fixtures/customers.csv \
\\ 'SELECT c.name, e.event FROM t e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event')
\\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson tests/fixtures/customers.csv \
\\ 'SELECT c.name, e.event FROM events e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event')
\\expected=$(printf 'Alice,login\nAlice,logout\nAlice,purchase\nBob,purchase\nCarol,login')
\\[ "$result" = "$expected" ]
});
Expand Down
24 changes: 22 additions & 2 deletions docs/sql-pipe.1.scd
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ DESCRIPTION
Stdin and file arguments can be combined — stdin is always table *t*.

Input format for files is auto-detected from the file extension (*.csv*,
*.tsv*, *.json*, *.ndjson*, *.xml*). Files without a recognized extension
fall back to the *-I* flag value (default: CSV).
*.tsv*, *.json*, *.ndjson*, *.xml*). Unrecognized or missing extensions
default to CSV. Use *-I* to override auto-detection (e.g., when a
TSV file has a *.txt* extension).

This tool is useful for quick data transformations, filtering, grouping,
aggregations, and multi-file joins without manual SQL database setup.
Expand Down Expand Up @@ -52,6 +53,17 @@ OPTIONS
*--tsv*
Alias for *--delimiter '\\t'*. Parses tab-separated input.

*-I, --input-format* <fmt>
Set the input format explicitly: *csv* (default), *tsv*, *json*,
*ndjson*, or *xml*. When set, overrides file extension auto-detection
for all file arguments. Stdin always uses this value (no filename
to inspect). Useful when a file has an ambiguous extension (*.txt*,
*.dat*) or no extension at all.

*-O, --output-format* <fmt>
Set the output format: *csv* (default), *tsv*, *json*, *ndjson*,
or *xml*.

*--no-type-inference*
Treat all columns as TEXT. Skips automatic type detection and uses plain
TEXT affinity for all columns in the SQLite table. This can improve
Expand Down Expand Up @@ -162,6 +174,14 @@ EXAMPLES
'SELECT c.name, SUM(o.amount) FROM orders o ++
JOIN customers c ON o.cust_id = c.id GROUP BY c.name'

Query a JSON file (format auto-detected from extension):

$ sql-pipe data.json 'SELECT * FROM data WHERE score > 80'

Override auto-detection when the extension is wrong:

$ sql-pipe -I tsv data.txt 'SELECT * FROM data'

Mix stdin (as table t) with a file argument:

$ cat events.csv | sql-pipe users.csv ++
Expand Down
30 changes: 22 additions & 8 deletions src/args.zig
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ pub fn printUsage(writer: *std.Io.Writer) !void {
\\ -d, --delimiter <string> Input field delimiter for CSV: 1–8 chars (default: ,)
\\ --tsv Alias for --delimiter '\t'
\\ -I, --input-format <fmt> Input format: csv (default), tsv, json, ndjson, xml
\\ Overrides file extension auto-detection; stdin always uses this value
\\ -O, --output-format <fmt> Output format: csv (default), tsv, json, ndjson, xml
\\ --json Alias for --output-format json
\\ --no-type-inference Treat all columns as TEXT (CSV input only)
Expand Down Expand Up @@ -237,6 +238,8 @@ pub fn printUsage(writer: *std.Io.Writer) !void {
\\ cat data.psv | sql-pipe -d '|' 'SELECT * FROM t'
\\ cat data.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region'
\\ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100'
\\ sql-pipe data.json 'SELECT * FROM data WHERE score > 80'
\\ sql-pipe -I tsv data.txt 'SELECT * FROM data'
\\ sql-pipe orders.csv customers.csv 'SELECT c.name, SUM(o.amount) FROM orders o JOIN customers c ON o.cust_id = c.id GROUP BY c.name'
\\ cat events.csv | sql-pipe users.csv 'SELECT * FROM t JOIN users ON t.uid = users.id'
\\ cat data.csv | sql-pipe --output-format json 'SELECT * FROM t'
Expand Down Expand Up @@ -276,6 +279,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
var delimiter: []const u8 = ",";
var header = false;
var input_format: InputFormat = .csv;
var input_format_explicit = false;
var output_format: OutputFormat = .csv;

var max_rows: ?usize = null;
Expand Down Expand Up @@ -341,10 +345,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
i += 1;
if (i >= args.len) return error.InvalidInputFormat;
input_format = InputFormat.parse(args[i]) catch return error.InvalidInputFormat;
input_format_explicit = true;
} else if (std.mem.startsWith(u8, arg, "--input-format=")) {
input_format = InputFormat.parse(arg["--input-format=".len..]) catch return error.InvalidInputFormat;
input_format_explicit = true;
} else if (std.mem.startsWith(u8, arg, "-I=")) {
input_format = InputFormat.parse(arg["-I=".len..]) catch return error.InvalidInputFormat;
input_format_explicit = true;
} else if (std.mem.eql(u8, arg, "-O") or std.mem.eql(u8, arg, "--output-format")) {
i += 1;
if (i >= args.len) return error.InvalidOutputFormat;
Expand Down Expand Up @@ -445,7 +452,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
// Special modes: every positional arg is a file input
for (pos) |p| {
const name = try tableNameFromPath(allocator, p);
const fmt = InputFormat.fromExtension(p) orelse input_format;
const fmt = if (input_format_explicit) input_format else (InputFormat.fromExtension(p) orelse input_format);
try files.append(allocator, .{
.path = p,
.table_name = name,
Expand All @@ -457,7 +464,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
query = pos[pos.len - 1];
for (pos[0 .. pos.len - 1]) |p| {
const name = try tableNameFromPath(allocator, p);
const fmt = InputFormat.fromExtension(p) orelse input_format;
const fmt = if (input_format_explicit) input_format else (InputFormat.fromExtension(p) orelse input_format);
try files.append(allocator, .{
.path = p,
.table_name = name,
Expand All @@ -467,6 +474,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
}
}

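// Effective input format: the explicit -I value when one was given;
// otherwise the format resolved for the FIRST file argument (its extension,
// falling back to CSV), or the default (CSV) when there are no file arguments.
// NOTE(review): with several files of different formats only the first file's
// format is reflected here — confirm that is intended for the checks below.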
const effective_input_format: InputFormat = if (files.items.len > 0)
(if (input_format_explicit) input_format else files.items[0].format)
else
input_format;

// Check for duplicate table names (would cause conflicting table definitions)
{
var seen = std.StringHashMap(void).init(allocator);
Expand Down Expand Up @@ -526,13 +540,13 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
return error.SilentVerboseConflict;

// --xml-root and --xml-row must be valid XML element names (only validated in XML mode)
if (input_format == .xml or output_format == .xml) {
if (effective_input_format == .xml or output_format == .xml) {
if (!isValidXmlName(xml_root) or !isValidXmlName(xml_row))
return error.InvalidXmlName;
}

// --json-path requires -I json (the flag only applies to JSON object navigation)
if (json_path != null and input_format != .json)
// --json-path requires JSON input (the flag only applies to JSON object navigation)
if (json_path != null and effective_input_format != .json)
return error.JsonPathRequiresJson;

// --table requires CSV or TSV output format (table formatting is visual only)
Expand All @@ -545,7 +559,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
.files = files.items,
.delimiter = delimiter,
.verbose = verbose,
.input_format = input_format,
.input_format = effective_input_format,
.xml_root_input = xml_root_input,
.xml_row_input = xml_row_input,
.json_path = json_path,
Expand All @@ -557,7 +571,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
.files = files.items,
.delimiter = delimiter,
.type_inference = type_inference,
.input_format = input_format,
.input_format = effective_input_format,
.xml_root_input = xml_root_input,
.xml_row_input = xml_row_input,
.json_path = json_path,
Expand All @@ -568,7 +582,7 @@ pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlP
return .{ .sample = SampleArgs{
.files = files.items,
.delimiter = delimiter,
.input_format = input_format,
.input_format = effective_input_format,
.n = sample_n,
.type_inference = type_inference,
} };
Expand Down
Loading