diff --git a/.gitignore b/.gitignore index 88be062..d97da61 100644 --- a/.gitignore +++ b/.gitignore @@ -67,6 +67,9 @@ examples/herc2/herc2_*.tsv examples/herc2/classify_herc2_*.py rust/target/ bioscripts/output/ +test-data/ +test-reports/ +local.sh examples/**/work/* examples/**/.nextflow* diff --git a/bs b/bs index cb1beac..0b90bab 100755 --- a/bs +++ b/bs @@ -4,4 +4,4 @@ if command -v xcrun >/dev/null 2>&1; then export SDKROOT="${SDKROOT:-$(xcrun --show-sdk-path)}" export BINDGEN_EXTRA_CLANG_ARGS="${BINDGEN_EXTRA_CLANG_ARGS:--isysroot ${SDKROOT}}" fi -cargo run --manifest-path "$SCRIPT_DIR/rust/Cargo.toml" -p bioscript -- "$@" +cargo run --manifest-path "$SCRIPT_DIR/rust/Cargo.toml" -p bioscript-cli --bin bioscript -- "$@" diff --git a/docs/file-formats.md b/docs/file-formats.md new file mode 100644 index 0000000..f284fa5 --- /dev/null +++ b/docs/file-formats.md @@ -0,0 +1,410 @@ +# File Formats + +This document defines the file-format inspection layer for Bioscript: how we detect supported genomics files, what metadata we infer from them, which signals are authoritative vs heuristic, and which fixtures we use to verify that behavior. + +The current code already contains pieces of this logic in [`rust/bioscript-formats/src/genotype.rs`](/Users/madhavajay/dev/biovault-app/main/bioscript/rust/bioscript-formats/src/genotype.rs:124) and [`rust/bioscript-formats/src/prepare.rs`](/Users/madhavajay/dev/biovault-app/main/bioscript/rust/bioscript-formats/src/prepare.rs:34). The goal is to make that logic explicit and reusable through a dedicated inspection entrypoint rather than keeping detection embedded inside each loader. + +## Goals + +- Detect the broad file family from path, container, and sampled contents. +- Support SNP-array genotype exports, VCF/VCF.GZ, CRAM/BAM, and reference FASTA files. +- Return structured metadata that other parts of Bioscript can reuse. +- Separate strong evidence from best-effort guesses. 
+- Keep detection cheap enough to run before import, in the CLI, in tests, and eventually in UI flows.
+
+## Public API
+
+The inspection layer now lives in [`rust/bioscript-formats/src/inspect.rs`](/Users/madhavajay/dev/biovault-app/main/bioscript/rust/bioscript-formats/src/inspect.rs:1) and currently exposes one main entrypoint:
+
+```rust
+pub fn inspect_file(path: &Path, options: &InspectOptions) -> Result<FileInspection, RuntimeError>
+```
+
+Current result shape:
+
+```rust
+pub struct FileInspection {
+    pub path: PathBuf,
+    pub container: FileContainer,
+    pub detected_kind: DetectedKind,
+    pub confidence: DetectionConfidence,
+    pub source: Option<SourceMetadata>,
+    pub assembly: Option<Assembly>,
+    pub phased: Option<bool>,
+    pub selected_entry: Option<String>,
+    pub has_index: Option<bool>,
+    pub index_path: Option<PathBuf>,
+    pub reference_matches: Option<bool>,
+    pub evidence: Vec<String>,
+    pub warnings: Vec<String>,
+    pub duration_ms: u128,
+}
+```
+
+Current source/platform shape:
+
+```rust
+pub struct SourceMetadata {
+    pub vendor: Option<String>,
+    pub platform_version: Option<String>,
+    pub confidence: DetectionConfidence,
+    pub evidence: Vec<String>,
+}
+```
+
+Use `platform_version` as a string, not an integer. These labels are vendor-specific and are not guaranteed to form a simple numeric ordering.
+
+The `InspectOptions` surface is intentionally small in the first pass:
+
+```rust
+pub struct InspectOptions {
+    pub input_index: Option<PathBuf>,
+    pub reference_file: Option<PathBuf>,
+    pub reference_index: Option<PathBuf>,
+}
+```
+
+The CLI exposes the same functionality through:
+
+```text
+bs inspect <path> [--input-index <path>] [--reference-file <path>] [--reference-index <path>]
+```
+
+The current implementation prints a stable tab-separated report from `FileInspection::render_text()`, which is suitable for shell use and for downstream callers that just need a lightweight CLI probe.
+
+Important rule: prefer `None` over a false claim.
+
+Examples:
+- `assembly: None` means "not enough evidence", not "not GRCh38".
+- `phased: None` means "not applicable or not checked", not "unphased".
+- `reference_matches: None` means "not validated", not "validated successfully". + +## Supported Families + +### Genotype text exports + +These are consumer SNP-array exports such as: +- 23andMe +- AncestryDNA +- deCODEme +- Dynamic DNA / GSAv3 +- FamilyTreeDNA +- Genes for Good +- Living DNA +- MyHeritage + +Observed format families: +- Standard 4-column layout: `rsid chromosome position genotype` +- Split-allele layout: `rsid chromosome position allele1 allele2` +- Extended metric layout: genotype columns plus metrics such as `gs`, `baf`, `lrr` + +The detector should not require fixed column numbers. It should scan candidate columns for: +- SNP id: `rs...` or `i...` +- chromosome +- integer position +- genotype or split alleles + +This is the main robustness improvement needed over the current fixed/default fallback. + +### VCF + +Supported: +- `.vcf` +- `.vcf.gz` +- `.zip` containing `.vcf` or `.vcf.gz` + +Metadata we want: +- sample count +- phased vs unphased +- contig naming style (`chr1` vs `1`) +- assembly if clearly encoded in header or reference metadata + +### Raw alignments + +Supported: +- `.cram` +- `.bam` + +Metadata we want: +- alignment subtype (`cram` / `bam`) +- index presence and location (`.crai` / `.bai`) +- whether a supplied reference is required +- whether a supplied reference can be used successfully +- whether the reference appears to mismatch the CRAM's encoding reference + +### Reference FASTA + +Supported: +- `.fa` +- `.fasta` +- compressed support can be added later if we need it + +Metadata we want: +- FASTA subtype +- `.fai` presence +- contig naming style +- assembly hint when file/path clearly encode it + +## Detection Pipeline + +Detection should run in layers from cheap to expensive. + +### 1. 
Path-based hints + +Use filename and extension to form an initial guess: +- `.zip` => archive container +- `.vcf`, `.vcf.gz` => VCF candidate +- `.cram`, `.bam` => alignment candidate +- `.fa`, `.fasta` => reference candidate +- filename or header tokens like `GRCh38`, `hg19`, `GSAv3`, `v5`, `23andMe`, and similar assembly/source labels are hints, not proof + +### 2. Container inspection + +If the file is a zip: +- list entries +- ignore directories and `__MACOSX/` +- prefer entries ending in `.vcf`, `.vcf.gz`, `.txt`, `.tsv`, `.csv` +- keep the selected entry name in the inspection result + +This is already partially implemented in `select_zip_entry(...)` in [`genotype.rs`](/Users/madhavajay/dev/biovault-app/main/bioscript/rust/bioscript-formats/src/genotype.rs:1189). + +### 3. Content sniffing + +Sample a bounded number of lines and extract signals. + +For genotype text files: +- delimiter detection: tab, comma, whitespace +- header alias mapping +- comment preamble scanning +- flexible row validation: rsid/id, chromosome, position, genotype +- classify as genotype only if a strong fraction of sampled rows look valid + +For VCF: +- `##fileformat=VCF` +- `#CHROM` +- parse `GT` fields just far enough to determine sample count and phasing +- scan header metadata for source/vendor fingerprints when present, e.g. `sequencing.com` + +For alignments: +- subtype comes from extension +- index presence comes from adjacent path checks +- reference compatibility requires an explicit validation step + +### 4. Metadata inference + +Once the file family is known, infer metadata with explicit precedence. + +Recommended precedence: + +1. File-internal authoritative metadata +2. File-internal human-readable header text +3. Container entry name +4. Outer filename +5. 
Source-specific fallback rules + +## Heuristics By Family + +### Genotype text + +Signals to infer: +- delimiter +- column mapping +- vendor/source +- platform version +- assembly + +Source/vendor detection should look at header text and column shapes. Useful fingerprints already observed in local datasets: +- 23andMe: header mentions `23andMe` +- Dynamic DNA: header mentions `Dynamic DNA`, `DDNA`, or `dynamicdnalabs` +- Extended GSAv3-style files expose `gs`, `baf`, `lrr` + +Platform-version detection should stay conservative: +- filename tokens like `v2`, `v3`, `v4`, `v5`, `GSAv3`, `GSAv3-DTC` +- vendor-specific export names +- presence or absence of a vendor-specific marker panel, using a curated set of informative rsids as evidence +- maybe later: row-count ranges as a weak signal, but not as primary evidence + +For 23andMe specifically, chip version should not be inferred from filename alone. A better approach is: +- use filename/header/export-name hints as weak evidence +- compare the file's marker set against curated version-specific rsid panels +- return the `platform_version` string with supporting evidence rather than a bare guess + +For other vendors, use the same model: +- `vendor = "23andMe"`, `platform_version = Some("v5")` +- `vendor = "Dynamic DNA"`, `platform_version = Some("GSAv3-DTC")` +- `vendor = "AncestryDNA"`, `platform_version = None` + +Assembly detection should use header text first: +- `build 36`, `GRCh36`, `hg18` +- `build 37`, `GRCh37`, `hg19` +- `build 38`, `GRCh38`, `hg38` + +Known examples in the shared data: +- 23andMe v2/v3 exports declare build 36 +- 23andMe v4/v5 exports declare build 37 +- Dynamic DNA GSAv3 export declares build 38 + +### VCF + +Signals to infer: +- compressed or plain +- sample count +- phased vs unphased +- contig naming style +- assembly, when the header contains enough information + +Phasing rule: +- any `GT` using `|` => `Some(true)` +- all parsed `GT` use `/` => `Some(false)` +- no usable `GT` seen => 
`None` + +### CRAM / BAM + +Signals to infer: +- subtype +- adjacent index presence +- reference requirement +- reference compatibility status + +Reference compatibility states should be explicit, for example: +- `NotChecked` +- `MissingReference` +- `Compatible` +- `Md5MismatchButReadable` +- `Incompatible` + +For CRAM this should be based on an actual open/decode attempt against the supplied reference, not just on filename matching. The existing alignment path already has a real MD5-mismatch fallback behavior documented in [`docs/architecture.md`](/Users/madhavajay/dev/biovault-app/main/bioscript/docs/architecture.md:54). + +### Reference FASTA + +Signals to infer: +- subtype +- adjacent `.fai` +- contig naming style +- assembly hint from filename/path + +## Confidence Model + +The inspector should carry confidence so callers can decide whether to trust or surface a result. + +Suggested levels: +- `Authoritative` +- `StrongHeuristic` +- `WeakHeuristic` +- `Unknown` + +Examples: +- VCF identified from `##fileformat=VCF` => `Authoritative` +- 23andMe detected from header text => `StrongHeuristic` +- 23andMe `platform_version` guessed only from a filename token like `v5` => `WeakHeuristic` +- 23andMe `platform_version` supported by a version-specific rsid panel match => `StrongHeuristic` + +## Fixture Matrix + +The test suite should cover both synthetic and real-world fixtures. 
+ +### Fixtures already in this repo + +`rust/bioscript-formats/tests/fixtures` +- `mini.cram`, `mini.cram.crai` +- `mini.fa`, `mini.fa.fai` +- `mini_bad_ref.fa`, `mini_bad_ref.fa.fai` + +Use these for: +- CRAM subtype detection +- CRAI/FAI detection +- reference compatibility and MD5-mismatch behavior + +`old/examples/apol1/genotype_files` +- Dynamic DNA GSAv3-style plain-text files +- filenames include `GSAv3` and `GRCh38` +- header includes build 38 wording + +Use these for: +- extended genotype layout +- build 38 inference +- chip/version filename heuristics + +### Shared test data + +Repo-local shared dataset now lives under [`test-data/`](/Users/madhavajay/dev/biovault-app/main/bioscript/test-data:1), populated by [`tools/fetch_test_data.sh`](/Users/madhavajay/dev/biovault-app/main/bioscript/tools/fetch_test_data.sh:1) as symlinks into the shared cache at `~/.bioscript/cache/test-data/`: + +- `23andme/v2/.../23data20100526.txt.zip` +- `23andme/v3/.../huE4DAE4_20120522224129.txt.zip` +- `23andme/v4/.../genome__v4_Full_2016.txt.zip` +- `23andme/v5/.../genome_hu50B3F5_v5_Full.zip` +- `dynamicdna/.../100001_X_X_GSAv3-DTC_GRCh38-07-12-2025.txt.zip` +- `1k-genomes/ref/...fa`, `.fai` +- `1k-genomes/aligned/...cram`, `.crai` + +These are the right fixtures for real-world detection coverage: +- 23andMe vendor + version + build heuristics +- Dynamic DNA vendor + GSAv3 + GRCh38 heuristics +- large real CRAM/reference/index presence and compatibility checks + +The fetch tool and manifest now live in this repo: +- [`tools/fetch_test_data.sh`](/Users/madhavajay/dev/biovault-app/main/bioscript/tools/fetch_test_data.sh:1) +- [`tools/sources.yaml`](/Users/madhavajay/dev/biovault-app/main/bioscript/tools/sources.yaml:1) + +That makes the same cache-backed layout available in local development, CI, and downstream repos. 
+ +## Tests To Add + +### Current coverage + +- generated `.vcf` and `.zip` containing VCF are covered in `bioscript-formats` +- real-world zipped 23andMe fixtures are readable through `GenotypeStore` +- real-world zipped Dynamic DNA fixture is readable through `GenotypeStore` +- `inspect_file(...)` metadata assertions now cover: + - AncestryDNA + - FamilyTreeDNA + - Genes for Good + - Dynamic DNA + - CRAM index detection + - phased VCF detection +- the `bioscript inspect` CLI has a smoke test in `bioscript-cli` +- existing mini CRAM fixtures continue to validate reference/index behavior + +### Next + +- version-specific rsid-panel heuristics for 23andMe +- source detection tests for more vendors from curated real files +- build inference tests for 36 / 37 / 38 across more sources +- missing-index and mismatched-reference tests for alignments +- JSON output mode for `bs inspect` + +### Later + +- AncestryDNA +- FamilyTreeDNA +- Living DNA +- MyHeritage +- deCODEme +- ambiguous or malformed files +- BAM fixtures + +## Shared Test-Data Layout + +To avoid duplicate downloads across repos, the shared fetch tool should populate a cache under: + +```text +~/.bioscript/cache/test-data/ +``` + +Each consuming repo can then expose a local `test-data/` tree as symlinks into that cache by running its local `tools/fetch_test_data.sh`. + +This gives us: +- one physical copy of large CRAM/FASTA fixtures +- stable local paths for tests +- the option to reuse the same data across Bioscript, ExVitae, and Biovault + +## Non-Goals + +The inspection layer should not: +- fully parse or normalize every file up front +- silently auto-fix mismatches +- claim certainty where only weak hints exist + +The file inspection API should not mutate inspected input files, but the fetch tooling is explicitly allowed to materialize a local `test-data/` tree as symlinks into the shared cache. + +Its job is to describe the file accurately enough that the caller can decide what to do next. 
diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index a0d63a9..3db0063 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -8,7 +8,8 @@ use std::{ }; use bioscript_formats::{ - GenotypeLoadOptions, GenotypeSourceFormat, PrepareRequest, prepare_indexes, shell_flags, + GenotypeLoadOptions, GenotypeSourceFormat, InspectOptions, PrepareRequest, inspect_file, + prepare_indexes, shell_flags, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; use bioscript_schema::validate_variants_path; @@ -34,6 +35,9 @@ fn run_cli() -> Result<(), String> { if first == "prepare" { return run_prepare(args.collect()); } + if first == "inspect" { + return run_inspect(args.collect()); + } } let mut args = env::args().skip(1); @@ -160,7 +164,7 @@ fn run_cli() -> Result<(), String> { let Some(script_path) = script_path else { return Err( - "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]" + "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" .to_owned(), ); }; @@ -329,6 +333,49 @@ fn run_prepare(args: Vec) -> Result<(), String> { Ok(()) } +fn run_inspect(args: Vec) -> 
Result<(), String> { + let mut path: Option = None; + let mut options = InspectOptions::default(); + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + match arg.as_str() { + "--input-index" => { + options.input_index = Some(PathBuf::from( + iter.next().ok_or("--input-index requires a path")?, + )); + } + "--reference-file" => { + options.reference_file = Some(PathBuf::from( + iter.next().ok_or("--reference-file requires a path")?, + )); + } + "--reference-index" => { + options.reference_index = Some(PathBuf::from( + iter.next().ok_or("--reference-index requires a path")?, + )); + } + other if path.is_none() => { + path = Some(PathBuf::from(other)); + } + other => { + return Err(format!("unexpected argument: {other}")); + } + } + } + + let Some(path) = path else { + return Err( + "usage: bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + .to_owned(), + ); + }; + + let inspection = inspect_file(&path, &options).map_err(|err| err.to_string())?; + println!("{}", inspection.render_text()); + Ok(()) +} + fn run_validate_variants(args: Vec) -> Result<(), String> { let mut path: Option = None; let mut report_path: Option = None; diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index 36a617f..84758f8 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -101,3 +101,28 @@ fn batch_lookup_query_plan_runs_and_preserves_requested_result_order() { assert!(stdout.contains("TC")); assert!(stdout.contains("II")); } + +#[test] +fn inspect_subcommand_reports_detected_vendor_and_platform() { + let root = repo_root(); + let path = root.join("rust/bioscript-formats/tests/fixtures/ancestrydna_v2_sample.txt"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("inspect") + .arg(path) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = 
String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("kind\tgenotype_text")); + assert!(stdout.contains("vendor\tAncestryDNA")); + assert!(stdout.contains("platform_version\tV2.0")); + assert!(stdout.contains("assembly\tgrch37")); + assert!(stdout.contains("duration_ms\t")); +} diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs new file mode 100644 index 0000000..cf0b3cd --- /dev/null +++ b/rust/bioscript-formats/src/inspect.rs @@ -0,0 +1,727 @@ +use std::{ + fs::File, + io::{BufRead, BufReader, Cursor, Read}, + path::{Path, PathBuf}, + time::Instant, +}; + +use bioscript_core::{Assembly, RuntimeError}; +use noodles::bgzf; +use zip::ZipArchive; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FileContainer { + Plain, + Zip, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DetectedKind { + GenotypeText, + Vcf, + AlignmentCram, + AlignmentBam, + ReferenceFasta, + Unknown, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DetectionConfidence { + Authoritative, + StrongHeuristic, + WeakHeuristic, + Unknown, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SourceMetadata { + pub vendor: Option, + pub platform_version: Option, + pub confidence: DetectionConfidence, + pub evidence: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct InspectOptions { + pub input_index: Option, + pub reference_file: Option, + pub reference_index: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FileInspection { + pub path: PathBuf, + pub container: FileContainer, + pub detected_kind: DetectedKind, + pub confidence: DetectionConfidence, + pub source: Option, + pub assembly: Option, + pub phased: Option, + pub selected_entry: Option, + pub has_index: Option, + pub index_path: Option, + pub reference_matches: Option, + pub evidence: Vec, + pub warnings: Vec, + pub duration_ms: u128, +} + +impl FileInspection { + #[must_use] + pub fn render_text(&self) -> String { + 
let mut lines = Vec::new(); + lines.push(format!("path\t{}", self.path.display())); + lines.push(format!("container\t{}", render_container(self.container))); + lines.push(format!("kind\t{}", render_kind(self.detected_kind))); + lines.push(format!( + "confidence\t{}", + render_confidence(self.confidence) + )); + lines.push(format!("assembly\t{}", render_assembly(self.assembly))); + lines.push(format!("phased\t{}", render_bool(self.phased))); + lines.push(format!( + "selected_entry\t{}", + self.selected_entry.as_deref().unwrap_or("") + )); + lines.push(format!("has_index\t{}", render_bool(self.has_index))); + lines.push(format!( + "index_path\t{}", + self.index_path + .as_ref() + .map(|path| path.display().to_string()) + .unwrap_or_default() + )); + lines.push(format!( + "reference_matches\t{}", + render_bool(self.reference_matches) + )); + if let Some(source) = &self.source { + lines.push(format!( + "vendor\t{}", + source.vendor.as_deref().unwrap_or_default() + )); + lines.push(format!( + "platform_version\t{}", + source.platform_version.as_deref().unwrap_or_default() + )); + lines.push(format!( + "source_confidence\t{}", + render_confidence(source.confidence) + )); + lines.push(format!("source_evidence\t{}", source.evidence.join(" | "))); + } else { + lines.push("vendor\t".to_owned()); + lines.push("platform_version\t".to_owned()); + lines.push("source_confidence\t".to_owned()); + lines.push("source_evidence\t".to_owned()); + } + lines.push(format!("evidence\t{}", self.evidence.join(" | "))); + lines.push(format!("warnings\t{}", self.warnings.join(" | "))); + lines.push(format!("duration_ms\t{}", self.duration_ms)); + lines.join("\n") + } +} + +pub fn inspect_file(path: &Path, options: &InspectOptions) -> Result { + let started = Instant::now(); + let lower = path.to_string_lossy().to_ascii_lowercase(); + let mut evidence = Vec::new(); + let mut warnings = Vec::new(); + + if lower.ends_with(".zip") { + let selected_entry = select_zip_entry(path)?; + let 
sample_lines = read_zip_sample_lines(path, &selected_entry)?; + let mut inspection = inspect_from_textual_sample( + path, + FileContainer::Zip, + &selected_entry, + &sample_lines, + options, + ); + inspection.duration_ms = started.elapsed().as_millis(); + return Ok(inspection); + } + + let detected_kind = if lower.ends_with(".cram") { + evidence.push("extension .cram".to_owned()); + DetectedKind::AlignmentCram + } else if lower.ends_with(".bam") { + evidence.push("extension .bam".to_owned()); + DetectedKind::AlignmentBam + } else if is_reference_path(path) { + evidence.push("reference fasta extension".to_owned()); + DetectedKind::ReferenceFasta + } else { + let sample_lines = read_plain_sample_lines(path)?; + let sample_lower = sample_lines.join("\n").to_ascii_lowercase(); + if looks_like_vcf_lines(&sample_lines) { + evidence.push("vcf header markers".to_owned()); + DetectedKind::Vcf + } else if looks_like_genotype_text(&sample_lines) { + if sample_lower.contains("rsid") || sample_lower.contains("allele1") { + evidence.push("genotype-like sampled rows and headers".to_owned()); + } else { + evidence.push("genotype-like sampled rows".to_owned()); + } + DetectedKind::GenotypeText + } else { + warnings.push("file did not match known textual heuristics".to_owned()); + DetectedKind::Unknown + } + }; + + let sample_lines = match detected_kind { + DetectedKind::AlignmentCram | DetectedKind::AlignmentBam | DetectedKind::ReferenceFasta => { + Vec::new() + } + _ => read_plain_sample_lines(path)?, + }; + let source = detect_source(&lower, &sample_lines, detected_kind); + let assembly = detect_assembly(&lower, &sample_lines); + let phased = (detected_kind == DetectedKind::Vcf) + .then(|| detect_vcf_phasing(&sample_lines)) + .flatten(); + let (has_index, index_path) = detect_index(path, detected_kind, options); + let confidence = classify_confidence(detected_kind, &sample_lines, source.as_ref()); + + Ok(FileInspection { + path: path.to_path_buf(), + container: 
FileContainer::Plain, + detected_kind, + confidence, + source, + assembly, + phased, + selected_entry: None, + has_index, + index_path, + reference_matches: None, + evidence, + warnings, + duration_ms: started.elapsed().as_millis(), + }) +} + +fn inspect_from_textual_sample( + path: &Path, + container: FileContainer, + selected_entry: &str, + sample_lines: &[String], + options: &InspectOptions, +) -> FileInspection { + let lower = selected_entry.to_ascii_lowercase(); + let detected_kind = if lower.ends_with(".vcf") + || lower.ends_with(".vcf.gz") + || looks_like_vcf_lines(sample_lines) + { + DetectedKind::Vcf + } else if looks_like_genotype_text(sample_lines) { + DetectedKind::GenotypeText + } else { + DetectedKind::Unknown + }; + let path_lower = path.to_string_lossy().to_ascii_lowercase(); + let combined_name = format!("{path_lower}\n{lower}"); + let source = detect_source(&combined_name, sample_lines, detected_kind); + let assembly = detect_assembly(&combined_name, sample_lines); + let phased = (detected_kind == DetectedKind::Vcf) + .then(|| detect_vcf_phasing(sample_lines)) + .flatten(); + let confidence = classify_confidence(detected_kind, sample_lines, source.as_ref()); + let mut evidence = vec![format!("selected zip entry {selected_entry}")]; + if detected_kind == DetectedKind::Vcf { + evidence.push("vcf entry markers".to_owned()); + } else if detected_kind == DetectedKind::GenotypeText { + evidence.push("genotype-like sampled rows".to_owned()); + } + let (has_index, index_path) = detect_index(path, detected_kind, options); + + FileInspection { + path: path.to_path_buf(), + container, + detected_kind, + confidence, + source, + assembly, + phased, + selected_entry: Some(selected_entry.to_owned()), + has_index, + index_path, + reference_matches: None, + evidence, + warnings: Vec::new(), + duration_ms: 0, + } +} + +fn read_plain_sample_lines(path: &Path) -> Result, RuntimeError> { + let lower = path.to_string_lossy().to_ascii_lowercase(); + let file = 
File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open {}: {err}", path.display())))?; + if lower.ends_with(".vcf.gz") { + return read_sample_lines_from_reader(BufReader::new(bgzf::io::Reader::new(file))); + } + read_sample_lines_from_reader(BufReader::new(file)) +} + +fn read_zip_sample_lines(path: &Path, selected_entry: &str) -> Result, RuntimeError> { + let file = File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open zip {}: {err}", path.display())))?; + let mut archive = ZipArchive::new(file) + .map_err(|err| RuntimeError::Io(format!("failed to read zip {}: {err}", path.display())))?; + let mut entry = archive.by_name(selected_entry).map_err(|err| { + RuntimeError::Io(format!( + "failed to open zip entry {selected_entry} in {}: {err}", + path.display() + )) + })?; + + if selected_entry.to_ascii_lowercase().ends_with(".vcf.gz") { + let mut bytes = Vec::new(); + entry.read_to_end(&mut bytes).map_err(|err| { + RuntimeError::Io(format!( + "failed to read compressed zip entry {selected_entry} in {}: {err}", + path.display() + )) + })?; + let reader = bgzf::io::Reader::new(Cursor::new(bytes)); + return read_sample_lines_from_reader(BufReader::new(reader)); + } + + read_sample_lines_from_reader(BufReader::new(entry)) +} + +fn read_sample_lines_from_reader(mut reader: R) -> Result, RuntimeError> { + let mut out = Vec::new(); + let mut buf = String::new(); + for _ in 0..64 { + buf.clear(); + let bytes = reader + .read_line(&mut buf) + .map_err(|err| RuntimeError::Io(format!("failed to read sample lines: {err}")))?; + if bytes == 0 { + break; + } + out.push(buf.trim_end_matches(['\n', '\r']).to_owned()); + } + Ok(out) +} + +fn select_zip_entry(path: &Path) -> Result { + let file = File::open(path) + .map_err(|err| RuntimeError::Io(format!("failed to open zip {}: {err}", path.display())))?; + let mut archive = ZipArchive::new(file) + .map_err(|err| RuntimeError::Io(format!("failed to read zip {}: {err}", path.display())))?; + let 
mut fallback = None; + for idx in 0..archive.len() { + let entry = archive.by_index(idx).map_err(|err| { + RuntimeError::Io(format!("failed to inspect zip {}: {err}", path.display())) + })?; + if entry.is_dir() { + continue; + } + let name = entry.name().to_owned(); + if name.starts_with("__MACOSX/") { + continue; + } + let lower = name.to_ascii_lowercase(); + if lower.ends_with(".vcf") + || lower.ends_with(".vcf.gz") + || lower.ends_with(".txt") + || lower.ends_with(".tsv") + || lower.ends_with(".csv") + { + return Ok(name); + } + if fallback.is_none() { + fallback = Some(name); + } + } + fallback.ok_or_else(|| { + RuntimeError::Unsupported(format!( + "zip archive {} does not contain a supported file", + path.display() + )) + }) +} + +fn looks_like_vcf_lines(lines: &[String]) -> bool { + lines.iter().any(|line| { + let trimmed = line.trim_start(); + trimmed.starts_with("##fileformat=VCF") || trimmed.starts_with("#CHROM\t") + }) +} + +fn looks_like_genotype_text(lines: &[String]) -> bool { + let mut checked = 0usize; + let mut valid = 0usize; + for line in lines { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("//") { + continue; + } + let fields = split_fields(trimmed); + checked += 1; + if matches_genotype_shape(&fields) { + valid += 1; + } + } + checked > 0 && valid * 10 >= checked * 7 +} + +fn split_fields(line: &str) -> Vec { + if line.contains('\t') { + return line + .split('\t') + .map(|field| field.trim().to_owned()) + .collect(); + } + if line.contains(',') { + return line + .split(',') + .map(|field| field.trim().trim_matches('"').to_owned()) + .collect(); + } + line.split_whitespace().map(str::to_owned).collect() +} + +fn matches_genotype_shape(fields: &[String]) -> bool { + if fields.len() < 4 { + return false; + } + let rsid_like = fields[0].starts_with("rs") || fields[0].starts_with('i'); + if !rsid_like { + return false; + } + let chr_idx = fields.iter().position(|field| 
is_valid_chromosome(field)); + let Some(chr_idx) = chr_idx else { + return false; + }; + for pos_idx in (chr_idx + 1)..fields.len() { + if fields[pos_idx].parse::().is_err() { + continue; + } + for field in fields.iter().skip(pos_idx + 1) { + if is_valid_genotype(field) { + return true; + } + } + if pos_idx + 2 < fields.len() + && is_valid_allele(&fields[pos_idx + 1]) + && is_valid_allele(&fields[pos_idx + 2]) + { + return true; + } + } + false +} + +fn is_valid_chromosome(value: &str) -> bool { + let trimmed = value.trim().trim_start_matches("chr"); + if let Ok(n) = trimmed.parse::() { + return (1..=26).contains(&n); + } + matches!( + trimmed.to_ascii_uppercase().as_str(), + "X" | "Y" | "M" | "MT" | "XY" + ) +} + +fn is_valid_genotype(value: &str) -> bool { + let trimmed = value.trim().to_ascii_uppercase(); + if trimmed.is_empty() || trimmed.len() > 4 { + return false; + } + trimmed + .chars() + .all(|ch| matches!(ch, 'A' | 'C' | 'G' | 'T' | 'I' | 'D' | '-' | '0')) +} + +fn is_valid_allele(value: &str) -> bool { + let trimmed = value.trim().to_ascii_uppercase(); + matches!( + trimmed.as_str(), + "A" | "C" | "G" | "T" | "I" | "D" | "-" | "0" + ) +} + +fn detect_source( + lower_name: &str, + sample_lines: &[String], + kind: DetectedKind, +) -> Option { + let header = sample_lines + .iter() + .filter(|line| line.starts_with('#') || line.starts_with("//")) + .map(|line| line.to_ascii_lowercase()) + .collect::>() + .join("\n"); + let combined = format!("{lower_name}\n{header}"); + let normalized = combined.replace(['_', '-', '.'], " "); + let mut evidence = Vec::new(); + let mut vendor = None; + let mut platform_version = None; + let mut confidence = DetectionConfidence::Unknown; + + if normalized.contains("genes for good") || normalized.contains("geneforgood") { + vendor = Some("Genes for Good".to_owned()); + confidence = DetectionConfidence::StrongHeuristic; + evidence.push("Genes for Good header".to_owned()); + if let Some(version) = 
extract_token_after_marker(&header, "genes for good ") {
            platform_version = Some(version);
            evidence.push("Genes for Good version header".to_owned());
        }
    } else if normalized.contains("23andme") || normalized.contains("23&me") {
        vendor = Some("23andMe".to_owned());
        confidence = DetectionConfidence::StrongHeuristic;
        evidence.push("23andMe header/export name".to_owned());
        // Chip revision is only ever present as a "vN" token in the name or
        // header; check the normalized text and the raw path segments.
        if normalized.contains(" v2 ") || lower_name.contains("/v2/") {
            platform_version = Some("v2".to_owned());
            evidence.push("v2 token".to_owned());
        } else if normalized.contains(" v3 ") || lower_name.contains("/v3/") {
            platform_version = Some("v3".to_owned());
            evidence.push("v3 token".to_owned());
        } else if normalized.contains(" v4 ") || lower_name.contains("/v4/") {
            platform_version = Some("v4".to_owned());
            evidence.push("v4 token".to_owned());
        } else if normalized.contains(" v5 ") || lower_name.contains("/v5/") {
            platform_version = Some("v5".to_owned());
            evidence.push("v5 token".to_owned());
        }
    } else if normalized.contains("ancestrydna") || normalized.contains("ancestry com dna") {
        vendor = Some("AncestryDNA".to_owned());
        confidence = DetectionConfidence::StrongHeuristic;
        evidence.push("AncestryDNA header/export name".to_owned());
        if let Some(version) = extract_after_marker(&header, "array version:") {
            platform_version = Some(canonicalize_ancestry_version(&version));
            evidence.push("AncestryDNA array version header".to_owned());
        }
    } else if normalized.contains("family tree dna")
        || normalized.contains("familytreedna")
        || normalized.contains("ftdna")
    {
        vendor = Some("FamilyTreeDNA".to_owned());
        confidence = DetectionConfidence::StrongHeuristic;
        evidence.push("FamilyTreeDNA header/export name".to_owned());
    } else if normalized.contains("dynamic dna")
        || normalized.contains("dynamicdnalabs")
        || normalized.contains("ddna laboratories")
        || normalized.contains("ddna")
    {
        vendor = Some("Dynamic DNA".to_owned());
        confidence = DetectionConfidence::StrongHeuristic;
        evidence.push("Dynamic DNA header".to_owned());
        // Check the more specific token first so "gsav3 dtc" is not
        // shadowed by the plain "gsav3" match.
        if normalized.contains("gsav3 dtc") {
            platform_version = Some("GSAv3-DTC".to_owned());
            evidence.push("GSAv3-DTC token".to_owned());
        } else if normalized.contains("gsav3") {
            platform_version = Some("GSAv3".to_owned());
            evidence.push("GSAv3 token".to_owned());
        }
    } else if normalized.contains("myheritage") {
        vendor = Some("MyHeritage".to_owned());
        confidence = DetectionConfidence::StrongHeuristic;
        evidence.push("MyHeritage header/export name".to_owned());
    } else if normalized.contains("sequencing com") && kind == DetectedKind::Vcf {
        // Weak: name/header text only, and only meaningful for VCF inputs.
        vendor = Some("Sequencing.com".to_owned());
        confidence = DetectionConfidence::WeakHeuristic;
        evidence.push("sequencing.com header text".to_owned());
    }

    // No vendor signature at all -> no SourceMetadata (never a false claim).
    vendor.map(|vendor| SourceMetadata {
        vendor: Some(vendor),
        platform_version,
        confidence,
        evidence,
    })
}

/// Return the text that follows `marker` on the first line containing it
/// (case-insensitive match), trimmed and with a trailing '.' removed.
/// NOTE: the byte index from the lowercased copy is used to slice the
/// original line; this is safe because ASCII lowercasing preserves byte
/// positions, but `marker` must therefore be ASCII.
fn extract_after_marker(text: &str, marker: &str) -> Option<String> {
    text.lines().find_map(|line| {
        let trimmed = line.trim();
        let lower = trimmed.to_ascii_lowercase();
        lower.find(marker).map(|idx| {
            trimmed[idx + marker.len()..]
.trim()
                .trim_end_matches('.')
                .to_owned()
        })
    })
}

/// Like `extract_after_marker`, but keeps only the first whitespace-
/// separated token after the marker (dropping any trailing ':').
fn extract_token_after_marker(text: &str, marker: &str) -> Option<String> {
    extract_after_marker(text, marker).map(|value| {
        value
            .split_whitespace()
            .next()
            .unwrap_or_default()
            .trim_end_matches(':')
            .to_owned()
    })
}

/// Normalize AncestryDNA "array version" labels so "v2.0" and "V2.0"
/// render identically (uppercase the leading 'v').
fn canonicalize_ancestry_version(value: &str) -> String {
    let trimmed = value.trim();
    if let Some(rest) = trimmed.strip_prefix('v') {
        return format!("V{rest}");
    }
    trimmed.to_owned()
}

/// Infer the reference assembly from file-name and header tokens.
/// Returns `None` when there is no evidence either way — never a guess.
fn detect_assembly(lower_name: &str, sample_lines: &[String]) -> Option<Assembly> {
    let header = sample_lines.join("\n").to_ascii_lowercase();
    let combined = format!("{lower_name}\n{header}");
    if combined.contains("build 38") || combined.contains("grch38") || combined.contains("hg38") {
        Some(Assembly::Grch38)
    } else if combined.contains("build 37")
        || combined.contains("grch37")
        || combined.contains("hg19")
        // AncestryDNA headers cite "human reference build 37.1".
        || combined.contains("37.1")
    {
        Some(Assembly::Grch37)
    } else {
        None
    }
}

/// Inspect sampled VCF records' GT fields: any '|' separator means phased,
/// only '/' separators means unphased, and no usable GT yields `None`.
fn detect_vcf_phasing(lines: &[String]) -> Option<bool> {
    let mut saw_slash = false;
    for line in lines {
        if line.starts_with('#') {
            continue;
        }
        let fields: Vec<&str> = line.split('\t').collect();
        // Need at least one sample column (FORMAT is the 9th, 0-based, column).
        if fields.len() < 10 {
            continue;
        }
        // GT, when present, is the first key in the per-sample field.
        let gt = fields[9].split(':').next().unwrap_or_default().trim();
        if gt.contains('|') {
            return Some(true);
        }
        if gt.contains('/') {
            saw_slash = true;
        }
    }
    saw_slash.then_some(false)
}

/// Locate the companion index for an alignment or reference file.
/// An explicit index supplied via `options` always wins; otherwise the
/// conventional sibling paths (.crai/.bai/.fai) are probed on disk.
/// Returns (has_index, index_path); (None, None) for kinds with no index.
fn detect_index(
    path: &Path,
    kind: DetectedKind,
    options: &InspectOptions,
) -> (Option<bool>, Option<PathBuf>) {
    if let Some(index) = options
        .input_index
        .as_ref()
        .or(options.reference_index.as_ref())
    {
        return (Some(index.exists()), Some(index.clone()));
    }

    match kind {
        DetectedKind::AlignmentCram => {
            let candidate = if path
                .to_string_lossy()
                .to_ascii_lowercase()
                .ends_with(".cram")
            {
                // Prefer "<name>.cram.crai"; fall back to "<name>.crai".
                let first = path.with_extension("cram.crai");
                if first.exists() {
                    Some(first)
                } else {
                    Some(path.with_extension("crai"))
                }
            }
            else {
                None
            };
            match candidate {
                Some(candidate) => (Some(candidate.exists()), Some(candidate)),
                None => (Some(false), None),
            }
        }
        DetectedKind::AlignmentBam => {
            let first = path.with_extension("bam.bai");
            if first.exists() {
                return (Some(true), Some(first));
            }
            let second = path.with_extension("bai");
            (Some(second.exists()), Some(second))
        }
        DetectedKind::ReferenceFasta => {
            // samtools convention appends ".fai": "ref.fa" -> "ref.fa.fai".
            let candidate = if let Some(ext) = path.extension().and_then(|ext| ext.to_str()) {
                path.with_extension(format!("{ext}.fai"))
            } else {
                path.with_extension("fai")
            };
            (Some(candidate.exists()), Some(candidate))
        }
        _ => (None, None),
    }
}

/// True for bare FASTA paths (compressed references are handled elsewhere).
fn is_reference_path(path: &Path) -> bool {
    let lower = path.to_string_lossy().to_ascii_lowercase();
    lower.ends_with(".fa") || lower.ends_with(".fasta")
}

/// Map the detected kind plus supporting signals onto a confidence tier.
/// Binary containers are authoritative by magic bytes; text kinds depend
/// on header evidence and whether a vendor signature was found.
fn classify_confidence(
    kind: DetectedKind,
    sample_lines: &[String],
    source: Option<&SourceMetadata>,
) -> DetectionConfidence {
    match kind {
        DetectedKind::Vcf if looks_like_vcf_lines(sample_lines) => {
            DetectionConfidence::Authoritative
        }
        DetectedKind::AlignmentCram | DetectedKind::AlignmentBam | DetectedKind::ReferenceFasta => {
            DetectionConfidence::Authoritative
        }
        DetectedKind::GenotypeText if source.is_some() => DetectionConfidence::StrongHeuristic,
        DetectedKind::GenotypeText => DetectionConfidence::WeakHeuristic,
        DetectedKind::Unknown => DetectionConfidence::Unknown,
        // VCF by extension only (no header seen in the sample).
        DetectedKind::Vcf => DetectionConfidence::StrongHeuristic,
    }
}

fn render_container(value: FileContainer) -> &'static str {
    match value {
        FileContainer::Plain => "plain",
        FileContainer::Zip => "zip",
    }
}

fn render_kind(value: DetectedKind) -> &'static str {
    match value {
        DetectedKind::GenotypeText => "genotype_text",
        DetectedKind::Vcf => "vcf",
        DetectedKind::AlignmentCram => "alignment_cram",
        DetectedKind::AlignmentBam => "alignment_bam",
        DetectedKind::ReferenceFasta => "reference_fasta",
        DetectedKind::Unknown => "unknown",
    }
+} + +fn render_confidence(value: DetectionConfidence) -> &'static str { + match value { + DetectionConfidence::Authoritative => "authoritative", + DetectionConfidence::StrongHeuristic => "strong_heuristic", + DetectionConfidence::WeakHeuristic => "weak_heuristic", + DetectionConfidence::Unknown => "unknown", + } +} + +fn render_assembly(value: Option) -> &'static str { + match value { + Some(Assembly::Grch37) => "grch37", + Some(Assembly::Grch38) => "grch38", + None => "", + } +} + +fn render_bool(value: Option) -> &'static str { + match value { + Some(true) => "true", + Some(false) => "false", + None => "", + } +} diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index df274b3..687c969 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -9,9 +9,14 @@ mod alignment; mod genotype; +mod inspect; mod prepare; pub use genotype::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, }; +pub use inspect::{ + DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions, + SourceMetadata, inspect_file, +}; pub use prepare::{PrepareRequest, PreparedPaths, prepare_indexes, shell_flags}; diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index 645a6b9..74b73dc 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -1,5 +1,5 @@ use std::{ - fs, + env, fs, io::Write, path::PathBuf, time::{SystemTime, UNIX_EPOCH}, @@ -31,6 +31,35 @@ fn repo_root() -> PathBuf { .to_path_buf() } +fn shared_test_data_root() -> Option { + if let Some(path) = env::var_os("BIOSCRIPT_TEST_DATA_DIR") { + let candidate = PathBuf::from(path); + if candidate.exists() { + return Some(candidate); + } + } + + let local = repo_root().join("test-data"); + if local.exists() { + return Some(local); + } + + let home_cache = env::var_os("HOME") + .map(PathBuf::from) + .map(|home| 
home.join(".bioscript/cache/test-data")); + home_cache.filter(|path| path.exists()) +} + +fn shared_fixture_or_skip(test_name: &str, relative: &str) -> Option { + let root = shared_test_data_root()?; + let path = root.join(relative); + if !path.exists() { + eprintln!("skipping {test_name}: missing {}", path.display()); + return None; + } + Some(path) +} + #[test] fn zip_genotype_file_is_auto_detected_and_readable() { let dir = temp_dir("zip-auto"); @@ -81,6 +110,112 @@ fn zip_genotype_file_can_be_forced_by_format() { assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); } +#[test] +fn zip_vcf_entry_is_auto_detected_and_readable() { + let dir = temp_dir("zip-vcf"); + let zip_path = dir.join("apol1-sample.zip"); + + let file = fs::File::create(&zip_path).unwrap(); + let mut writer = zip::ZipWriter::new(file); + writer + .start_file("nested/sample.vcf", SimpleFileOptions::default()) + .unwrap(); + writer + .write_all( + b"##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +22\t36265860\trs73885319\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + writer.finish().unwrap(); + + let store = GenotypeStore::from_file(&zip_path).unwrap(); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AG")); +} + +#[test] +fn shared_real_world_zipped_genotype_exports_are_readable() { + struct FixtureExpectation { + relative: &'static str, + rsid: &'static str, + genotype: &'static str, + } + + let fixtures = [ + FixtureExpectation { + relative: "23andme/v2/hu0199C8/23data20100526.txt.zip", + rsid: "rs3094315", + genotype: "AA", + }, + FixtureExpectation { + relative: "23andme/v3/huE4DAE4/huE4DAE4_20120522224129.txt.zip", + rsid: "rs3131972", + genotype: "GG", + }, + FixtureExpectation { + relative: "23andme/v4/huE18D82/genome__v4_Full_2016.txt.zip", + rsid: "rs3131972", + genotype: "AG", + }, + FixtureExpectation { + relative: "23andme/v5/hu50B3F5/genome_hu50B3F5_v5_Full.zip", + rsid: "rs116587930", + 
genotype: "GG", + }, + FixtureExpectation { + relative: "dynamicdna/100001-synthetic/100001_X_X_GSAv3-DTC_GRCh38-07-12-2025.txt.zip", + rsid: "rs116587930", + genotype: "GG", + }, + FixtureExpectation { + relative: "ancestrydna/huE922FC/AncestryDNA.txt.zip", + rsid: "rs3131972", + genotype: "GG", + }, + FixtureExpectation { + relative: "familytreedna/hu17B792/2017-04-29_Family_Tree_DNA_Data.csv.zip", + rsid: "rs1000530", + genotype: "TT", + }, + FixtureExpectation { + relative: "genesforgood/hu80B047/GFG0_filtered_imputed_genotypes_noY_noMT_23andMe.txt.zip", + rsid: "rs3094315", + genotype: "AA", + }, + FixtureExpectation { + relative: "myheritage/hu33515F/MyHeritage_raw_dna_data.zip", + rsid: "rs3131972", + genotype: "GG", + }, + ]; + + for fixture in fixtures { + let Some(path) = shared_fixture_or_skip( + "shared_real_world_zipped_genotype_exports_are_readable", + fixture.relative, + ) else { + return; + }; + + let store = GenotypeStore::from_file(&path).unwrap(); + assert_eq!( + store.get(fixture.rsid).unwrap().as_deref(), + Some(fixture.genotype), + "fixture {}", + fixture.relative + ); + } +} + +#[test] +fn bundled_dynamicdna_gsav3_plain_text_fixture_is_readable() { + let path = repo_root() + .join("old/examples/apol1/genotype_files/108179_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt"); + let store = GenotypeStore::from_file(&path).unwrap(); + assert_eq!(store.get("rs73885319").unwrap().as_deref(), Some("AA")); +} + #[test] fn vcf_variant_lookup_reads_single_sample_calls() { let dir = temp_dir("vcf"); @@ -117,15 +252,51 @@ struct CramFixture { input_index: PathBuf, } +fn run_large_cram_tests() -> bool { + env::var_os("BIOSCRIPT_RUN_LARGE_TESTS").is_some() +} + +fn require_large_cram_tests(test_name: &str) -> bool { + if run_large_cram_tests() { + true + } else { + eprintln!("skipping {test_name}: set BIOSCRIPT_RUN_LARGE_TESTS=1 to enable"); + false + } +} + fn cram_fixture_or_skip(test_name: &str) -> Option { - let root = repo_root(); + if 
!require_large_cram_tests(test_name) { + return None; + } + let root = shared_test_data_root()?; let fx = CramFixture { - cram: root.join("../test-data/1k-genomes/aligned/NA06985.final.cram"), - reference: root - .join("../test-data/1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"), - reference_index: root - .join("../test-data/1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai"), - input_index: root.join("../test-data/1k-genomes/aligned/NA06985.final.cram.crai"), + cram: root.join("1k-genomes/aligned/NA06985.final.cram"), + reference: root.join("1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"), + reference_index: root.join("1k-genomes/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai"), + input_index: root.join("1k-genomes/aligned/NA06985.final.cram.crai"), + }; + for p in [ + &fx.cram, + &fx.reference, + &fx.reference_index, + &fx.input_index, + ] { + if !p.exists() { + eprintln!("skipping {test_name}: missing {}", p.display()); + return None; + } + } + Some(fx) +} + +fn chr_y_cram_fixture_or_skip(test_name: &str) -> Option { + let root = shared_test_data_root()?; + let fx = CramFixture { + cram: root.join("NA06985-chrY/aligned/NA06985.final.chrY.cram"), + reference: root.join("NA06985-chrY/ref/GRCh38_chrY.fa"), + reference_index: root.join("NA06985-chrY/ref/GRCh38_chrY.fa.fai"), + input_index: root.join("NA06985-chrY/aligned/NA06985.final.chrY.cram.crai"), }; for p in [ &fx.cram, @@ -259,6 +430,49 @@ fn cram_mini_fixture_md5_mismatch_is_tolerated() { assert_eq!(observation.alt_count.unwrap_or(0), 0); } +#[test] +fn cram_chr_y_fixture_lookup_is_fast_and_correct() { + let Some(fx) = chr_y_cram_fixture_or_skip("cram_chr_y_fixture_lookup_is_fast_and_correct") + else { + return; + }; + let store = open_cram_store(&fx); + + let start = std::time::Instant::now(); + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["chrY_smoke_3449570".to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: 
"chrY".to_owned(), + start: 3_449_570, + end: 3_449_570, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .expect("chrY lookup"); + let elapsed = start.elapsed(); + + assert_eq!(observation.backend, "cram"); + let depth = observation.depth.unwrap_or(0); + assert!( + depth >= 8, + "expected >=8 reads at chrY smoke locus, got {depth}" + ); + assert_eq!(observation.alt_count.unwrap_or(0), 0); + assert!( + observation.ref_count.unwrap_or(0) >= 8, + "expected ref-supporting reads at chrY smoke locus, got {:?}", + observation.ref_count + ); + assert!( + elapsed.as_secs() < 5, + "chrY CRAM lookup took {elapsed:?}, expected <5s" + ); +} + #[test] fn cram_apol1_snp_lookup_is_fast_and_correct() { let Some(fx) = cram_fixture_or_skip("cram_apol1_snp_lookup_is_fast_and_correct") else { diff --git a/rust/bioscript-formats/tests/fixtures/ancestrydna_csv_sample.csv b/rust/bioscript-formats/tests/fixtures/ancestrydna_csv_sample.csv new file mode 100644 index 0000000..691b351 --- /dev/null +++ b/rust/bioscript-formats/tests/fixtures/ancestrydna_csv_sample.csv @@ -0,0 +1,6 @@ +rsid,chromosome,position,allele1,allele2 +rs4477212,1,82154,T,T +rs3131972,1,752721,G,G +rs12562034,1,768448,A,G +rs11240777,1,798959,A,G +rs6681049,1,800007,C,C diff --git a/rust/bioscript-formats/tests/fixtures/ancestrydna_v2_sample.txt b/rust/bioscript-formats/tests/fixtures/ancestrydna_v2_sample.txt new file mode 100644 index 0000000..a74633a --- /dev/null +++ b/rust/bioscript-formats/tests/fixtures/ancestrydna_v2_sample.txt @@ -0,0 +1,12 @@ +#AncestryDNA raw data download +#This file was generated by AncestryDNA at: 07/29/2019 14:50:41 UTC +#Data was collected using AncestryDNA array version: V2.0 +#Data is formatted using AncestryDNA converter version: V1.0 +#Genetic data is provided below as five TAB delimited columns. 
+#Columns two and three contain the chromosome and basepair position of the SNP using human reference build 37.1 coordinates. +rsid chromosome position allele1 allele2 +rs3131972 1 752721 G G +rs114525117 1 759036 G G +rs4040617 1 779322 A A +rs141175086 1 780397 C C +rs115093905 1 787173 G G diff --git a/rust/bioscript-formats/tests/fixtures/familytreedna_sample.csv b/rust/bioscript-formats/tests/fixtures/familytreedna_sample.csv new file mode 100644 index 0000000..cec980a --- /dev/null +++ b/rust/bioscript-formats/tests/fixtures/familytreedna_sample.csv @@ -0,0 +1,6 @@ +RSID,CHROMOSOME,POSITION,RESULT +rs1000530,X,78124078,TT +rs1000900,X,7152113,GG +rs1001874,X,86808168,TT +rs1002793,X,22152783,AA +rs1003026,X,15459721,CC diff --git a/rust/bioscript-formats/tests/fixtures/genesforgood_sample.txt b/rust/bioscript-formats/tests/fixtures/genesforgood_sample.txt new file mode 100644 index 0000000..8b0f743 --- /dev/null +++ b/rust/bioscript-formats/tests/fixtures/genesforgood_sample.txt @@ -0,0 +1,9 @@ +# Genes for Good v1.2.1: filtered, imputed genotype calls. 
For more information, go to: https://genesforgood.sph.umich.edu/readme/readme1.2.1.txt +# This data file generated by PLINK on: 11/20/17 +# Human genome reference used: GRCh37/Mito:rCRS +# rsid chromosome position genotype +rs28544273 1 751343 TT +rs143225517 1 751756 TT +rs12090487 1 752242 TT +rs3094315 1 752566 AA +rs3131972 1 752721 GG diff --git a/rust/bioscript-formats/tests/inspect.rs b/rust/bioscript-formats/tests/inspect.rs new file mode 100644 index 0000000..61fab20 --- /dev/null +++ b/rust/bioscript-formats/tests/inspect.rs @@ -0,0 +1,394 @@ +use std::{ + env, + path::PathBuf, + time::{Instant, SystemTime, UNIX_EPOCH}, +}; + +use bioscript_core::Assembly; +use bioscript_formats::{DetectedKind, FileContainer, InspectOptions, inspect_file}; + +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .expect("workspace rust dir") + .parent() + .expect("bioscript repo root") + .to_path_buf() +} + +fn fixtures_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures") +} + +fn shared_test_data_root() -> Option { + if let Some(path) = env::var_os("BIOSCRIPT_TEST_DATA_DIR") { + let candidate = PathBuf::from(path); + if candidate.exists() { + return Some(candidate); + } + } + + let sibling = repo_root().join("test-data"); + if sibling.exists() { + return Some(sibling); + } + + env::var_os("HOME") + .map(PathBuf::from) + .map(|home| home.join(".bioscript/cache/test-data")) + .filter(|path| path.exists()) +} + +fn shared_fixture_or_skip(test_name: &str, relative: &str) -> Option { + let root = shared_test_data_root()?; + let path = root.join(relative); + if !path.exists() { + eprintln!("skipping {test_name}: missing {}", path.display()); + return None; + } + Some(path) +} + +fn temp_dir(label: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock drift") + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "bioscript-inspect-{label}-{}-{nanos}", + 
std::process::id() + )); + std::fs::create_dir_all(&dir).unwrap(); + dir +} + +#[test] +fn ancestrydna_text_fixture_reports_vendor_platform_and_build() { + let path = fixtures_dir().join("ancestrydna_v2_sample.txt"); + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + + assert_eq!(inspection.container, FileContainer::Plain); + assert_eq!(inspection.detected_kind, DetectedKind::GenotypeText); + assert_eq!(inspection.assembly, Some(Assembly::Grch37)); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.vendor.as_deref()), + Some("AncestryDNA") + ); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.platform_version.as_deref()), + Some("V2.0") + ); + assert!(inspection.duration_ms < 1000); +} + +#[test] +fn familytreedna_fixture_reports_vendor() { + let path = fixtures_dir().join("familytreedna_sample.csv"); + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + + assert_eq!(inspection.detected_kind, DetectedKind::GenotypeText); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.vendor.as_deref()), + Some("FamilyTreeDNA") + ); +} + +#[test] +fn genesforgood_fixture_reports_vendor_platform_and_build() { + let path = fixtures_dir().join("genesforgood_sample.txt"); + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + + assert_eq!(inspection.detected_kind, DetectedKind::GenotypeText); + assert_eq!(inspection.assembly, Some(Assembly::Grch37)); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.vendor.as_deref()), + Some("Genes for Good") + ); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.platform_version.as_deref()), + Some("v1.2.1") + ); +} + +#[test] +fn zipped_dynamicdna_fixture_reports_vendor_platform_and_build_quickly() { + let Some(path) = shared_fixture_or_skip( + "zipped_dynamicdna_fixture_reports_vendor_platform_and_build_quickly", + 
"dynamicdna/100001-synthetic/100001_X_X_GSAv3-DTC_GRCh38-07-12-2025.txt.zip", + ) else { + return; + }; + + let started = Instant::now(); + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + let elapsed = started.elapsed().as_millis(); + + assert_eq!(inspection.container, FileContainer::Zip); + assert_eq!(inspection.detected_kind, DetectedKind::GenotypeText); + assert_eq!(inspection.assembly, Some(Assembly::Grch38)); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.vendor.as_deref()), + Some("Dynamic DNA") + ); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.platform_version.as_deref()), + Some("GSAv3-DTC") + ); + assert!(inspection.duration_ms < 1000); + assert!(elapsed < 1000); +} + +#[test] +#[allow(clippy::too_many_lines)] +fn shared_zipped_vendor_fixtures_report_expected_metadata() { + struct Expectation { + relative: &'static str, + vendor: &'static str, + platform_version: Option<&'static str>, + assembly: Option, + } + + let fixtures = [ + Expectation { + relative: "23andme/v2/hu0199C8/23data20100526.txt.zip", + vendor: "23andMe", + platform_version: Some("v2"), + assembly: None, + }, + Expectation { + relative: "23andme/v3/huE4DAE4/huE4DAE4_20120522224129.txt.zip", + vendor: "23andMe", + platform_version: Some("v3"), + assembly: None, + }, + Expectation { + relative: "23andme/v4/huE18D82/genome__v4_Full_2016.txt.zip", + vendor: "23andMe", + platform_version: Some("v4"), + assembly: Some(Assembly::Grch37), + }, + Expectation { + relative: "23andme/v5/hu50B3F5/genome_hu50B3F5_v5_Full.zip", + vendor: "23andMe", + platform_version: Some("v5"), + assembly: Some(Assembly::Grch37), + }, + Expectation { + relative: "dynamicdna/100001-synthetic/100001_X_X_GSAv3-DTC_GRCh38-07-12-2025.txt.zip", + vendor: "Dynamic DNA", + platform_version: Some("GSAv3-DTC"), + assembly: Some(Assembly::Grch38), + }, + Expectation { + relative: "ancestrydna/huE922FC/AncestryDNA.txt.zip", + vendor: 
"AncestryDNA", + platform_version: Some("V2.0"), + assembly: Some(Assembly::Grch37), + }, + Expectation { + relative: "familytreedna/hu17B792/2017-04-29_Family_Tree_DNA_Data.csv.zip", + vendor: "FamilyTreeDNA", + platform_version: None, + assembly: None, + }, + Expectation { + relative: "genesforgood/hu80B047/GFG0_filtered_imputed_genotypes_noY_noMT_23andMe.txt.zip", + vendor: "Genes for Good", + platform_version: Some("v1.2.1"), + assembly: Some(Assembly::Grch37), + }, + Expectation { + relative: "myheritage/hu33515F/MyHeritage_raw_dna_data.zip", + vendor: "MyHeritage", + platform_version: None, + assembly: Some(Assembly::Grch37), + }, + ]; + + for fixture in fixtures { + let Some(path) = shared_fixture_or_skip( + "shared_zipped_vendor_fixtures_report_expected_metadata", + fixture.relative, + ) else { + return; + }; + + let started = Instant::now(); + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + let elapsed = started.elapsed().as_millis(); + + assert_eq!( + inspection.container, + FileContainer::Zip, + "fixture {}", + fixture.relative + ); + assert_eq!( + inspection.detected_kind, + DetectedKind::GenotypeText, + "fixture {}", + fixture.relative + ); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.vendor.as_deref()), + Some(fixture.vendor), + "fixture {}", + fixture.relative + ); + assert_eq!( + inspection + .source + .as_ref() + .and_then(|source| source.platform_version.as_deref()), + fixture.platform_version, + "fixture {}", + fixture.relative + ); + assert_eq!( + inspection.assembly, fixture.assembly, + "fixture {}", + fixture.relative + ); + assert!( + inspection.duration_ms < 1000, + "fixture {}", + fixture.relative + ); + assert!(elapsed < 1000, "fixture {}", fixture.relative); + } +} + +#[test] +fn chr_y_cram_fixture_reports_index_without_decoding_entire_file() { + let Some(path) = shared_fixture_or_skip( + "chr_y_cram_fixture_reports_index_without_decoding_entire_file", + 
"NA06985-chrY/aligned/NA06985.final.chrY.cram", + ) else { + return; + }; + + let started = Instant::now(); + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + let elapsed = started.elapsed().as_millis(); + + assert_eq!(inspection.detected_kind, DetectedKind::AlignmentCram); + assert_eq!(inspection.has_index, Some(true)); + assert!(inspection.index_path.is_some()); + assert!(inspection.duration_ms < 1000); + assert!(elapsed < 1000); +} + +#[test] +fn phased_vcf_reports_phasing() { + let dir = temp_dir("phased-vcf"); + let path = dir.join("sample.vcf"); + std::fs::write( + &path, + "##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +1\t100\trs1\tA\tG\t.\tPASS\t.\tGT\t0|1\n", + ) + .unwrap(); + + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.phased, Some(true)); +} + +#[test] +fn unphased_vcf_reports_false_for_slash_genotypes() { + let dir = temp_dir("unphased-vcf"); + let path = dir.join("sample.vcf"); + std::fs::write( + &path, + "##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +1\t100\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.phased, Some(false)); +} + +#[test] +fn dot_slash_genotype_is_treated_as_unphased() { + let dir = temp_dir("dot-slash-vcf"); + let path = dir.join("sample.vcf"); + std::fs::write( + &path, + "##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +1\t100\trs1\tA\tG\t.\tPASS\t.\tGT:GQ\t./.:9\n", + ) + .unwrap(); + + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.phased, 
Some(false)); +} + +#[test] +fn vcf_without_usable_gt_reports_unknown_phasing() { + let dir = temp_dir("unknown-phasing-vcf"); + let path = dir.join("sample.vcf"); + std::fs::write( + &path, + "##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +1\t100\trs1\tA\tG\t.\tPASS\t.\tDP\t42\n", + ) + .unwrap(); + + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.phased, None); +} + +#[test] +fn mixed_vcf_sampled_rows_prefer_true_when_pipe_is_seen() { + let dir = temp_dir("mixed-phasing-vcf"); + let path = dir.join("sample.vcf"); + std::fs::write( + &path, + "##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +1\t100\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n\ +1\t101\trs2\tA\tG\t.\tPASS\t.\tGT\t0|1\n", + ) + .unwrap(); + + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.phased, Some(true)); +} diff --git a/test.sh b/test.sh index 47835a9..5e93a02 100755 --- a/test.sh +++ b/test.sh @@ -2,11 +2,46 @@ set -euo pipefail ROOT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPORT=0 +LARGE=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --security) + cd "$ROOT_DIR/rust" + exec cargo test -p bioscript-runtime --test security -- --nocapture + ;; + --report) + REPORT=1 + shift + ;; + --large) + LARGE=1 + shift + ;; + *) + echo "Unknown argument: $1" >&2 + exit 1 + ;; + esac +done cd "$ROOT_DIR/rust" -if [[ "${1:-}" == "--security" ]]; then - exec cargo test -p bioscript-runtime --test security -- --nocapture +if [[ "$LARGE" == "1" ]]; then + BIOSCRIPT_RUN_LARGE_TESTS=1 cargo test -p bioscript-formats --test file_formats --test inspect -- --nocapture +else + cargo test -p bioscript-formats --test file_formats --test inspect -- --nocapture fi +cargo test -p bioscript-cli --test cli 
-- --nocapture -exec cargo test -p bioscript-cli -- --nocapture +if [[ "$REPORT" == "1" ]]; then + cargo build -p bioscript-cli + cd "$ROOT_DIR" + REPORT_PATH="$ROOT_DIR/test-reports/file-formats.html" + python3 "$ROOT_DIR/tools/generate_file_format_report.py" \ + --bioscript "$ROOT_DIR/rust/target/debug/bioscript" \ + --root "$ROOT_DIR" \ + --output "$REPORT_PATH" + echo "HTML report: $REPORT_PATH" +fi diff --git a/tools/fetch_test_data.sh b/tools/fetch_test_data.sh new file mode 100755 index 0000000..a458890 --- /dev/null +++ b/tools/fetch_test_data.sh @@ -0,0 +1,350 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Fetch test data defined in test-data/sources.yaml. +# Only downloads files that are not already present locally. +# +# Usage: +# ./fetch_test_data.sh # download all missing files +# ./fetch_test_data.sh --check # report what's present/missing +# ./fetch_test_data.sh --dataset 1k-genomes # only this dataset +# ./fetch_test_data.sh --dataset 23andme # only this dataset +# ./fetch_test_data.sh --list # show full manifest +# ./fetch_test_data.sh --only "*.crai,*.fai" # only matching files +# ./fetch_test_data.sh --exclude "*.fa,*.cram" # skip matching files + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" +DATA_DIR="${REPO_ROOT}/test-data" +DEFAULT_CACHE_ROOT="${HOME}/.bioscript/cache/test-data" +CACHE_ROOT="${BIOSCRIPT_TEST_DATA_CACHE_DIR:-${DEFAULT_CACHE_ROOT}}" +SOURCES="${SCRIPT_DIR}/sources.yaml" + +# --- Helpers ----------------------------------------------------------------- + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +info() { printf "${GREEN}[OK]${NC} %s\n" "$*"; } +warn() { printf "${YELLOW}[MISS]${NC} %s\n" "$*"; } +fetching() { printf "${CYAN}[FETCH]${NC} %s\n" "$*"; } +error() { printf "${RED}[ERR]${NC} %s\n" "$*"; } + +human_size() { + local bytes=$1 + if [ "$bytes" -ge 1073741824 ] 2>/dev/null; then echo "$(( bytes / 1073741824 )) GB" + elif [ "$bytes" -ge 1048576 ] 2>/dev/null; then echo "$(( bytes / 1048576 )) MB" + elif [ "$bytes" -ge 1024 ] 2>/dev/null; then echo "$(( bytes / 1024 )) KB" + else echo "${bytes} B" + fi +} + +file_size() { + stat -f%z "$1" 2>/dev/null || stat --printf="%s" "$1" 2>/dev/null || echo "0" +} + +ensure_parent_dir() { + mkdir -p "$(dirname "$1")" +} + +reconstruct_split_archive() { + local cache_dir="$1" + local repo_dir="$2" + local filename="$3" + + case "$filename" in + *.tar.gz.??) ;; + *) return 0 ;; + esac + + local archive_name="${filename%.[A-Za-z][A-Za-z]}" + local final_name="${archive_name%.tar.gz}" + local final_cache_path="${cache_dir}/${final_name}" + local final_repo_path="${repo_dir}/${final_name}" + local first_part="${cache_dir}/${archive_name}.aa" + + if [ -f "$final_cache_path" ]; then + ensure_repo_symlink "$final_cache_path" "$final_repo_path" + return 0 + fi + + if [ ! 
-f "$first_part" ]; then + return 0 + fi + + local parts=() + local candidate="" + for candidate in "${cache_dir}/${archive_name}".??; do + [ -e "$candidate" ] || continue + parts+=("$candidate") + done + + if [ "${#parts[@]}" -eq 0 ]; then + return 0 + fi + + local tmpdir + tmpdir="$(mktemp -d "${cache_dir}/reconstruct.XXXXXX")" + + if cat "${parts[@]}" | tar xzf - -C "$tmpdir"; then + local extracted + extracted="$(find "$tmpdir" -type f | head -n 1)" + if [ -n "$extracted" ] && [ -f "$extracted" ]; then + mv "$extracted" "$final_cache_path" + ensure_repo_symlink "$final_cache_path" "$final_repo_path" + fi + fi + + rm -rf "$tmpdir" +} + +ensure_repo_symlink() { + local cache_path="$1" + local repo_path="$2" + + ensure_parent_dir "$repo_path" + + if [ -L "$repo_path" ] && [ "$(readlink "$repo_path")" = "$cache_path" ]; then + return 0 + fi + + rm -f "$repo_path" + ln -s "$cache_path" "$repo_path" +} + +migrate_repo_file_into_cache() { + local repo_path="$1" + local cache_path="$2" + + ensure_parent_dir "$cache_path" + mv "$repo_path" "$cache_path" + ensure_repo_symlink "$cache_path" "$repo_path" +} + +materialize_local_file() { + local label="$1" + local repo_path="$2" + local cache_path="$3" + + if [ -f "$cache_path" ]; then + ensure_repo_symlink "$cache_path" "$repo_path" + return 0 + fi + + if [ -f "$repo_path" ] && [ ! 
-L "$repo_path" ]; then + if migrate_repo_file_into_cache "$repo_path" "$cache_path"; then + info "${label} (migrated into cache)" + return 0 + fi + error "${label} — failed to migrate into cache" + return 1 + fi + + return 1 +} + +# --- Parse sources.yaml via python and emit tab-separated lines -------------- +# Output: datasetsubdirfilenameurl + +parse_sources() { + python3 -c " +import os, sys +from urllib.parse import unquote, urlparse + +import yaml + +def basename(url: str) -> str: + return os.path.basename(unquote(urlparse(url).path)) + +with open(sys.argv[1]) as f: + cfg = yaml.safe_load(f) + +for ds_name, ds in cfg.get('datasets', {}).items(): + for sd_name, sd in ds.get('subdirs', {}).items(): + for f in sd.get('files', []): + print(f\"{ds_name}\t{sd_name}\t{f['name']}\t{f['url']}\") + +for sample_name, sample in cfg.get('sample_data_urls', {}).items(): + for key, subdir in ( + ('ref', 'ref'), + ('ref_index', 'ref'), + ('aligned', 'aligned'), + ('aligned_index', 'aligned'), + ('snp', 'snp'), + ): + value = sample.get(key) + if not value: + continue + urls = value if isinstance(value, list) else [value] + for url in urls: + print(f\"{sample_name}\t{subdir}\t{basename(url)}\t{url}\") +" "$SOURCES" +} + +# --- CLI --------------------------------------------------------------------- + +MODE="download" +FILTER_DATASET="" +ONLY_PATTERNS="" +EXCLUDE_PATTERNS="" + +# Check if a filename matches any pattern in a comma-separated list +# Supports: exact name, glob (*), partial substring +matches_any() { + local filename="$1" + local patterns="$2" + [ -z "$patterns" ] && return 1 + local IFS=',' + for pat in $patterns; do + pat="$(echo "$pat" | sed 's/^ *//;s/ *$//')" + case "$filename" in + $pat) return 0 ;; + esac + done + return 1 +} + +while [ $# -gt 0 ]; do + case "$1" in + --check) MODE="check"; shift ;; + --list) MODE="list"; shift ;; + --dataset) FILTER_DATASET="$2"; shift 2 ;; + --only) ONLY_PATTERNS="$2"; shift 2 ;; + --exclude) EXCLUDE_PATTERNS="$2"; 
shift 2 ;; + --help|-h) + cat <<EOF +Usage: $0 [options] + +Options: + --check Report what's present/missing without downloading + --list Show the full manifest + --dataset <name> Only process this dataset (e.g. 1k-genomes, 23andme) + --only <patterns> Only include files matching patterns (comma-separated globs) + --exclude <patterns> Skip files matching patterns (comma-separated globs) + -h, --help Show this help + +Filter examples: + --only "*.zip" Only 23andMe zip files + --only "*.fai,*.crai" Only index files + --exclude "*.fa,*.cram" Skip large files, get indexes only + --dataset 23andme --only "*v5*" Only the v5 23andMe file + +Datasets: + 1k-genomes GRCh38 reference genome + NA06985 high-coverage CRAM + 23andme 23andMe SNP exports (v2-v5) from OpenMined biovault-data + dynamicdna Dynamic DNA GSAv3-DTC synthetic export on GRCh38 + +Data directory: ${DATA_DIR}/ +Cache directory: ${CACHE_ROOT}/ +Config file: ${SOURCES} +EOF + exit 0 + ;; + *) error "Unknown argument: $1"; exit 1 ;; + esac +done + +if [ ! -f "$SOURCES" ]; then + error "Sources file not found: ${SOURCES}" + exit 1 +fi + +for cmd in curl python3; do + if ! command -v "$cmd" &>/dev/null; then + error "$cmd is required but not found." + exit 1 + fi +done + +echo "" +echo "PGx Test Data Fetcher" +echo "=====================" +echo "Config: ${SOURCES}" +echo "Data: ${DATA_DIR}" +echo "Cache: ${CACHE_ROOT}" +[ -n "$FILTER_DATASET" ] && echo "Dataset: ${FILTER_DATASET}" +[ -n "$ONLY_PATTERNS" ] && echo "Only: ${ONLY_PATTERNS}" +[ -n "$EXCLUDE_PATTERNS" ] && echo "Exclude: ${EXCLUDE_PATTERNS}" +echo "" + +missing=0 +present=0 +downloaded=0 +failed=0 + +while IFS=$'\t' read -r dataset subdir filename url; do + # apply dataset filter + if [ -n "$FILTER_DATASET" ] && [ "$dataset" != "$FILTER_DATASET" ]; then + continue + fi + + # apply --only filter (match against filename or full path) + if [ -n "$ONLY_PATTERNS" ]; then + if ! matches_any "$filename" "$ONLY_PATTERNS" && ! 
matches_any "${dataset}/${subdir}/${filename}" "$ONLY_PATTERNS"; then + continue + fi + fi + + # apply --exclude filter + if [ -n "$EXCLUDE_PATTERNS" ]; then + if matches_any "$filename" "$EXCLUDE_PATTERNS" || matches_any "${dataset}/${subdir}/${filename}" "$EXCLUDE_PATTERNS"; then + continue + fi + fi + + label="${dataset}/${subdir}/${filename}" + repo_dir="${DATA_DIR}/${dataset}/${subdir}" + repo_path="${repo_dir}/${filename}" + cache_dir="${CACHE_ROOT}/${dataset}/${subdir}" + cache_path="${cache_dir}/${filename}" + + if [ "$MODE" = "list" ]; then + printf " %-60s %s\n" "$label" "$url" + continue + fi + + if materialize_local_file "$label" "$repo_path" "$cache_path"; then + reconstruct_split_archive "$cache_dir" "$repo_dir" "$filename" + size=$(file_size "$cache_path") + info "${label} ($(human_size "$size"))" + present=$((present + 1)) + continue + fi + + if [ "$MODE" = "check" ]; then + warn "${label}" + missing=$((missing + 1)) + continue + fi + + # download + mkdir -p "$cache_dir" + fetching "${label}" + echo " ${url}" + + if curl -fSL --progress-bar --retry 3 --retry-delay 5 -o "$cache_path" "$url"; then + ensure_repo_symlink "$cache_path" "$repo_path" + reconstruct_split_archive "$cache_dir" "$repo_dir" "$filename" + size=$(file_size "$cache_path") + info "${label} ($(human_size "$size"))" + downloaded=$((downloaded + 1)) + else + error "${label} — download failed" + rm -f "$cache_path" + failed=$((failed + 1)) + fi + +done < <(parse_sources) + +echo "" +echo "Summary: ${present} present, ${downloaded} downloaded, ${missing} missing, ${failed} failed" + +if [ "$failed" -gt 0 ]; then + exit 1 +fi diff --git a/tools/generate_file_format_report.py b/tools/generate_file_format_report.py new file mode 100644 index 0000000..bd6971a --- /dev/null +++ b/tools/generate_file_format_report.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import html +import subprocess +import sys +from datetime import datetime, 
timezone +from pathlib import Path + + +INSPECT_EXTENSIONS = ( + ".zip", + ".vcf", + ".vcf.gz", + ".cram", + ".bam", + ".fa", + ".fasta", + ".fa.gz", + ".fna", +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--bioscript", required=True, type=Path) + parser.add_argument("--root", required=True, type=Path) + parser.add_argument("--output", required=True, type=Path) + return parser.parse_args() + + +def iter_targets(test_data_root: Path) -> list[Path]: + targets: list[Path] = [] + for path in sorted(test_data_root.rglob("*")): + if not path.is_file(): + continue + lower = path.name.lower() + if lower.endswith(INSPECT_EXTENSIONS): + targets.append(path) + return targets + + +def inspect_file(bioscript: Path, root: Path, path: Path) -> dict[str, str]: + proc = subprocess.run( + [str(bioscript), "inspect", str(path)], + cwd=root, + text=True, + capture_output=True, + check=False, + ) + result = { + "path": str(path.relative_to(root)), + "status": "ok" if proc.returncode == 0 else "error", + "stderr": proc.stderr.strip(), + } + if proc.returncode != 0: + return result + for line in proc.stdout.splitlines(): + if "\t" not in line: + continue + key, value = line.split("\t", 1) + result[key] = value + return result + + +def cell(value: str) -> str: + return html.escape(value or "") + + +def render_html(rows: list[dict[str, str]]) -> str: + generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + total = len(rows) + failures = sum(1 for row in rows if row["status"] != "ok") + headers = [ + "path", + "container", + "kind", + "confidence", + "vendor", + "platform_version", + "assembly", + "phased", + "selected_entry", + "has_index", + "index_path", + "reference_matches", + "source_confidence", + "duration_ms", + "source_evidence", + "evidence", + "warnings", + "stderr", + ] + lines = [ + "<!DOCTYPE html>", + "<html>", + "<head>", + '<meta charset="utf-8">', + "<title>Bioscript File Format Report</title>", + "<style>table{border-collapse:collapse}th,td{border:1px solid #ccc;padding:4px}tr.error{background:#fdd}</style>", + "</head>", + "<body>", + "<h1>Bioscript File Format Report</h1>", + f"<p>Generated {cell(generated_at)}. Files inspected: {total}. Failures: {failures}.</p>", + "<table>", + "<tr>" + "".join(f"<th>{cell(header)}</th>" for header in headers) + "</tr>", + ] + for row in rows: + css = "ok" if row["status"] == "ok" else "error" + lines.append(f'<tr class="{css}">') + for header in headers: + value = row.get(header, "") + if header == "path": + lines.append(f"<td><code>{cell(value)}</code></td>") + else: + lines.append(f"<td>{cell(value)}</td>") + lines.append("</tr>") + lines.extend(["</table>", "</body>", "</html>"]) + return "\n".join(lines) + + +def main() -> int: + args = parse_args() + test_data_root = args.root / "test-data" + if not args.bioscript.exists(): + print(f"bioscript binary not found: {args.bioscript}", file=sys.stderr) + return 1 + if not test_data_root.exists(): + print(f"test-data directory not found: {test_data_root}", file=sys.stderr) + return 1 + + rows = [inspect_file(args.bioscript, args.root, path) for path in iter_targets(test_data_root)] + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(render_html(rows), encoding="utf-8") + print(args.output) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/sources.yaml b/tools/sources.yaml new file mode 100644 index 0000000..e9435c4 --- /dev/null +++ b/tools/sources.yaml @@ -0,0 +1,120 @@ +# Test data source manifest +# Used by fetch_test_data.sh to download missing files. +# Each dataset has a local directory and a list of files with URLs. + +datasets: + 1k-genomes: + description: "1000 Genomes Project high-coverage data (NA06985, CEU population)" + subdirs: + ref: + description: "GRCh38 reference genome" + files: + - name: "GRCh38_full_analysis_set_plus_decoy_hla.fa" + url: "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa" + description: "GRCh38 reference FASTA with decoy and HLA sequences" + - name: "GRCh38_full_analysis_set_plus_decoy_hla.fa.fai" + url: "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa.fai" + description: "FASTA index for the reference genome" + aligned: + description: "High-coverage aligned reads" + files: + - name: "NA06985.final.cram" + url: "https://ftp-trace.ncbi.nih.gov/1000genomes/ftp/1000G_2504_high_coverage/data/ERR3239276/NA06985.final.cram" + description: "CRAM alignment for NA06985 (30x WGS)" + - name: "NA06985.final.cram.crai" + url: 
"https://ftp-trace.ncbi.nih.gov/1000genomes/ftp/1000G_2504_high_coverage/data/ERR3239276/NA06985.final.cram.crai" + description: "CRAM index" + + 23andme: + description: "23andMe SNP genotype files from OpenMined biovault-data (PGP participants)" + source_repo: "https://github.com/OpenMined/biovault-data/tree/main/snp/23andme" + subdirs: + v2/hu0199C8: + description: "23andMe chip version 2, participant hu0199C8" + files: + - name: "23data20100526.txt.zip" + url: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/23andme/v2/hu0199C8/23data20100526.txt.zip" + description: "PGP participant hu0199C8, 23andMe v2 export (2010)" + v3/huE4DAE4: + description: "23andMe chip version 3, participant huE4DAE4" + files: + - name: "huE4DAE4_20120522224129.txt.zip" + url: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/23andme/v3/huE4DAE4/huE4DAE4_20120522224129.txt.zip" + description: "PGP participant huE4DAE4, 23andMe v3 export (2012)" + v4/huE18D82: + description: "23andMe chip version 4, participant huE18D82" + files: + - name: "genome__v4_Full_2016.txt.zip" + url: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/23andme/v4/huE18D82/genome__v4_Full_2016.txt.zip" + description: "PGP participant huE18D82, 23andMe v4 export (2016)" + v5/hu50B3F5: + description: "23andMe chip version 5, participant hu50B3F5" + files: + - name: "genome_hu50B3F5_v5_Full.zip" + url: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/23andme/v5/hu50B3F5/genome_hu50B3F5_v5_Full.zip" + description: "PGP participant hu50B3F5, 23andMe v5 export" + + dynamicdna: + description: "Dynamic DNA SNP genotype files from OpenMined biovault-data" + source_repo: "https://github.com/OpenMined/biovault-data/tree/main/snp/dynamicdna" + subdirs: + 100001-synthetic: + description: "Synthetic Dynamic DNA GSAv3-DTC participant on GRCh38" + files: + - name: "100001_X_X_GSAv3-DTC_GRCh38-07-12-2025.txt.zip" + url: 
"https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/dynamicdna/100001-synthetic/100001_X_X_GSAv3-DTC_GRCh38-07-12-2025.txt.zip" + description: "Dynamic DNA GSAv3-DTC synthetic export on GRCh38" + + ancestrydna: + description: "AncestryDNA SNP genotype files from OpenMined biovault-data" + source_repo: "https://github.com/OpenMined/biovault-data/tree/main/snp/ancestrydna" + subdirs: + huE922FC: + description: "AncestryDNA V2.0 export" + files: + - name: "AncestryDNA.txt.zip" + url: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/ancestrydna/huE922FC/AncestryDNA.txt.zip" + description: "AncestryDNA export with V2.0 array header and build 37.1 coordinates" + + familytreedna: + description: "FamilyTreeDNA SNP genotype files from OpenMined biovault-data" + source_repo: "https://github.com/OpenMined/biovault-data/tree/main/snp/familytreedna" + subdirs: + hu17B792: + description: "FamilyTreeDNA CSV export" + files: + - name: "2017-04-29_Family_Tree_DNA_Data.csv.zip" + url: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/familytreedna/hu17B792/2017-04-29_Family_Tree_DNA_Data.csv.zip" + description: "FamilyTreeDNA CSV export wrapped as zip" + + genesforgood: + description: "Genes for Good genotype files from OpenMined biovault-data" + source_repo: "https://github.com/OpenMined/biovault-data/tree/main/snp/genesforgood" + subdirs: + hu80B047: + description: "Genes for Good filtered unphased genotype calls on GRCh37" + files: + - name: "GFG0_filtered_imputed_genotypes_noY_noMT_23andMe.txt.zip" + url: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/genesforgood/hu80B047/GFG0_filtered_imputed_genotypes_noY_noMT_23andMe.txt.zip" + description: "Genes for Good filtered, imputed genotype export on GRCh37" + + myheritage: + description: "MyHeritage SNP genotype files from OpenMined biovault-data" + source_repo: "https://github.com/OpenMined/biovault-data/tree/main/snp/myheritage" + subdirs: + hu33515F: + 
description: "MyHeritage raw DNA export" + files: + - name: "MyHeritage_raw_dna_data.zip" + url: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/snp/myheritage/hu33515F/MyHeritage_raw_dna_data.zip" + description: "MyHeritage raw DNA export for participant hu33515F" + +sample_data_urls: + NA06985-chrY: + ref_version: GRCh38 + ref: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/cram/reference/GRCh38_chrY.fa" + ref_index: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/cram/reference/GRCh38_chrY.fa.fai" + aligned: + - "https://raw.githubusercontent.com/OpenMined/biovault-data/main/cram/NA06985/NA06985.final.chrY.cram.tar.gz.aa" + - "https://raw.githubusercontent.com/OpenMined/biovault-data/main/cram/NA06985/NA06985.final.chrY.cram.tar.gz.ab" + aligned_index: "https://raw.githubusercontent.com/OpenMined/biovault-data/main/cram/NA06985/NA06985.final.chrY.cram.crai"