diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 055f9fd..265841a 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -1012,6 +1012,8 @@ dependencies = [ "noodles-csi", "noodles-fasta", "noodles-sam", + "noodles-tabix", + "noodles-vcf", ] [[package]] @@ -1096,6 +1098,30 @@ dependencies = [ "noodles-csi", ] +[[package]] +name = "noodles-tabix" +version = "0.61.0" +dependencies = [ + "bstr", + "indexmap", + "noodles-bgzf", + "noodles-core", + "noodles-csi", +] + +[[package]] +name = "noodles-vcf" +version = "0.87.0" +dependencies = [ + "indexmap", + "memchr", + "noodles-bgzf", + "noodles-core", + "noodles-csi", + "noodles-tabix", + "percent-encoding", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -1140,6 +1166,12 @@ dependencies = [ "indexmap", ] +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + [[package]] name = "phf" version = "0.11.3" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index ca3d8b4..19eda18 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -22,3 +22,5 @@ noodles-cram = { path = "../noodles/noodles-cram" } noodles-csi = { path = "../noodles/noodles-csi" } noodles-fasta = { path = "../noodles/noodles-fasta" } noodles-sam = { path = "../noodles/noodles-sam" } +noodles-tabix = { path = "../noodles/noodles-tabix" } +noodles-vcf = { path = "../noodles/noodles-vcf" } diff --git a/rust/bioscript-formats/Cargo.toml b/rust/bioscript-formats/Cargo.toml index be17e43..c657916 100644 --- a/rust/bioscript-formats/Cargo.toml +++ b/rust/bioscript-formats/Cargo.toml @@ -8,7 +8,7 @@ crate-type = ["rlib"] [dependencies] bioscript-core = { path = "../bioscript-core" } -noodles = { version = "0.109.0", features = ["bgzf", "core", "cram", "csi", "fasta", "sam"] } +noodles = { version = "0.109.0", features = ["bgzf", "core", "cram", "csi", "fasta", "sam", "tabix", "vcf"] } zip = { version = "2.2.0", default-features = false, features = ["deflate"] } [lints.clippy] diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index c319d0d..690dcc9 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -12,6 +12,7 @@ use noodles::{ self, alignment::{Record as _, record::Cigar as _}, }, + tabix, }; use bioscript_core::{GenomicLocus, RuntimeError}; @@ -94,6 +95,13 @@ where R: Read + Seek, F: FnMut(AlignmentRecord) -> Result, { + // Same idempotent-rewind rationale as `for_each_raw_cram_record_with_reader` + // — the CRAM header lives at offset 0; we must rewind before each call so + // callers can iterate over multiple loci with the same `IndexedReader`. + reader + .get_mut() + .seek(std::io::SeekFrom::Start(0)) + .map_err(|err| RuntimeError::Io(format!("failed to rewind CRAM {label}: {err}")))?; let header = reader.read_header().map_err(|err| { RuntimeError::Io(format!("failed to read CRAM header {label}: {err}")) })?; @@ -132,6 +140,15 @@ where R: Read + Seek, F: FnMut(cram::Record<'_>) -> Result, { + // Re-seeks to position 0 before reading the header so this helper is + // idempotent across repeated calls on the same indexed reader (e.g. a + // wasm caller looking up N variants in a loop). Otherwise the second + // call reads garbage because the stream position is wherever the + // previous container iteration left it. + reader + .get_mut() + .seek(std::io::SeekFrom::Start(0)) + .map_err(|err| RuntimeError::Io(format!("failed to rewind CRAM {label}: {err}")))?; let header = reader.read_header().map_err(|err| { RuntimeError::Io(format!("failed to read CRAM header {label}: {err}")) })?; @@ -206,6 +223,15 @@ pub fn parse_fai_bytes(bytes: &[u8]) -> Result .map_err(|err| RuntimeError::Io(format!("failed to parse FASTA index bytes: {err}"))) } +/// Parse a tabix index (`.tbi`) from an in-memory byte buffer. Used by wasm +/// callers that pass the small index inline while the bgzipped VCF stays on +/// a JS-backed `Read + Seek` reader. +pub fn parse_tbi_bytes(bytes: &[u8]) -> Result { + tabix::io::Reader::new(std::io::Cursor::new(bytes)) + .read_index() + .map_err(|err| RuntimeError::Io(format!("failed to parse tabix index bytes: {err}"))) +} + pub(crate) fn build_cram_indexed_reader_from_path( path: &Path, options: &GenotypeLoadOptions, diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 7571e83..34e81bb 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -8,9 +8,11 @@ use std::{ }; use noodles::bgzf; -use noodles::core::Position; +use noodles::core::{Position, Region}; use noodles::cram; +use noodles::csi::{self, BinningIndex}; use noodles::sam::alignment::Record as _; +use noodles::tabix; use zip::ZipArchive; use bioscript_core::{ @@ -1028,6 +1030,147 @@ pub fn observe_cram_snp_with_reader( }) } +/// Observe a SNP at `locus` over an already-built tabix-indexed bgzipped VCF +/// reader. Mirrors the CRAM variant for VCF: caller builds +/// `csi::io::IndexedReader::new(reader, tabix_index)` once and calls this per +/// variant. Non-filesystem callers (wasm with a JS-backed reader) go through +/// this path. +/// +/// Only the locus-based match path is implemented — rsid-only variants would +/// need a linear scan, which is a follow-up. Pass `matched_rsid` through if +/// the caller already resolved it. +pub fn observe_vcf_snp_with_reader( + indexed: &mut csi::io::IndexedReader, tabix::Index>, + label: &str, + locus: &GenomicLocus, + reference: char, + alternate: char, + matched_rsid: Option, + assembly: Option, +) -> Result +where + R: Read + Seek, +{ + let locus_label = format!("{}:{}", locus.chrom, locus.start); + + let Some(seq_name) = resolve_vcf_chrom_name(indexed.index(), &locus.chrom) else { + return Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid, + assembly, + evidence: vec![format!( + "{label}: tabix index has no contig matching {} (tried chr-prefixed and bare forms)", + locus.chrom + )], + ..VariantObservation::default() + }); + }; + + let pos_usize = usize::try_from(locus.start).map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF position {} for {locus_label}: {err}", + locus.start + )) + })?; + let position = Position::try_from(pos_usize).map_err(|err| { + RuntimeError::Io(format!( + "{label}: invalid VCF position {} for {locus_label}: {err}", + locus.start + )) + })?; + let region = Region::new(seq_name.as_str(), position..=position); + + let query = indexed + .query(®ion) + .map_err(|err| RuntimeError::Io(format!("{label}: tabix query for {locus_label}: {err}")))?; + + let reference_str = reference.to_ascii_uppercase().to_string(); + let alternate_str = alternate.to_ascii_uppercase().to_string(); + + let mut saw_any = false; + for record_result in query { + let record = record_result + .map_err(|err| RuntimeError::Io(format!("{label}: tabix record iter: {err}")))?; + let line: &str = record.as_ref(); + let Some(row) = parse_vcf_record(line)? else { + continue; + }; + if row.position != locus.start { + continue; + } + saw_any = true; + if !row.reference.eq_ignore_ascii_case(&reference_str) { + continue; + } + if !row + .alternates + .iter() + .any(|alt| alt.eq_ignore_ascii_case(&alternate_str)) + { + continue; + } + + return Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid: matched_rsid.or_else(|| row.rsid.clone()), + assembly, + genotype: Some(row.genotype.clone()), + evidence: vec![format!("{label}: resolved by locus {locus_label}")], + ..VariantObservation::default() + }); + } + + let evidence = if saw_any { + vec![format!( + "{label}: {locus_label} present but ref={reference}/alt={alternate} did not match any record" + )] + } else { + vec![format!( + "{label}: no VCF record at {locus_label}" + )] + }; + Ok(VariantObservation { + backend: "vcf".to_owned(), + matched_rsid, + assembly, + evidence, + ..VariantObservation::default() + }) +} + +/// Match the user-provided chromosome name against the tabix index's set of +/// reference sequence names. VCFs vary: some use `chr22`, others `22`. Try +/// the user's spelling verbatim, then toggle the `chr` prefix, then fall back +/// to a case-insensitive compare against the normalized suffix. +fn resolve_vcf_chrom_name(index: &tabix::Index, user_chrom: &str) -> Option { + let header = index.header()?; + let names = header.reference_sequence_names(); + + let trimmed = user_chrom.trim(); + let stripped = trimmed.strip_prefix("chr").unwrap_or(trimmed); + + let candidates = [ + trimmed.to_owned(), + stripped.to_owned(), + format!("chr{stripped}"), + ]; + for cand in &candidates { + if names.contains(cand.as_bytes()) { + return Some(cand.clone()); + } + } + // Case-insensitive fallback against the full set. + let target = stripped.to_ascii_lowercase(); + for name in names { + let as_str = std::str::from_utf8(name.as_ref()).ok()?; + let as_stripped = as_str.strip_prefix("chr").unwrap_or(as_str); + if as_stripped.eq_ignore_ascii_case(&target) { + return Some(as_str.to_owned()); + } + } + None +} + fn normalize_pileup_base(base: u8) -> Option { match (base as char).to_ascii_uppercase() { 'A' | 'C' | 'G' | 'T' => Some((base as char).to_ascii_uppercase()), diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index 065c099..9a3d25e 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -14,7 +14,7 @@ mod prepare; pub use genotype::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, - observe_cram_snp_with_reader, + observe_cram_snp_with_reader, observe_vcf_snp_with_reader, }; pub use inspect::{ DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions, diff --git a/rust/bioscript-wasm/Cargo.toml b/rust/bioscript-wasm/Cargo.toml index 7a4dce2..06c54f6 100644 --- a/rust/bioscript-wasm/Cargo.toml +++ b/rust/bioscript-wasm/Cargo.toml @@ -9,7 +9,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] bioscript-core = { path = "../bioscript-core" } bioscript-formats = { path = "../bioscript-formats" } -noodles = { version = "0.109.0", features = ["cram", "fasta"] } +noodles = { version = "0.109.0", features = ["bgzf", "cram", "csi", "fasta", "tabix"] } wasm-bindgen = "0.2" js-sys = "0.3" serde = { version = "1", features = ["derive"] } diff --git a/rust/bioscript-wasm/src/lib.rs b/rust/bioscript-wasm/src/lib.rs index 6a38bae..17bbb0f 100644 --- a/rust/bioscript-wasm/src/lib.rs +++ b/rust/bioscript-wasm/src/lib.rs @@ -22,7 +22,9 @@ use bioscript_core::GenomicLocus; use bioscript_formats::{ DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions, SourceMetadata, alignment, inspect_bytes as inspect_bytes_rs, observe_cram_snp_with_reader, + observe_vcf_snp_with_reader, }; +use noodles::csi as noodles_csi; use serde::{Deserialize, Serialize}; use wasm_bindgen::prelude::*; @@ -188,6 +190,76 @@ pub fn lookup_cram_variants( serde_json::to_string(&results).map_err(|err| JsError::new(&format!("encode results: {err}"))) } +/// Observe a list of SNP variants against a bgzipped + tabix-indexed VCF, +/// with the bulk bytes pulled on demand via a JS-supplied `readAt(offset, len)` +/// callback. The small `.tbi` payload is passed inline. +/// +/// The reader must provide the VCF synchronously — on web this is a +/// `FileReaderSync`-backed callback running inside a Web Worker. +#[wasm_bindgen(js_name = lookupVcfVariants)] +pub fn lookup_vcf_variants( + vcf_read_at: js_sys::Function, + vcf_len: f64, + tbi_bytes: &[u8], + variants_json: &str, +) -> Result { + let tabix_index = alignment::parse_tbi_bytes(tbi_bytes) + .map_err(|err| JsError::new(&format!("parse tbi: {err:?}")))?; + let vcf_reader = JsReader::new(vcf_read_at, vcf_len as u64, "vcf"); + let mut indexed = noodles_csi::io::IndexedReader::new(vcf_reader, tabix_index); + + let variants: Vec = serde_json::from_str(variants_json) + .map_err(|err| JsError::new(&format!("parse variantsJson: {err}")))?; + + let mut results = Vec::with_capacity(variants.len()); + for variant in variants { + let ref_char = variant + .ref_base + .chars() + .next() + .ok_or_else(|| JsError::new(&format!("variant {}: empty ref", variant.name)))?; + let alt_char = variant + .alt_base + .chars() + .next() + .ok_or_else(|| JsError::new(&format!("variant {}: empty alt", variant.name)))?; + let assembly = variant + .assembly + .as_deref() + .and_then(parse_assembly_str); + let locus = GenomicLocus { + chrom: variant.chrom.clone(), + start: variant.pos, + end: variant.pos, + }; + let observation = observe_vcf_snp_with_reader( + &mut indexed, + &variant.name, + &locus, + ref_char, + alt_char, + variant.rsid.clone(), + assembly, + ) + .map_err(|err| JsError::new(&format!("vcf lookup {}: {err:?}", variant.name)))?; + results.push(VariantObservationJs { + name: variant.name, + backend: observation.backend, + matched_rsid: observation.matched_rsid, + assembly: observation.assembly.map(|a| render_assembly(a).to_owned()), + genotype: observation.genotype, + ref_count: observation.ref_count, + alt_count: observation.alt_count, + depth: observation.depth, + raw_counts: observation.raw_counts, + decision: observation.decision, + evidence: observation.evidence, + }); + } + + serde_json::to_string(&results).map_err(|err| JsError::new(&format!("encode results: {err}"))) +} + fn parse_assembly_str(s: &str) -> Option { match s.to_ascii_lowercase().as_str() { "grch37" | "hg19" | "b37" => Some(bioscript_core::Assembly::Grch37),