Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions rust/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ noodles-cram = { path = "../noodles/noodles-cram" }
noodles-csi = { path = "../noodles/noodles-csi" }
noodles-fasta = { path = "../noodles/noodles-fasta" }
noodles-sam = { path = "../noodles/noodles-sam" }
noodles-tabix = { path = "../noodles/noodles-tabix" }
noodles-vcf = { path = "../noodles/noodles-vcf" }
2 changes: 1 addition & 1 deletion rust/bioscript-formats/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ crate-type = ["rlib"]

[dependencies]
bioscript-core = { path = "../bioscript-core" }
noodles = { version = "0.109.0", features = ["bgzf", "core", "cram", "csi", "fasta", "sam"] }
noodles = { version = "0.109.0", features = ["bgzf", "core", "cram", "csi", "fasta", "sam", "tabix", "vcf"] }
zip = { version = "2.2.0", default-features = false, features = ["deflate"] }

[lints.clippy]
Expand Down
26 changes: 26 additions & 0 deletions rust/bioscript-formats/src/alignment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use noodles::{
self,
alignment::{Record as _, record::Cigar as _},
},
tabix,
};

use bioscript_core::{GenomicLocus, RuntimeError};
Expand Down Expand Up @@ -94,6 +95,13 @@ where
R: Read + Seek,
F: FnMut(AlignmentRecord) -> Result<bool, RuntimeError>,
{
// Same idempotent-rewind rationale as `for_each_raw_cram_record_with_reader`
// — the CRAM header lives at offset 0; we must rewind before each call so
// callers can iterate over multiple loci with the same `IndexedReader`.
reader
.get_mut()
.seek(std::io::SeekFrom::Start(0))
.map_err(|err| RuntimeError::Io(format!("failed to rewind CRAM {label}: {err}")))?;
let header = reader.read_header().map_err(|err| {
RuntimeError::Io(format!("failed to read CRAM header {label}: {err}"))
})?;
Expand Down Expand Up @@ -132,6 +140,15 @@ where
R: Read + Seek,
F: FnMut(cram::Record<'_>) -> Result<bool, RuntimeError>,
{
// Re-seeks to position 0 before reading the header so this helper is
// idempotent across repeated calls on the same indexed reader (e.g. a
// wasm caller looking up N variants in a loop). Otherwise the second
// call reads garbage because the stream position is wherever the
// previous container iteration left it.
reader
.get_mut()
.seek(std::io::SeekFrom::Start(0))
.map_err(|err| RuntimeError::Io(format!("failed to rewind CRAM {label}: {err}")))?;
let header = reader.read_header().map_err(|err| {
RuntimeError::Io(format!("failed to read CRAM header {label}: {err}"))
})?;
Expand Down Expand Up @@ -206,6 +223,15 @@ pub fn parse_fai_bytes(bytes: &[u8]) -> Result<fasta::fai::Index, RuntimeError>
.map_err(|err| RuntimeError::Io(format!("failed to parse FASTA index bytes: {err}")))
}

/// Parse a tabix index (`.tbi`) from an in-memory byte buffer. Used by wasm
/// callers that pass the small index inline while the bgzipped VCF stays on
/// a JS-backed `Read + Seek` reader.
pub fn parse_tbi_bytes(bytes: &[u8]) -> Result<tabix::Index, RuntimeError> {
    let cursor = std::io::Cursor::new(bytes);
    let mut index_reader = tabix::io::Reader::new(cursor);
    match index_reader.read_index() {
        Ok(index) => Ok(index),
        Err(err) => Err(RuntimeError::Io(format!(
            "failed to parse tabix index bytes: {err}"
        ))),
    }
}

pub(crate) fn build_cram_indexed_reader_from_path(
path: &Path,
options: &GenotypeLoadOptions,
Expand Down
145 changes: 144 additions & 1 deletion rust/bioscript-formats/src/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ use std::{
};

use noodles::bgzf;
use noodles::core::Position;
use noodles::core::{Position, Region};
use noodles::cram;
use noodles::csi::{self, BinningIndex};
use noodles::sam::alignment::Record as _;
use noodles::tabix;
use zip::ZipArchive;

use bioscript_core::{
Expand Down Expand Up @@ -1028,6 +1030,147 @@ pub fn observe_cram_snp_with_reader<R: Read + Seek>(
})
}

/// Observe a SNP at `locus` over an already-built tabix-indexed bgzipped VCF
/// reader. Mirrors the CRAM variant for VCF: caller builds
/// `csi::io::IndexedReader::new(reader, tabix_index)` once and calls this per
/// variant. Non-filesystem callers (wasm with a JS-backed reader) go through
/// this path.
///
/// Only the locus-based match path is implemented — rsid-only variants would
/// need a linear scan, which is a follow-up. Pass `matched_rsid` through if
/// the caller already resolved it.
///
/// Returns a `VariantObservation` whose `evidence` explains the outcome when
/// no record matched; errors are reserved for I/O / parse failures.
pub fn observe_vcf_snp_with_reader<R>(
    indexed: &mut csi::io::IndexedReader<bgzf::io::Reader<R>, tabix::Index>,
    label: &str,
    locus: &GenomicLocus,
    reference: char,
    alternate: char,
    matched_rsid: Option<String>,
    assembly: Option<Assembly>,
) -> Result<VariantObservation, RuntimeError>
where
    R: Read + Seek,
{
    let locus_label = format!("{}:{}", locus.chrom, locus.start);

    // Without a contig match the query cannot succeed; report it as evidence
    // rather than an error so callers can aggregate per-variant results.
    let Some(seq_name) = resolve_vcf_chrom_name(indexed.index(), &locus.chrom) else {
        return Ok(VariantObservation {
            backend: "vcf".to_owned(),
            matched_rsid,
            assembly,
            evidence: vec![format!(
                "{label}: tabix index has no contig matching {} (tried chr-prefixed and bare forms)",
                locus.chrom
            )],
            ..VariantObservation::default()
        });
    };

    // Both narrowing steps (position -> usize -> 1-based `Position`) share
    // the same error text; build it in one place instead of duplicating the
    // format string.
    let invalid_position = |err: &dyn std::fmt::Display| {
        RuntimeError::Io(format!(
            "{label}: invalid VCF position {} for {locus_label}: {err}",
            locus.start
        ))
    };
    let pos_usize = usize::try_from(locus.start).map_err(|err| invalid_position(&err))?;
    let position = Position::try_from(pos_usize).map_err(|err| invalid_position(&err))?;
    let region = Region::new(seq_name.as_str(), position..=position);

    let query = indexed
        .query(&region)
        .map_err(|err| RuntimeError::Io(format!("{label}: tabix query for {locus_label}: {err}")))?;

    // Allele comparisons below are ASCII-case-insensitive, so uppercasing
    // first is redundant; allocate the single-char strings once up front.
    let reference_str = reference.to_string();
    let alternate_str = alternate.to_string();

    let mut saw_any = false;
    for record_result in query {
        let record = record_result
            .map_err(|err| RuntimeError::Io(format!("{label}: tabix record iter: {err}")))?;
        let line: &str = record.as_ref();
        let Some(row) = parse_vcf_record(line)? else {
            continue;
        };
        // The tabix query yields every record overlapping the region; only an
        // exact position match counts as "present" for a SNP lookup.
        if row.position != locus.start {
            continue;
        }
        saw_any = true;
        if !row.reference.eq_ignore_ascii_case(&reference_str) {
            continue;
        }
        if !row
            .alternates
            .iter()
            .any(|alt| alt.eq_ignore_ascii_case(&alternate_str))
        {
            continue;
        }

        // `row` is dead past this point — move its fields out rather than
        // cloning them.
        return Ok(VariantObservation {
            backend: "vcf".to_owned(),
            matched_rsid: matched_rsid.or(row.rsid),
            assembly,
            genotype: Some(row.genotype),
            evidence: vec![format!("{label}: resolved by locus {locus_label}")],
            ..VariantObservation::default()
        });
    }

    let evidence = if saw_any {
        vec![format!(
            "{label}: {locus_label} present but ref={reference}/alt={alternate} did not match any record"
        )]
    } else {
        vec![format!("{label}: no VCF record at {locus_label}")]
    };
    Ok(VariantObservation {
        backend: "vcf".to_owned(),
        matched_rsid,
        assembly,
        evidence,
        ..VariantObservation::default()
    })
}

/// Match the user-provided chromosome name against the tabix index's set of
/// reference sequence names. VCFs vary: some use `chr22`, others `22`. Try
/// the user's spelling verbatim, then toggle the `chr` prefix, then fall back
/// to a case-insensitive compare against the normalized suffix.
///
/// Returns the index's spelling of the contig (so queries use the name the
/// index actually knows), or `None` when nothing matches.
fn resolve_vcf_chrom_name(index: &tabix::Index, user_chrom: &str) -> Option<String> {
    let header = index.header()?;
    let names = header.reference_sequence_names();

    let trimmed = user_chrom.trim();
    let stripped = trimmed.strip_prefix("chr").unwrap_or(trimmed);

    // Exact candidates first: the user's spelling, the bare name, and the
    // chr-prefixed name.
    let candidates = [
        trimmed.to_owned(),
        stripped.to_owned(),
        format!("chr{stripped}"),
    ];
    for cand in &candidates {
        if names.contains(cand.as_bytes()) {
            return Some(cand.clone());
        }
    }
    // Case-insensitive fallback against the full set. A non-UTF-8 contig
    // name is skipped instead of aborting the whole search — the previous
    // `.ok()?` made the first undecodable name return `None` for the entire
    // lookup, hiding valid later entries. `eq_ignore_ascii_case` already
    // ignores case, so no lowercased copy of `stripped` is needed.
    for name in names {
        let Ok(as_str) = std::str::from_utf8(name.as_ref()) else {
            continue;
        };
        let as_stripped = as_str.strip_prefix("chr").unwrap_or(as_str);
        if as_stripped.eq_ignore_ascii_case(stripped) {
            return Some(as_str.to_owned());
        }
    }
    None
}

fn normalize_pileup_base(base: u8) -> Option<char> {
match (base as char).to_ascii_uppercase() {
'A' | 'C' | 'G' | 'T' => Some((base as char).to_ascii_uppercase()),
Expand Down
2 changes: 1 addition & 1 deletion rust/bioscript-formats/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ mod prepare;

pub use genotype::{
BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind,
observe_cram_snp_with_reader,
observe_cram_snp_with_reader, observe_vcf_snp_with_reader,
};
pub use inspect::{
DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions,
Expand Down
2 changes: 1 addition & 1 deletion rust/bioscript-wasm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ crate-type = ["cdylib", "rlib"]
[dependencies]
bioscript-core = { path = "../bioscript-core" }
bioscript-formats = { path = "../bioscript-formats" }
noodles = { version = "0.109.0", features = ["cram", "fasta"] }
noodles = { version = "0.109.0", features = ["bgzf", "cram", "csi", "fasta", "tabix"] }
wasm-bindgen = "0.2"
js-sys = "0.3"
serde = { version = "1", features = ["derive"] }
Expand Down
72 changes: 72 additions & 0 deletions rust/bioscript-wasm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ use bioscript_core::GenomicLocus;
use bioscript_formats::{
DetectedKind, DetectionConfidence, FileContainer, FileInspection, InspectOptions,
SourceMetadata, alignment, inspect_bytes as inspect_bytes_rs, observe_cram_snp_with_reader,
observe_vcf_snp_with_reader,
};
use noodles::csi as noodles_csi;
use serde::{Deserialize, Serialize};
use wasm_bindgen::prelude::*;

Expand Down Expand Up @@ -188,6 +190,76 @@ pub fn lookup_cram_variants(
serde_json::to_string(&results).map_err(|err| JsError::new(&format!("encode results: {err}")))
}

/// Observe a list of SNP variants against a bgzipped + tabix-indexed VCF.
/// Bulk VCF bytes are pulled on demand through a JS-supplied
/// `readAt(offset, len)` callback, while the small `.tbi` payload arrives
/// inline as `tbi_bytes`.
///
/// The callback must serve the VCF synchronously — on web that means a
/// `FileReaderSync`-backed callback running inside a Web Worker.
#[wasm_bindgen(js_name = lookupVcfVariants)]
pub fn lookup_vcf_variants(
    vcf_read_at: js_sys::Function,
    vcf_len: f64,
    tbi_bytes: &[u8],
    variants_json: &str,
) -> Result<String, JsError> {
    // Decode the inline index first; without it the reader is useless.
    let index = match alignment::parse_tbi_bytes(tbi_bytes) {
        Ok(index) => index,
        Err(err) => return Err(JsError::new(&format!("parse tbi: {err:?}"))),
    };
    let reader = JsReader::new(vcf_read_at, vcf_len as u64, "vcf");
    let mut indexed = noodles_csi::io::IndexedReader::new(reader, index);

    let variants: Vec<VariantInput> = serde_json::from_str(variants_json)
        .map_err(|err| JsError::new(&format!("parse variantsJson: {err}")))?;

    let mut results = Vec::with_capacity(variants.len());
    for variant in variants {
        // SNP lookups take a single base on each side; reject empty alleles.
        let Some(ref_char) = variant.ref_base.chars().next() else {
            return Err(JsError::new(&format!("variant {}: empty ref", variant.name)));
        };
        let Some(alt_char) = variant.alt_base.chars().next() else {
            return Err(JsError::new(&format!("variant {}: empty alt", variant.name)));
        };
        let assembly = variant.assembly.as_deref().and_then(parse_assembly_str);
        let locus = GenomicLocus {
            chrom: variant.chrom.clone(),
            start: variant.pos,
            end: variant.pos,
        };
        let observation = match observe_vcf_snp_with_reader(
            &mut indexed,
            &variant.name,
            &locus,
            ref_char,
            alt_char,
            variant.rsid.clone(),
            assembly,
        ) {
            Ok(observation) => observation,
            Err(err) => {
                return Err(JsError::new(&format!(
                    "vcf lookup {}: {err:?}",
                    variant.name
                )));
            }
        };
        results.push(VariantObservationJs {
            name: variant.name,
            backend: observation.backend,
            matched_rsid: observation.matched_rsid,
            assembly: observation.assembly.map(|a| render_assembly(a).to_owned()),
            genotype: observation.genotype,
            ref_count: observation.ref_count,
            alt_count: observation.alt_count,
            depth: observation.depth,
            raw_counts: observation.raw_counts,
            decision: observation.decision,
            evidence: observation.evidence,
        });
    }

    serde_json::to_string(&results).map_err(|err| JsError::new(&format!("encode results: {err}")))
}

fn parse_assembly_str(s: &str) -> Option<bioscript_core::Assembly> {
match s.to_ascii_lowercase().as_str() {
"grch37" | "hg19" | "b37" => Some(bioscript_core::Assembly::Grch37),
Expand Down
Loading