diff --git a/noodles b/noodles index cbfb804..bff28f9 160000 --- a/noodles +++ b/noodles @@ -1 +1 @@ -Subproject commit cbfb804255d08bbe085b7229c1bdc296ed49eeee +Subproject commit bff28f9699ab25c1cb6d3661f07ff1381c869f90 diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index 84758f8..b345e38 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -102,6 +102,32 @@ fn batch_lookup_query_plan_runs_and_preserves_requested_result_order() { assert!(stdout.contains("II")); } +#[test] +fn lookup_variant_details_returns_counts_and_decision_fields() { + let root = repo_root(); + let script = root.join("rust/bioscript-cli/tests/fixtures/lookup_details.py"); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(script) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("VariantObservation")); + assert!(stdout.contains("genotype='AG'")); + assert!(stdout.contains("raw_counts={")); + assert!(stdout.contains("decision=")); + assert!(stdout.contains("evidence=[")); +} + #[test] fn inspect_subcommand_reports_detected_vendor_and_platform() { let root = repo_root(); diff --git a/rust/bioscript-cli/tests/fixtures/lookup_details.py b/rust/bioscript-cli/tests/fixtures/lookup_details.py new file mode 100644 index 0000000..3d3cdac --- /dev/null +++ b/rust/bioscript-cli/tests/fixtures/lookup_details.py @@ -0,0 +1,17 @@ +VARIANT = bioscript.variant( + rsid="rs73885319", + grch38="22:36265860-36265860", + ref="A", + alt="G", + kind="snp", +) + + +def main(): + genotypes = bioscript.load_genotypes(input_file) + details = genotypes.lookup_variant_details(VARIANT) + print(details) + + +if __name__ == "__main__": + main() diff --git a/rust/bioscript-core/src/variant.rs b/rust/bioscript-core/src/variant.rs index d945a77..e31622d 100644 --- a/rust/bioscript-core/src/variant.rs +++ b/rust/bioscript-core/src/variant.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + #[derive(Debug, Clone, PartialEq, Eq)] pub struct GenomicLocus { pub chrom: String, @@ -41,6 +43,8 @@ pub struct VariantObservation { pub ref_count: Option, pub alt_count: Option, pub depth: Option, + pub raw_counts: BTreeMap, + pub decision: Option, pub evidence: Vec, } diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index e2345da..a40a44a 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -10,10 +10,7 @@ use noodles::{ fasta::{self, repository::adapters::IndexedReader as FastaIndexedReader}, sam::{ self, - alignment::{ - Record as _, - record::{Cigar as _, Sequence as _}, - }, + alignment::{Record as _, record::Cigar as _}, }, }; @@ -45,7 +42,6 @@ pub(crate) struct AlignmentRecord { pub start: i64, pub end: i64, pub is_unmapped: bool, - pub sequence: Vec, pub cigar: Vec, } @@ -103,7 +99,68 @@ where let selected_containers = select_query_containers(reader.index(), &header, ®ion)?; - stream_selected_containers( + stream_selected_alignment_records( + path, + &mut reader, + &header, + ®ion, + locus.end, + &selected_containers, + &mut on_record, + )?; + + Ok(()) +} + +pub(crate) fn for_each_raw_cram_record( + path: &Path, + options: &GenotypeLoadOptions, + reference_file: &Path, + locus: &GenomicLocus, + mut on_record: F, +) -> Result<(), RuntimeError> +where + F: FnMut(cram::Record<'_>) -> Result, +{ + let repository = build_reference_repository(reference_file)?; + let mut builder = + cram::io::indexed_reader::Builder::default().set_reference_sequence_repository(repository); + + if let Some(index_path) = options.input_index.as_ref() { + let index = crai::fs::read(index_path).map_err(|err| { + RuntimeError::Io(format!( + "failed to read CRAM index {} for {}: {err}", + index_path.display(), + path.display() + )) + })?; + builder = builder.set_index(index); + } + + let mut reader = builder.build_from_path(path).map_err(|err| { + RuntimeError::Io(format!( + "failed to open indexed CRAM {}: {err}", + path.display() + )) + })?; + + let header = reader.read_header().map_err(|err| { + RuntimeError::Io(format!( + "failed to read CRAM header {}: {err}", + path.display() + )) + })?; + + let region = build_region(&header, locus).ok_or_else(|| { + RuntimeError::Unsupported(format!( + "indexed CRAM does not contain contig {} for {}:{}-{}", + locus.chrom, locus.chrom, locus.start, locus.end + )) + })?; + + let selected_containers = select_query_containers(reader.index(), &header, ®ion)?; + + stream_selected_cram_records( path, &mut reader, &header, @@ -188,7 +245,7 @@ fn select_query_containers( .collect()) } -fn stream_selected_containers( +fn stream_selected_alignment_records( path: &Path, reader: &mut cram::io::indexed_reader::IndexedReader, header: &sam::Header, @@ -199,6 +256,32 @@ fn stream_selected_containers( ) -> Result<(), RuntimeError> where F: FnMut(AlignmentRecord) -> Result, +{ + stream_selected_cram_records( + path, + reader, + header, + region, + locus_end, + selected_containers, + &mut |record| { + let alignment_record = build_alignment_record_from_cram(path, &record)?; + on_record(alignment_record) + }, + ) +} + +fn stream_selected_cram_records( + path: &Path, + reader: &mut cram::io::indexed_reader::IndexedReader, + header: &sam::Header, + region: &Region, + locus_end: i64, + selected_containers: &[SelectedContainer], + on_record: &mut F, +) -> Result<(), RuntimeError> +where + F: FnMut(cram::Record<'_>) -> Result, { let interval = region.interval(); @@ -293,7 +376,7 @@ where return Ok(true); } - match on_record(alignment_record) { + match on_record(record.clone()) { Ok(true) => Ok(true), Ok(false) => { stop = true; @@ -351,7 +434,7 @@ where return Ok(true); } - match on_record(alignment_record) { + match on_record(record.clone()) { Ok(true) => Ok(true), Ok(false) => { stop = true; @@ -445,7 +528,6 @@ fn build_alignment_record_from_cram( None => start, }; - let sequence: Vec = record.sequence().iter().collect(); let cigar = record .cigar() .iter() @@ -463,7 +545,6 @@ fn build_alignment_record_from_cram( start, end, is_unmapped, - sequence, cigar, }) } diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index eed1489..74df992 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -1,5 +1,5 @@ use std::{ - collections::{BTreeSet, HashMap}, + collections::{BTreeMap, BTreeSet, HashMap}, fmt::Write as _, fs::File, io::{BufRead, BufReader}, @@ -8,6 +8,8 @@ use std::{ }; use noodles::bgzf; +use noodles::core::Position; +use noodles::sam::alignment::Record as _; use zip::ZipArchive; use bioscript_core::{ @@ -17,6 +19,8 @@ use bioscript_core::{ use crate::alignment::{self, AlignmentOpKind, AlignmentRecord}; const COMMENT_PREFIXES: [&str; 2] = ["#", "//"]; +const DEFAULT_MPILEUP_MIN_BASE_QUALITY: u8 = 13; +const DEFAULT_MPILEUP_MIN_MAPPING_QUALITY: u8 = 0; const RSID_ALIASES: &[&str] = &["rsid", "name", "snp", "marker", "id", "snpid"]; const CHROM_ALIASES: &[&str] = &["chromosome", "chr", "chrom"]; @@ -472,32 +476,19 @@ impl CramBackend { })?; let target_pos = locus.start; - let mut alt_count = 0u32; - let mut ref_count = 0u32; - let mut depth = 0u32; - - alignment::for_each_cram_record( + let pileup = observe_snp_pileup( &self.path, &self.options, reference_file, locus, - |record| { - if record.is_unmapped { - return Ok(true); - } - let Some(base) = base_at_position(&record, target_pos) else { - return Ok(true); - }; - let base = (base as char).to_ascii_uppercase(); - depth += 1; - if base == reference { - ref_count += 1; - } else if base == alternate { - alt_count += 1; - } - Ok(true) - }, + reference, + alternate, )?; + let ref_count = pileup.filtered_ref_count; + let alt_count = pileup.filtered_alt_count; + let depth = pileup.filtered_depth; + + let evidence = pileup.evidence_lines(&describe_locus(locus), target_pos); Ok(VariantObservation { backend: self.backend_name().to_owned(), @@ -507,10 +498,11 @@ impl CramBackend { ref_count: Some(ref_count), alt_count: Some(alt_count), depth: Some(depth), - evidence: vec![format!( - "observed SNP at {}:{} depth={} ref_count={} alt_count={}", - locus.chrom, target_pos, depth, ref_count, alt_count - )], + raw_counts: pileup.raw_base_counts, + decision: Some(describe_snp_decision_rule( + reference, alternate, ref_count, alt_count, depth, + )), + evidence, }) } @@ -562,6 +554,10 @@ impl CramBackend { ref_count: Some(ref_count), alt_count: Some(alt_count), depth: Some(depth), + raw_counts: BTreeMap::new(), + decision: Some(describe_copy_number_decision_rule( + &reference, &alternate, ref_count, alt_count, depth, + )), evidence: vec![format!( "observed deletion anchor {}:{} len={} depth={} ref_count={} alt_count={}", locus.chrom, anchor_pos, deletion_length, depth, ref_count, alt_count @@ -631,6 +627,10 @@ impl CramBackend { ref_count: Some(ref_count), alt_count: Some(alt_count), depth: Some(depth), + raw_counts: BTreeMap::new(), + decision: Some(describe_copy_number_decision_rule( + &reference, &alternate, ref_count, alt_count, depth, + )), evidence: vec![format!( "observed indel at {} depth={} ref_count={} alt_count={} matching_alt_lengths={}", describe_locus(locus), @@ -733,6 +733,25 @@ fn infer_snp_genotype( } } +fn describe_snp_decision_rule( + reference: char, + alternate: char, + ref_count: u32, + alt_count: u32, + depth: u32, +) -> String { + if depth == 0 { + return format!( + "no covering reads for SNP; genotype unresolved (ref={reference}, alt={alternate})" + ); + } + + let alt_fraction = f64::from(alt_count) / f64::from(depth); + format!( + "SNP genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" + ) +} + fn infer_copy_number_genotype( reference: &str, alternate: &str, @@ -753,6 +772,179 @@ fn infer_copy_number_genotype( } } +fn describe_copy_number_decision_rule( + reference: &str, + alternate: &str, + ref_count: u32, + alt_count: u32, + depth: u32, +) -> String { + if depth == 0 { + return format!( + "no covering reads for copy-number style variant; genotype unresolved (ref={reference}, alt={alternate})" + ); + } + + let alt_fraction = f64::from(alt_count) / f64::from(depth); + format!( + "copy-number genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}->{alternate}" + ) +} + +#[derive(Debug, Clone, Default)] +struct SnpPileupCounts { + filtered_depth: u32, + filtered_ref_count: u32, + filtered_alt_count: u32, + filtered_base_counts: BTreeMap, + raw_depth: u32, + raw_ref_count: u32, + raw_alt_count: u32, + raw_base_counts: BTreeMap, + filtered_low_base_quality: u32, + filtered_low_mapping_quality: u32, + filtered_non_acgt: u32, + filtered_unmapped: u32, + filtered_secondary: u32, + filtered_qc_fail: u32, + filtered_duplicate: u32, + filtered_improper_pair: u32, + raw_forward_counts: BTreeMap, + raw_reverse_counts: BTreeMap, +} + +impl SnpPileupCounts { + fn evidence_lines(&self, locus: &str, target_pos: i64) -> Vec { + vec![ + format!( + "observed SNP pileup at {locus} target_pos={target_pos} filtered_depth={} ref_count={} alt_count={}", + self.filtered_depth, self.filtered_ref_count, self.filtered_alt_count + ), + format!( + "raw pileup depth={} ref_count={} alt_count={} raw_counts={:?}", + self.raw_depth, self.raw_ref_count, self.raw_alt_count, self.raw_base_counts + ), + format!( + "raw strand counts: forward={:?} reverse={:?}", + self.raw_forward_counts, self.raw_reverse_counts + ), + format!( + "filters applied: min_base_quality={} min_mapping_quality={} filtered_low_base_quality={} filtered_low_mapping_quality={} filtered_non_acgt={} filtered_unmapped={} filtered_secondary={} filtered_qc_fail={} filtered_duplicate={} filtered_improper_pair={}", + DEFAULT_MPILEUP_MIN_BASE_QUALITY, + DEFAULT_MPILEUP_MIN_MAPPING_QUALITY, + self.filtered_low_base_quality, + self.filtered_low_mapping_quality, + self.filtered_non_acgt, + self.filtered_unmapped, + self.filtered_secondary, + self.filtered_qc_fail, + self.filtered_duplicate, + self.filtered_improper_pair + ), + ] + } +} + +fn observe_snp_pileup( + cram_path: &Path, + options: &GenotypeLoadOptions, + reference_file: &Path, + locus: &GenomicLocus, + reference: char, + alternate: char, +) -> Result { + let mut counts = SnpPileupCounts::default(); + let target_position = Position::try_from(usize::try_from(locus.start).map_err(|_| { + RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()) + })?) + .map_err(|_| RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()))?; + let reference_base = reference as u8; + + alignment::for_each_raw_cram_record(cram_path, options, reference_file, locus, |record| { + let flags = record + .flags() + .map_err(|err| RuntimeError::Io(format!("failed to read CRAM flags: {err}")))?; + if flags.is_unmapped() { + counts.filtered_unmapped += 1; + return Ok(true); + } + if flags.is_secondary() { + counts.filtered_secondary += 1; + return Ok(true); + } + if flags.is_qc_fail() { + counts.filtered_qc_fail += 1; + return Ok(true); + } + if flags.is_duplicate() { + counts.filtered_duplicate += 1; + return Ok(true); + } + if flags.is_segmented() && !flags.is_properly_segmented() { + counts.filtered_improper_pair += 1; + return Ok(true); + } + + let Some((base, base_quality)) = + record.base_quality_at_reference_position(target_position, reference_base) + else { + return Ok(true); + }; + + let normalized_base = normalize_pileup_base(base); + record.mapping_quality().transpose().map_err(|err| { + RuntimeError::Io(format!("failed to read CRAM mapping quality: {err}")) + })?; + let is_reverse = flags.is_reverse_complemented(); + if let Some(base) = normalized_base { + counts.raw_depth += 1; + *counts.raw_base_counts.entry(base.to_string()).or_insert(0) += 1; + let strand_counts = if is_reverse { + &mut counts.raw_reverse_counts + } else { + &mut counts.raw_forward_counts + }; + *strand_counts.entry(base.to_string()).or_insert(0) += 1; + if base == reference { + counts.raw_ref_count += 1; + } else if base == alternate { + counts.raw_alt_count += 1; + } + } + + if base_quality < DEFAULT_MPILEUP_MIN_BASE_QUALITY { + counts.filtered_low_base_quality += 1; + return Ok(true); + } + + let Some(base) = normalized_base else { + counts.filtered_non_acgt += 1; + return Ok(true); + }; + + counts.filtered_depth += 1; + *counts + .filtered_base_counts + .entry(base.to_string()) + .or_insert(0) += 1; + if base == reference { + counts.filtered_ref_count += 1; + } else if base == alternate { + counts.filtered_alt_count += 1; + } + Ok(true) + })?; + + Ok(counts) +} + +fn normalize_pileup_base(base: u8) -> Option { + match (base as char).to_ascii_uppercase() { + 'A' | 'C' | 'G' | 'T' => Some((base as char).to_ascii_uppercase()), + _ => None, + } +} + #[derive(Debug, Clone, Copy)] struct IndelClassification { covering: bool, @@ -765,40 +957,6 @@ fn len_as_i64(len: usize) -> Option { i64::try_from(len).ok() } -fn base_at_position(record: &AlignmentRecord, target_pos: i64) -> Option { - let mut ref_pos = record.start; - let mut read_pos = 0usize; - - for op in &record.cigar { - match op.kind { - AlignmentOpKind::Match - | AlignmentOpKind::SequenceMatch - | AlignmentOpKind::SequenceMismatch => { - let op_len = len_as_i64(op.len)?; - if target_pos >= ref_pos && target_pos < ref_pos + op_len { - let offset = usize::try_from(target_pos - ref_pos).ok()?; - return record.sequence.get(read_pos + offset).copied(); - } - ref_pos += op_len; - read_pos += op.len; - } - AlignmentOpKind::Insertion | AlignmentOpKind::SoftClip => { - read_pos += op.len; - } - AlignmentOpKind::Deletion | AlignmentOpKind::Skip => { - let op_len = len_as_i64(op.len)?; - if target_pos >= ref_pos && target_pos < ref_pos + op_len { - return None; - } - ref_pos += op_len; - } - AlignmentOpKind::HardClip | AlignmentOpKind::Pad => {} - } - } - - None -} - fn spans_position(record: &AlignmentRecord, pos: i64) -> bool { pos >= record.start.saturating_sub(1) && pos <= record.end } @@ -1664,6 +1822,9 @@ fn vcf_reference_token(reference: &str, alternates: &[&str]) -> String { let mut saw_longer = false; for alt in alternates { + if is_symbolic_vcf_alt(alt) { + continue; + } match alt.len().cmp(&reference.len()) { std::cmp::Ordering::Less => saw_shorter = true, std::cmp::Ordering::Greater => saw_longer = true, @@ -1679,6 +1840,9 @@ fn vcf_reference_token(reference: &str, alternates: &[&str]) -> String { } fn vcf_alt_token(reference: &str, alternate: &str) -> String { + if is_symbolic_vcf_alt(alternate) { + return "--".to_owned(); + } match alternate.len().cmp(&reference.len()) { std::cmp::Ordering::Less => "D".to_owned(), std::cmp::Ordering::Greater => "I".to_owned(), @@ -1686,6 +1850,11 @@ fn vcf_alt_token(reference: &str, alternate: &str) -> String { } } +fn is_symbolic_vcf_alt(alternate: &str) -> bool { + let trimmed = alternate.trim(); + trimmed.starts_with('<') && trimmed.ends_with('>') +} + fn normalize_sequence_token(value: &str) -> String { value.trim().to_ascii_uppercase() } diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index cf0b3cd..ff0aada 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -526,6 +526,10 @@ fn detect_source( vendor = Some("Sequencing.com".to_owned()); confidence = DetectionConfidence::WeakHeuristic; evidence.push("sequencing.com header text".to_owned()); + } else if normalized.contains("carigenetics") || normalized.contains("cari genetics") { + vendor = Some("CariGenetics".to_owned()); + confidence = DetectionConfidence::StrongHeuristic; + evidence.push("CariGenetics path/header text".to_owned()); } vendor.map(|vendor| SourceMetadata { @@ -571,11 +575,21 @@ fn canonicalize_ancestry_version(value: &str) -> String { fn detect_assembly(lower_name: &str, sample_lines: &[String]) -> Option { let header = sample_lines.join("\n").to_ascii_lowercase(); let combined = format!("{lower_name}\n{header}"); - if combined.contains("build 38") || combined.contains("grch38") || combined.contains("hg38") { + let looks_like_grch38 = combined.contains("build 38") + || combined.contains("grch38") + || combined.contains("hg38") + || combined.contains("gca_000001405.15") + || combined.contains("grch38_no_alt_analysis_set") + || combined.contains("##contig="); + + if looks_like_grch38 { Some(Assembly::Grch38) } else if combined.contains("build 37") || combined.contains("grch37") || combined.contains("hg19") + || combined.contains("assembly=b37") + || combined.contains("assembly=\"b37\"") + || combined.contains("human_g1k_v37") || combined.contains("37.1") { Some(Assembly::Grch37) diff --git a/rust/bioscript-formats/tests/file_formats.rs b/rust/bioscript-formats/tests/file_formats.rs index 74b73dc..f060935 100644 --- a/rust/bioscript-formats/tests/file_formats.rs +++ b/rust/bioscript-formats/tests/file_formats.rs @@ -245,6 +245,25 @@ fn vcf_variant_lookup_reads_single_sample_calls() { assert_eq!(observation.genotype.as_deref(), Some("DI")); } +#[test] +fn vcf_variant_lookup_ignores_symbolic_non_ref_alt_when_decoding_gt() { + let dir = temp_dir("vcf-non-ref"); + let vcf_path = dir.join("sample.g.vcf"); + fs::write( + &vcf_path, + "##fileformat=VCFv4.2\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +6\t39016636\trs10305420\tC\tT,\t.\tPASS\t.\tGT\t0/1\n\ +6\t38979128\trs9357296\tA\tG,\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let store = GenotypeStore::from_file(&vcf_path).unwrap(); + assert_eq!(store.get("rs10305420").unwrap().as_deref(), Some("CT")); + assert_eq!(store.get("rs9357296").unwrap().as_deref(), Some("AG")); +} + struct CramFixture { cram: PathBuf, reference: PathBuf, @@ -577,3 +596,61 @@ fn cram_md5_mismatch_is_tolerated_and_returns_correct_result() { "depth {depth_i32} differs from samtools mpileup depth {samtools_depth} by >6" ); } + +#[test] +fn cram_rs9357296_reports_heterozygous_counts_for_na06985() { + let Some(fx) = cram_fixture_or_skip("cram_rs9357296_reports_heterozygous_counts_for_na06985") + else { + return; + }; + let store = open_cram_store(&fx); + + let observation = store + .lookup_variant(&VariantSpec { + rsids: vec!["rs9357296".to_owned()], + grch38: Some(bioscript_core::GenomicLocus { + chrom: "6".to_owned(), + start: 39_011_352, + end: 39_011_352, + }), + reference: Some("A".to_owned()), + alternate: Some("G".to_owned()), + kind: Some(VariantKind::Snp), + ..VariantSpec::default() + }) + .expect("rs9357296 lookup"); + + assert_eq!(observation.backend, "cram"); + assert_eq!(observation.genotype.as_deref(), Some("AG")); + assert_eq!(observation.raw_counts.get("A").copied(), Some(18)); + assert_eq!(observation.raw_counts.get("G").copied(), Some(12)); + assert_eq!(observation.raw_counts.get("T").copied(), None); + assert_eq!(observation.depth, Some(29)); + assert_eq!(observation.ref_count, Some(17)); + assert_eq!(observation.alt_count, Some(12)); + assert!( + observation + .decision + .as_deref() + .is_some_and(|text| text.contains("alt_fraction=0.414")), + "missing SNP decision summary: {:?}", + observation.decision + ); + assert!( + observation + .evidence + .iter() + .any(|line| line.contains("raw pileup depth=30")), + "missing raw pileup evidence: {:?}", + observation.evidence + ); + assert!( + observation.evidence.iter().any(|line| { + line.contains("filtered_duplicate=4") + && line.contains("filtered_low_base_quality=1") + && line.contains("filtered_improper_pair=0") + }), + "missing mpileup-style filter evidence: {:?}", + observation.evidence + ); +} diff --git a/rust/bioscript-formats/tests/inspect.rs b/rust/bioscript-formats/tests/inspect.rs index 61fab20..185ccce 100644 --- a/rust/bioscript-formats/tests/inspect.rs +++ b/rust/bioscript-formats/tests/inspect.rs @@ -392,3 +392,62 @@ fn mixed_vcf_sampled_rows_prefer_true_when_pipe_is_seen() { assert_eq!(inspection.detected_kind, DetectedKind::Vcf); assert_eq!(inspection.phased, Some(true)); } + +#[test] +fn vcf_b37_contig_headers_report_grch37_assembly() { + let dir = temp_dir("b37-vcf"); + let path = dir.join("sample.vcf"); + std::fs::write( + &path, + "##fileformat=VCFv4.2\n\ +##contig=\n\ +##SentieonCommandLine.Haplotyper=\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +1\t100\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.assembly, Some(Assembly::Grch37)); +} + +#[test] +fn vcf_grch38_reference_token_reports_grch38_assembly() { + let dir = temp_dir("grch38-vcf-reference"); + let path = dir.join("sample.vcf"); + std::fs::write( + &path, + "##fileformat=VCFv4.2\n\ +##reference=GCA_000001405.15_GRCh38_no_alt_analysis_set.fa\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +chr1\t100\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.assembly, Some(Assembly::Grch38)); +} + +#[test] +fn vcf_grch38_contig_lengths_report_grch38_assembly() { + let dir = temp_dir("grch38-vcf-contigs"); + let path = dir.join("sample.vcf"); + std::fs::write( + &path, + "##fileformat=VCFv4.2\n\ +##contig=\n\ +##contig=\n\ +##FORMAT=\n\ +#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ +chr1\t100\trs1\tA\tG\t.\tPASS\t.\tGT\t0/1\n", + ) + .unwrap(); + + let inspection = inspect_file(&path, &InspectOptions::default()).unwrap(); + assert_eq!(inspection.detected_kind, DetectedKind::Vcf); + assert_eq!(inspection.assembly, Some(Assembly::Grch38)); +} diff --git a/rust/bioscript-runtime/src/runtime.rs b/rust/bioscript-runtime/src/runtime.rs index d45b86b..667e67c 100644 --- a/rust/bioscript-runtime/src/runtime.rs +++ b/rust/bioscript-runtime/src/runtime.rs @@ -272,9 +272,15 @@ impl BioscriptRuntime { ("Bioscript", "write_text") => self.method_write_text(args, kwargs), ("GenotypeFile", "get") => self.method_genotype_get(args, kwargs), ("GenotypeFile", "lookup_variant") => self.method_genotype_lookup_variant(args, kwargs), + ("GenotypeFile", "lookup_variant_details") => { + self.method_genotype_lookup_variant_details(args, kwargs) + } ("GenotypeFile", "lookup_variants") => { self.method_genotype_lookup_variants(args, kwargs) } + ("GenotypeFile", "lookup_variants_details") => { + self.method_genotype_lookup_variants_details(args, kwargs) + } _ => Err(RuntimeError::Unsupported(format!( "'{class_name}' object has no attribute '{method_name}'" ))), @@ -413,6 +419,39 @@ impl BioscriptRuntime { }) } + fn method_genotype_lookup_variant_details( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "GenotypeFile.lookup_variant_details")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "GenotypeFile.lookup_variant_details expects self and variant".to_owned(), + )); + } + let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; + let spec = dataclass_to_variant_spec(&args[1])?; + let guard = self + .state + .genotype_files + .lock() + .expect("genotype mutex poisoned"); + let Some(store) = guard.get(&handle) else { + return Err(RuntimeError::InvalidArguments(format!( + "unknown genotype handle: {handle}" + ))); + }; + let observation = store.lookup_variant(&spec)?; + self.record_timing( + "lookup_variant_details", + started.elapsed(), + format!("rsids={}", spec.rsids.join("|")), + ); + Ok(variant_observation_object(&observation)) + } + fn method_genotype_lookup_variants( &self, args: &[MontyObject], @@ -454,6 +493,44 @@ impl BioscriptRuntime { )) } + fn method_genotype_lookup_variants_details( + &self, + args: &[MontyObject], + kwargs: &[(MontyObject, MontyObject)], + ) -> Result { + let started = Instant::now(); + reject_kwargs(kwargs, "GenotypeFile.lookup_variants_details")?; + if args.len() != 2 { + return Err(RuntimeError::InvalidArguments( + "GenotypeFile.lookup_variants_details expects self and a variant plan".to_owned(), + )); + } + let handle = dataclass_handle_id(&args[0], "GenotypeFile")?; + let specs = variant_specs_from_plan(&args[1])?; + let guard = self + .state + .genotype_files + .lock() + .expect("genotype mutex poisoned"); + let Some(store) = guard.get(&handle) else { + return Err(RuntimeError::InvalidArguments(format!( + "unknown genotype handle: {handle}" + ))); + }; + let observations = store.lookup_variants(&specs)?; + self.record_timing( + "lookup_variants_details", + started.elapsed(), + format!("count={}", specs.len()), + ); + Ok(MontyObject::List( + observations + .iter() + .map(variant_observation_object) + .collect(), + )) + } + fn method_write_tsv( &self, args: &[MontyObject], @@ -811,6 +888,109 @@ fn variant_plan_object(variants: &[VariantSpec]) -> MontyObject { } } +fn variant_observation_object(observation: &bioscript_core::VariantObservation) -> MontyObject { + let mut attrs = vec![ + ( + MontyObject::String("backend".to_owned()), + MontyObject::String(observation.backend.clone()), + ), + ( + MontyObject::String("matched_rsid".to_owned()), + match &observation.matched_rsid { + Some(value) => MontyObject::String(value.clone()), + None => MontyObject::None, + }, + ), + ( + MontyObject::String("assembly".to_owned()), + match observation.assembly { + Some(assembly) => MontyObject::String(match assembly { + bioscript_core::Assembly::Grch37 => "grch37".to_owned(), + bioscript_core::Assembly::Grch38 => "grch38".to_owned(), + }), + None => MontyObject::None, + }, + ), + ( + MontyObject::String("genotype".to_owned()), + match &observation.genotype { + Some(value) => MontyObject::String(value.clone()), + None => MontyObject::None, + }, + ), + ( + MontyObject::String("ref_count".to_owned()), + observation.ref_count.map_or(MontyObject::None, |value| { + MontyObject::Int(i64::from(value)) + }), + ), + ( + MontyObject::String("alt_count".to_owned()), + observation.alt_count.map_or(MontyObject::None, |value| { + MontyObject::Int(i64::from(value)) + }), + ), + ( + MontyObject::String("depth".to_owned()), + observation.depth.map_or(MontyObject::None, |value| { + MontyObject::Int(i64::from(value)) + }), + ), + ( + MontyObject::String("decision".to_owned()), + match &observation.decision { + Some(value) => MontyObject::String(value.clone()), + None => MontyObject::None, + }, + ), + ( + MontyObject::String("raw_counts".to_owned()), + MontyObject::Dict( + observation + .raw_counts + .iter() + .map(|(base, count)| { + ( + MontyObject::String(base.clone()), + MontyObject::Int(i64::from(*count)), + ) + }) + .collect(), + ), + ), + ( + MontyObject::String("evidence".to_owned()), + MontyObject::List( + observation + .evidence + .iter() + .cloned() + .map(MontyObject::String) + .collect(), + ), + ), + ]; + + MontyObject::Dataclass { + name: "VariantObservation".to_owned(), + type_id: 5, + field_names: vec![ + "backend".to_owned(), + "matched_rsid".to_owned(), + "assembly".to_owned(), + "genotype".to_owned(), + "ref_count".to_owned(), + "alt_count".to_owned(), + "depth".to_owned(), + "decision".to_owned(), + "raw_counts".to_owned(), + "evidence".to_owned(), + ], + attrs: attrs.drain(..).collect(), + frozen: true, + } +} + fn dataclass_handle_id(obj: &MontyObject, expected_name: &str) -> Result { match obj { MontyObject::Dataclass { name, attrs, .. } if name == expected_name => {