diff --git a/docs/examples/abcg2-rs2231142-pgx.yaml b/docs/examples/abcg2-rs2231142-pgx.yaml index b7131f4..b56d149 100644 --- a/docs/examples/abcg2-rs2231142-pgx.yaml +++ b/docs/examples/abcg2-rs2231142-pgx.yaml @@ -1,6 +1,11 @@ schema: "bioscript:variant" version: "1.0" variant_id: "ABCG2_rs2231142" +tags: + - "gene:abcg2" + - "domain:pgx" + - "input:vcf" + - "input:cram" label: "ABCG2 rs2231142" gene: "ABCG2" summary: "ABCG2 SNV locus with pharmacogenomic interpretation" @@ -26,16 +31,6 @@ alleles: - "T" canonical_alt: "A" -research: - tasks: - - "Verify canonical dbSNP, ClinVar, and Ensembl annotations" - - "Collect PharmGKB and disease-association references for ABCG2" - tags: - - "abcg2" - - "pgx" - - "snv" - - "rs2231142" - clinical: pgx: gene: "ABCG2" diff --git a/docs/panel-schema.md b/docs/panel-schema.md new file mode 100644 index 0000000..708daee --- /dev/null +++ b/docs/panel-schema.md @@ -0,0 +1,109 @@ +# Panel Schema + +Use a panel when you want one manifest that points to a curated set of runnable variant records. + +Right now the Rust runner supports variant members directly. Keep the shape simple. + +## Schema Identity + +```yaml +schema: "bioscript:panel:1.0" +version: "1.0" +``` + +## Minimal Shape + +```yaml +schema: "bioscript:panel:1.0" +version: "1.0" +name: "traits-common" +label: "Common Traits" +tags: + - "type:trait" + +members: + - kind: "variant" + path: "variants/rs671.yaml" + version: "1.0" + - kind: "variant" + path: "variants/rs713598.yaml" + version: "1.0" +``` + +## Purpose + +A panel is: + +- a selection manifest +- a stable name for a bundle of variants +- something the Rust `bioscript` command can run directly + +It is not: + +- a full remote package manager +- a replacement for richer assay manifests + +## Members + +Each member must currently be: + +```yaml +- kind: "variant" + path: "variants/rs671.yaml" + version: "1.0" +``` + +Rules: + +- `kind` is required +- exactly one of `path` or `download` is required +- current runner support is `variant` members only +- `version` is recommended for local members +- `sha256` is optional for local members + +## Permissions And Downloads + +Panels may declare remote downloads up front even if the current runner only executes local members. + +```yaml +permissions: + domains: + - "https://example.org" + +downloads: + - id: "remote-rs671" + url: "https://example.org/variants/rs671.yaml" + sha256: "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + version: "1.0" +``` + +Validation rules: + +- `permissions.domains` entries must be origins only +- every `downloads[*].url` origin must also appear in `permissions.domains` +- `downloads[*].sha256` must be a 64-character lowercase hex digest + +This keeps host approval focused on which remote origins may be contacted. + +## Running Panels + +Examples: + +```bash +bioscript panel.yaml --input-file sample.txt --output-file output.tsv +bioscript panel.yaml --input-file sample.txt --filter tag=type:trait +bioscript panel.yaml --input-file sample.txt --filter name=rs671 +``` + +Current filter keys: + +- `kind` +- `name` +- `path` +- `tag` + +## Future Relationship To Catalogues And Assays + +- `panel` is the small runnable collection manifest +- `catalogue` can later become a larger published index over many panels and items +- `assay` can later become the richer multi-file runnable bundle with declared assets and overrides diff --git a/docs/panel-schema.yaml b/docs/panel-schema.yaml new file mode 100644 index 0000000..a35dcd3 --- /dev/null +++ b/docs/panel-schema.yaml @@ -0,0 +1,14 @@ +schema: "bioscript:panel:1.0" +version: "1.0" +name: "traits-common" +label: "Common Traits" +tags: + - "type:trait" + +members: + - kind: "variant" + path: "variants/rs671.yaml" + version: "1.0" + - kind: "variant" + path: "variants/rs713598.yaml" + version: "1.0" diff --git a/docs/variant-catalogue.md b/docs/variant-catalogue.md deleted file mode 100644 index 09eb2c4..0000000 --- a/docs/variant-catalogue.md +++ /dev/null @@ -1,71 +0,0 @@ -# Variant Catalogue Schema - -Use a catalogue when you want to manage many `bioscript:variant` files together as a panel. - -This keeps each variant small and self-contained while giving you one manifest for: - -- grouping -- version pinning -- caching -- batch validation -- panel execution - -## Recommended Shape - -```yaml -schema: "bioscript:catalogue" -version: "1.0" -catalogue_id: "apol1-panel" -label: "APOL1 panel" - -variants: - - id: "APOL1_G1_rs73885319" - path: "variants/apol1/rs73885319.yaml" - version: "1.0" - - id: "APOL1_G1_rs60910145" - path: "variants/apol1/rs60910145.yaml" - version: "1.0" - - id: "APOL1_G2" - path: "variants/apol1/g2.yaml" - version: "1.0" -``` - -## Why This Shape - -- `path` is easy to load locally -- `id` lets you refer to a variant without reparsing the path -- `version` lets you cache and pin a known record revision - -## Optional Extensions - -Later you can add: - -- `sha256` -- `source_url` -- `tags` -- `groups` -- `assemblies` - -Example: - -```yaml -schema: "bioscript:catalogue" -version: "1.0" -catalogue_id: "pgx-core" - -variants: - - id: "ABCG2_rs2231142" - path: "variants/abcg2/rs2231142.yaml" - version: "1.0" - tags: - - "pgx" - - "statin" -``` - -## Recommended Workflow - -1. Keep one variant per file. -2. Keep those files in a predictable directory layout. -3. Use a catalogue file to define the panel. -4. Validate both the catalogue and each referenced variant. -5. Cache by `id + version`. diff --git a/docs/variant-schema.md b/docs/variant-schema.md index 94b1718..a9673b3 100644 --- a/docs/variant-schema.md +++ b/docs/variant-schema.md @@ -6,49 +6,48 @@ Use it as the canonical stored form. Keep it small enough to: - validate cleanly - generate `bioscript.variant(...)` calls -- enrich later with external research if needed +- carry variant identity plus typed findings ## Schema Identity ```yaml -schema: "bioscript:variant" +schema: "bioscript:variant:1.0" version: "1.0" ``` -`bioscript:variant` is a reasonable name for this object. It is explicit, stable, and leaves room for later schema types like `bioscript:panel` or `bioscript:report`. - ## Minimal Shape ```yaml -schema: "bioscript:variant" +schema: "bioscript:variant:1.0" version: "1.0" -variant_id: "APOL1_G1_rs73885319" +name: "traits-common-rs671-G-A" +tags: + - "type:trait" identifiers: rsids: - - "rs73885319" + - "rs671" coordinates: grch37: - chrom: "22" - pos: 36661906 + chrom: "12" + pos: 112241766 grch38: - chrom: "22" - pos: 36265860 + chrom: "12" + pos: 111803962 alleles: kind: "snv" - ref: "A" + ref: "G" alts: - - "G" - canonical_alt: "G" + - "A" ``` ## Required Fields - `schema` - `version` -- `variant_id` +- `name` - `alleles.kind` - `alleles.ref` - `alleles.alts` @@ -58,6 +57,54 @@ At least one of these must also exist: - `identifiers` - `coordinates` +## Top-Level `tags` + +Optional free-form classification tags for filtering and compatibility hints. + +Example: + +```yaml +tags: + - "type:trait" + - "validated:23andme:v5" +``` + +Guidelines: + +- use simple lowercase strings +- keep the vocabulary small +- `type:trait` is the main broadly useful tag for common trait records +- only add `validated:*` tags after direct assay test runs against local test data +- absence of a `validated:*` tag does not mean unsupported + +## Validation Rules + +- `schema` must be `bioscript:variant:1.0` +- `version` must be `1.0` +- `name` is required +- `identifiers.rsids` and `identifiers.aliases`, if present, must look like `rs123` +- `coordinates.*.chrom` must be one of: + - `1` through `22` + - `X` + - `Y` + - `MT` +- `coordinates.*` must use either: + - `pos` + - or `start` and `end` +- `pos`, `start`, and `end` must be integers +- `start` and `end` must be `>= 1` +- `end` must be `>= start` +- `alleles.kind` must be one of: + - `snv` + - `deletion` + - `insertion` + - `indel` +- stored allele values must be biological alleles, not symbolic `I` / `D` +- for `kind: snv`, `ref` and every `alt` must be single-base `A`, `C`, `G`, or `T` +- each finding must have its own `schema` +- if a finding has `alt`, it must exist in `alleles.alts` +- provenance URLs must be valid `http` or `https` URLs + ## Core Model ### `identifiers` @@ -69,16 +116,12 @@ Current fields: - `rsids` - `aliases` -Each is a simple string list. - Example: ```yaml identifiers: rsids: - "rs71785313" - - "rs1317778148" - - "rs143830837" aliases: [] ``` @@ -91,29 +134,27 @@ Supported assemblies now: - `grch37` - `grch38` -Coordinate shape: +Coordinate shapes: ```yaml coordinates: grch38: chrom: "22" - start: 36266000 - end: 36266005 + pos: 36265860 ``` -For single-base variants, you may use `pos` instead of `start` and `end`: - ```yaml coordinates: grch38: - chrom: "4" - pos: 88131171 + chrom: "22" + start: 36266000 + end: 36266005 ``` Rules: -- use `pos` for single-base sites when you want the compact form -- use `start` and `end` for spans such as deletions +- use `pos` for single-base sites +- use `start` and `end` for spans such as deletions or complex indels - do not include HGVS here ### `alleles` @@ -122,16 +163,17 @@ The biological allele definition. Fields: -- `kind`: `snv | deletion | insertion | indel | other` +- `kind`: `snv | deletion | insertion | indel` - `ref` - `alts` -- `canonical_alt` optional - `deletion_length` optional - `insertion_sequence` optional - `motifs` optional - `equivalent_coordinates` optional - `notes` optional +Stored YAML should describe the biological allele. Do not use symbolic `I` / `D` allele values in this schema. + Example SNV: ```yaml @@ -142,152 +184,69 @@ alleles: - "A" - "C" - "T" - canonical_alt: "A" ``` -Example deletion: +### `findings` + +Optional typed interpretation records attached to the variant. + +Each finding is a small object with its own `schema`. Parsers may validate schemas they understand and ignore schemas they do not understand. + +Minimal finding envelope: ```yaml -alleles: - kind: "deletion" - ref: "I" - alts: - - "D" - canonical_alt: "D" - deletion_length: 6 - motifs: - - "TTATAA" - - "ATAATT" - equivalent_coordinates: - grch37: - - "22:36662042-36662047" - - "22:36662046-36662051" - grch38: - - "22:36265996-36266001" - - "22:36266000-36266005" +findings: + - schema: "bioscript:trait:1.0" + alt: "A" + summary: "Associated with alcohol flushing after alcohol exposure." ``` -`canonical_alt` means: this is the specific alternate allele the file is primarily about. +Envelope fields: -If the file is only a locus-level record and you do not want to choose one allele, omit `canonical_alt`. +- `schema` required +- `alt` optional, but required for allele-specific findings +- `label` optional +- `summary` optional +- `notes` optional + +Unknown finding schemas are allowed. ## Optional Metadata These fields are optional and can be added without changing the core shape: +- `tags` - `label` - `gene` - `summary` -- `research` -- `clinical` +- `findings` +- `provenance` -## `research` +## `provenance` -Optional instructions or tags for enrichment tooling. - -```yaml -research: - tasks: - - "Confirm canonical dbSNP aliases" - - "Find ClinVar and Ensembl links" - tags: - - "apol1" - - "pgx" -``` - -`tasks` is for downstream research automation, not for runtime execution. - -## `clinical` - -Optional clinical interpretation metadata. - -Current supported optional domain: - -- `clinical.pgx` +Optional source metadata for the variant record and its findings. Example: ```yaml -clinical: - pgx: - gene: "ABCG2" - source: "ClinPGx" - variant_page_url: "https://www.clinpgx.org/variant/PA166156544/labelAnnotation" - drug_labels: - - source: "HCSC" - title: "Annotation of HCSC Label for rosuvastatin and ABCG2, SLCO1B1" - genes: - - "ABCG2" - - "SLCO1B1" - drugs: - - "rosuvastatin" - pgx_level: "Actionable PGx" - actionable: true - tags: - - "Dosing Info" - - "Prescribing Info" +provenance: + sources: + - kind: "database" + label: "dbSNP" + url: "https://www.ncbi.nlm.nih.gov/snp/rs671" + fields: + - "identifiers.rsids" + - "coordinates.grch37" + - "coordinates.grch38" + - "alleles" ``` ## Deliberately Out Of Core Schema These are intentionally not part of the compact core schema right now: -- `hgvs_genomic` - transcript HGVS - protein HGVS +- domain-specific nested blocks like `clinical` -Those can usually be derived or enriched later. They are useful annotation data, but they make the base files noisier and harder to validate. - -## Mapping To `bioscript.variant(...)` - -This YAML: - -```yaml -schema: "bioscript:variant" -version: "1.0" -variant_id: "APOL1_G2" -label: "APOL1 G2 deletion" -gene: "APOL1" - -identifiers: - rsids: - - "rs71785313" - - "rs1317778148" - - "rs143830837" - -coordinates: - grch37: - chrom: "22" - start: 36662046 - end: 36662051 - grch38: - chrom: "22" - start: 36266000 - end: 36266005 - -alleles: - kind: "deletion" - ref: "I" - alts: - - "D" - canonical_alt: "D" - deletion_length: 6 - motifs: - - "TTATAA" - - "ATAATT" -``` - -maps cleanly to: - -```python -bioscript.variant( - rsid=["rs71785313", "rs1317778148", "rs143830837"], - grch37="22:36662046-36662051", - grch38="22:36266000-36266005", - ref="I", - alt="D", - kind="deletion", - deletion_length=6, - motifs=["TTATAA", "ATAATT"], -) -``` +Those can be expressed as typed findings instead. diff --git a/docs/variant-schema.yaml b/docs/variant-schema.yaml index 7c1b298..1992b30 100644 --- a/docs/variant-schema.yaml +++ b/docs/variant-schema.yaml @@ -1,74 +1,56 @@ -schema: "bioscript:variant" +schema: "bioscript:variant:1.0" version: "1.0" -variant_id: "string" +name: "string" +tags: + - "type:trait" + - "validated:23andme:v5" label: "string" gene: "string" summary: "string" identifiers: rsids: - - "string" + - "rs123" aliases: - - "string" + - "rs456" coordinates: grch37: - chrom: "string" - pos: 0 + chrom: "1" + pos: 12345 grch38: - chrom: "string" - start: 0 - end: 0 + chrom: "1" + start: 12345 + end: 12346 alleles: kind: "snv" - ref: "string" + ref: "A" alts: - - "string" - canonical_alt: "string" - deletion_length: 0 - insertion_sequence: "string" + - "G" + deletion_length: 1 + insertion_sequence: "AT" motifs: - - "string" + - "AT" equivalent_coordinates: grch37: - - "string" + - "1:12345-12346" grch38: - - "string" + - "1:12346-12347" notes: - "string" -research: - tasks: - - "string" - tags: - - "string" +findings: + - schema: "bioscript:trait:1.0" + alt: "G" + label: "string" + summary: "string" + notes: "string" -clinical: - pgx: - gene: "string" - source: "ClinPGx" - variant_page_url: "https://example.org" - drug_labels: - - source: "FDA" - title: "string" - genes: - - "string" - drugs: - - "string" - pgx_level: "Actionable PGx" - actionable: true - tags: - - "Dosing Info" - annotation_url: "https://example.org" - note: "string" - clinical_annotations: - - source: "CPIC" - title: "string" - genes: - - "string" - drugs: - - "string" - level: "string" - actionable: true - annotation_url: "https://example.org" +provenance: + sources: + - kind: "database" + label: "string" + url: "https://example.org" + fields: + - "alleles" diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 055f9fd..32f5047 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -104,10 +104,12 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" name = "bioscript-cli" version = "0.1.0" dependencies = [ + "bioscript-core", "bioscript-formats", "bioscript-runtime", "bioscript-schema", "monty", + "serde_yaml", "zip", ] @@ -150,7 +152,9 @@ dependencies = [ name = "bioscript-schema" version = "0.1.0" dependencies = [ + "bioscript-core", "serde_yaml", + "url", ] [[package]] @@ -561,6 +565,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + [[package]] name = "funty" version = "2.0.0" @@ -712,6 +725,109 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -901,6 +1017,12 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + [[package]] name = "lock_api" version = "0.4.14" @@ -1140,6 +1262,12 @@ dependencies = [ "indexmap", ] +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + [[package]] name = "phf" version = "0.11.3" @@ -1197,6 +1325,15 @@ dependencies = [ "serde", ] +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -1648,6 +1785,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tap" version = "1.0.1" @@ -1700,6 +1848,16 @@ dependencies = [ "syn", ] +[[package]] +name = "tinystr" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinyvec" version = "1.11.0" @@ -1770,6 +1928,24 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "version_check" version = "0.9.5" @@ -1995,6 +2171,12 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + [[package]] name = "wyz" version = "0.5.1" @@ -2004,6 +2186,29 @@ dependencies = [ "tap", ] +[[package]] +name = "yoke" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.8.48" @@ -2024,6 +2229,60 @@ dependencies = [ "syn", ] +[[package]] +name = "zerofrom" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zip" version = "2.4.2" diff --git a/rust/bioscript-cli/Cargo.toml b/rust/bioscript-cli/Cargo.toml index 6a3e5f7..374329d 100644 --- a/rust/bioscript-cli/Cargo.toml +++ b/rust/bioscript-cli/Cargo.toml @@ -8,10 +8,12 @@ name = "bioscript" path = "src/main.rs" [dependencies] +bioscript-core = { path = "../bioscript-core" } bioscript-formats = { path = "../bioscript-formats" } bioscript-runtime = { path = "../bioscript-runtime" } bioscript-schema = { path = "../bioscript-schema" } monty = { path = "../../monty/crates/monty" } +serde_yaml = "0.9.34" [dev-dependencies] zip = { version = "2.2.0", default-features = false, features = ["deflate"] } diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index 3db0063..6c446b2 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -1,18 +1,22 @@ use std::{ + collections::BTreeMap, env, fmt::Write as _, fs, - path::PathBuf, + path::{Path, PathBuf}, process::ExitCode, time::{Duration, Instant}, }; use bioscript_formats::{ - GenotypeLoadOptions, GenotypeSourceFormat, InspectOptions, PrepareRequest, inspect_file, - prepare_indexes, shell_flags, + GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, InspectOptions, PrepareRequest, + inspect_file, prepare_indexes, shell_flags, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig, StageTiming}; -use bioscript_schema::validate_variants_path; +use bioscript_schema::{ + PanelManifest, VariantManifest, load_panel_manifest, load_variant_manifest, validate_panels_path, + validate_variants_path, +}; use monty::ResourceLimits; fn main() -> ExitCode { @@ -32,6 +36,9 @@ fn run_cli() -> Result<(), String> { if first == "validate-variants" { return run_validate_variants(args.collect()); } + if first == "validate-panels" { + return run_validate_panels(args.collect()); + } if first == "prepare" { return run_prepare(args.collect()); } @@ -48,6 +55,7 @@ fn run_cli() -> Result<(), String> { let mut participant_id: Option = None; let mut trace_report: Option = None; let mut timing_report: Option = None; + let mut filters: Vec = Vec::new(); let mut auto_index = false; let mut cache_dir: Option = None; let mut loader = GenotypeLoadOptions::default(); @@ -89,6 +97,11 @@ fn run_cli() -> Result<(), String> { return Err("--timing-report requires a path".to_owned()); }; timing_report = Some(PathBuf::from(value)); + } else if arg == "--filter" { + let Some(value) = args.next() else { + return Err("--filter requires key=value".to_owned()); + }; + filters.push(value); } else if arg == "--input-format" { let Some(value) = args.next() else { return Err("--input-format requires a value".to_owned()); @@ -164,7 +177,7 @@ fn run_cli() -> Result<(), String> { let Some(script_path) = script_path else { return Err( - "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" + "usage: bioscript [--root ] [--input-file ] [--output-file ] [--participant-id ] [--trace-report ] [--timing-report ] [--filter key=value] [--input-format auto|text|zip|vcf|cram] [--input-index ] [--reference-file ] [--reference-index ] [--auto-index] [--cache-dir ] [--max-duration-ms N] [--max-memory-bytes N] [--max-allocations N] [--max-recursion-depth N]\n bioscript validate-variants [--report ]\n bioscript validate-panels [--report ]\n bioscript prepare [--root ] [--input-file ] [--reference-file ] [--input-format auto|text|zip|vcf|cram] [--cache-dir ]\n bioscript inspect [--input-index ] [--reference-file ] [--reference-index ]" .to_owned(), ); }; @@ -175,6 +188,7 @@ fn run_cli() -> Result<(), String> { env::current_dir().map_err(|err| format!("failed to get current directory: {err}"))? } }; + normalize_loader_paths(&runtime_root, &mut loader); // auto-index: detect and build missing indexes for CRAM/BAM/FASTA let mut cli_timings: Vec = Vec::new(); @@ -218,6 +232,29 @@ fn run_cli() -> Result<(), String> { }); } + if is_yaml_manifest(&script_path) { + let manifest_started = Instant::now(); + run_manifest( + &runtime_root, + &script_path, + input_file.as_deref(), + output_file.as_deref(), + participant_id.as_deref(), + trace_report.as_deref(), + &loader, + &filters, + )?; + cli_timings.push(StageTiming { + stage: "manifest_run".to_owned(), + duration_ms: manifest_started.elapsed().as_millis(), + detail: script_path.display().to_string(), + }); + if let Some(timing_path) = timing_report { + write_timing_report(&timing_path, &cli_timings)?; + } + return Ok(()); + } + let runtime = BioscriptRuntime::with_config(runtime_root, RuntimeConfig { limits, loader }) .map_err(|err| err.to_string())?; let mut inputs = Vec::new(); @@ -422,3 +459,370 @@ fn run_validate_variants(args: Vec) -> Result<(), String> { Ok(()) } + +fn run_validate_panels(args: Vec) -> Result<(), String> { + let mut path: Option = None; + let mut report_path: Option = None; + + let mut iter = args.into_iter(); + while let Some(arg) = iter.next() { + if arg == "--report" { + let Some(value) = iter.next() else { + return Err("--report requires a path".to_owned()); + }; + report_path = Some(PathBuf::from(value)); + } else if path.is_none() { + path = Some(PathBuf::from(arg)); + } else { + return Err(format!("unexpected argument: {arg}")); + } + } + + let Some(path) = path else { + return Err("usage: bioscript validate-panels [--report ]".to_owned()); + }; + + let report = validate_panels_path(&path)?; + let text = report.render_text(); + print!("{text}"); + + if let Some(report_path) = report_path { + if let Some(parent) = report_path.parent() { + std::fs::create_dir_all(parent).map_err(|err| { + format!("failed to create report dir {}: {err}", parent.display()) + })?; + } + std::fs::write(&report_path, text) + .map_err(|err| format!("failed to write {}: {err}", report_path.display()))?; + } + + if report.has_errors() { + return Err(format!( + "validation found {} errors and {} warnings", + report.total_errors(), + report.total_warnings() + )); + } + + Ok(()) +} + +fn is_yaml_manifest(path: &Path) -> bool { + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| matches!(ext, "yaml" | "yml")) +} + +fn run_manifest( + runtime_root: &Path, + manifest_path: &Path, + input_file: Option<&str>, + output_file: Option<&str>, + participant_id: Option<&str>, + trace_report: Option<&Path>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result<(), String> { + let schema = manifest_schema(manifest_path)?; + let resolved_input = input_file.map(|value| resolve_cli_path(runtime_root, value)); + let resolved_output = output_file.map(|value| resolve_cli_path_buf(runtime_root, Path::new(value))); + let resolved_trace = trace_report.map(|value| resolve_cli_path_buf(runtime_root, value)); + match schema.as_str() { + "bioscript:variant:1.0" | "bioscript:variant" => { + let manifest = load_variant_manifest(manifest_path)?; + let row = run_variant_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + participant_id, + loader, + )?; + write_manifest_outputs( + std::slice::from_ref(&row), + resolved_output.as_deref(), + resolved_trace.as_deref(), + )?; + Ok(()) + } + "bioscript:panel:1.0" => { + let manifest = load_panel_manifest(manifest_path)?; + let rows = run_panel_manifest( + runtime_root, + &manifest, + resolved_input.as_deref(), + participant_id, + loader, + filters, + )?; + write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; + Ok(()) + } + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +fn run_variant_manifest( + runtime_root: &Path, + manifest: &VariantManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, +) -> Result, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + Ok(variant_row( + runtime_root, + &manifest.path, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )) +} + +fn run_panel_manifest( + runtime_root: &Path, + panel: &PanelManifest, + input_file: Option<&str>, + participant_id: Option<&str>, + loader: &GenotypeLoadOptions, + filters: &[String], +) -> Result>, String> { + let input_file = input_file.ok_or("manifest execution requires --input-file")?; + let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) + .map_err(|err| err.to_string())?; + let mut rows = Vec::new(); + + for member in &panel.members { + if member.kind != "variant" { + return Err(format!( + "panel member kind '{}' is not executable yet; panel execution is currently variant-only", + member.kind + )); + } + let Some(path) = &member.path else { + return Err("remote panel members are not executable yet".to_owned()); + }; + let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + let observation = store + .lookup_variant(&manifest.spec) + .map_err(|err| err.to_string())?; + rows.push(variant_row( + runtime_root, + &resolved, + &manifest.name, + &manifest.tags, + &observation, + participant_id, + )); + } + + Ok(rows) +} + +fn variant_row( + runtime_root: &Path, + path: &Path, + name: &str, + tags: &[String], + observation: &bioscript_core::VariantObservation, + participant_id: Option<&str>, +) -> BTreeMap { + let mut row = BTreeMap::new(); + row.insert("kind".to_owned(), "variant".to_owned()); + row.insert("name".to_owned(), name.to_owned()); + row.insert( + "path".to_owned(), + path.strip_prefix(runtime_root) + .unwrap_or(path) + .display() + .to_string(), + ); + row.insert("tags".to_owned(), tags.join(",")); + row.insert("backend".to_owned(), observation.backend.clone()); + row.insert( + "participant_id".to_owned(), + participant_id.unwrap_or_default().to_owned(), + ); + row.insert( + "matched_rsid".to_owned(), + observation.matched_rsid.clone().unwrap_or_default(), + ); + row.insert( + "assembly".to_owned(), + observation + .assembly + .map(|value| match value { + bioscript_core::Assembly::Grch37 => "grch37".to_owned(), + bioscript_core::Assembly::Grch38 => "grch38".to_owned(), + }) + .unwrap_or_default(), + ); + row.insert( + "genotype".to_owned(), + observation.genotype.clone().unwrap_or_default(), + ); + row.insert( + "ref_count".to_owned(), + observation.ref_count.map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "alt_count".to_owned(), + observation.alt_count.map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "depth".to_owned(), + observation.depth.map_or_else(String::new, |value| value.to_string()), + ); + row.insert("evidence".to_owned(), observation.evidence.join(" | ")); + row +} + +fn write_manifest_outputs( + rows: &[BTreeMap], + output_file: Option<&Path>, + trace_report: Option<&Path>, +) -> Result<(), String> { + let text = render_rows_as_tsv(rows); + if let Some(output_file) = output_file { + if let Some(parent) = output_file.parent() { + fs::create_dir_all(parent) + .map_err(|err| format!("failed to create output dir {}: {err}", parent.display()))?; + } + fs::write(output_file, &text) + .map_err(|err| format!("failed to write output {}: {err}", output_file.display()))?; + } else { + print!("{text}"); + } + + if let Some(trace_report) = trace_report { + if let Some(parent) = trace_report.parent() { + fs::create_dir_all(parent).map_err(|err| { + format!("failed to create trace dir {}: {err}", parent.display()) + })?; + } + let mut trace = String::from("step\tline\tcode\n"); + for (idx, row) in rows.iter().enumerate() { + let _ = writeln!( + trace, + "{}\t{}\t{}", + idx + 1, + idx + 1, + row.get("path").cloned().unwrap_or_default() + ); + } + fs::write(trace_report, trace) + .map_err(|err| format!("failed to write trace {}: {err}", trace_report.display()))?; + } + + Ok(()) +} + +fn resolve_cli_path(root: &Path, value: &str) -> String { + resolve_cli_path_buf(root, Path::new(value)).display().to_string() +} + +fn resolve_cli_path_buf(root: &Path, value: &Path) -> PathBuf { + if value.is_absolute() { + value.to_path_buf() + } else { + root.join(value) + } +} + +fn render_rows_as_tsv(rows: &[BTreeMap]) -> String { + let headers = [ + "kind", + "name", + "path", + "tags", + "participant_id", + "backend", + "matched_rsid", + "assembly", + "genotype", + "ref_count", + "alt_count", + "depth", + "evidence", + ]; + let mut out = headers.join("\t"); + out.push('\n'); + for row in rows { + let line = headers + .iter() + .map(|header| row.get(*header).cloned().unwrap_or_default().replace('\t', " ")) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + out +} + +fn matches_filters(manifest: &VariantManifest, path: &Path, filters: &[String]) -> bool { + filters.iter().all(|filter| match filter.split_once('=') { + Some(("kind", value)) => value == "variant", + Some(("name", value)) => manifest.name.contains(value), + Some(("path", value)) => path.display().to_string().contains(value), + Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), + Some(_) | None => false, + }) +} + +fn resolve_manifest_path( + runtime_root: &Path, + manifest_path: &Path, + relative: &str, +) -> Result { + let base_dir = manifest_path + .parent() + .ok_or_else(|| format!("manifest has no parent: {}", manifest_path.display()))?; + let joined = base_dir.join(relative); + let canonical_root = runtime_root + .canonicalize() + .map_err(|err| format!("failed to resolve root {}: {err}", runtime_root.display()))?; + let canonical_joined = joined + .canonicalize() + .map_err(|err| format!("failed to resolve {}: {err}", joined.display()))?; + if !canonical_joined.starts_with(&canonical_root) { + return Err(format!( + "manifest member path escapes bioscript root: {}", + canonical_joined.display() + )); + } + Ok(canonical_joined) +} + +fn manifest_schema(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: {err}", path.display()))?; + let value: serde_yaml::Value = serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; + value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{} is missing schema", path.display())) +} + +fn normalize_loader_paths(root: &Path, loader: &mut GenotypeLoadOptions) { + if let Some(path) = loader.input_index.take() { + loader.input_index = Some(resolve_cli_path_buf(root, &path)); + } + if let Some(path) = loader.reference_file.take() { + loader.reference_file = Some(resolve_cli_path_buf(root, &path)); + } + if let Some(path) = loader.reference_index.take() { + loader.reference_index = Some(resolve_cli_path_buf(root, &path)); + } +} diff --git a/rust/bioscript-cli/tests/cli.rs b/rust/bioscript-cli/tests/cli.rs index b345e38..5936b1d 100644 --- a/rust/bioscript-cli/tests/cli.rs +++ b/rust/bioscript-cli/tests/cli.rs @@ -1,4 +1,9 @@ -use std::{fs, path::PathBuf, process::Command}; +use std::{ + fs, + path::PathBuf, + process::Command, + time::{SystemTime, UNIX_EPOCH}, +}; fn repo_root() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) @@ -9,6 +14,19 @@ fn repo_root() -> PathBuf { .to_path_buf() } +fn temp_dir(label: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock drift") + .as_nanos(); + let dir = std::env::temp_dir().join(format!( + "bioscript-cli-tests-tmp-{label}-{}-{nanos}", + std::process::id() + )); + fs::create_dir_all(&dir).unwrap(); + dir +} + #[test] fn hello_world_script_runs_via_cli_and_writes_within_root() { let root = repo_root(); @@ -152,3 +170,143 @@ fn inspect_subcommand_reports_detected_vendor_and_platform() { assert!(stdout.contains("assembly\tgrch37")); assert!(stdout.contains("duration_ms\t")); } + +#[test] +fn variant_manifest_runs_directly_via_cli() { + let root = repo_root(); + let dir = temp_dir("variant-manifest"); + let manifest = dir.join("rs1.yaml"); + fs::write( + &manifest, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg(&manifest) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("kind\tname\tpath")); + assert!(stdout.contains("example-rs73885319")); + assert!(stdout.contains("AG")); +} + +#[test] +fn panel_manifest_runs_directly_via_cli() { + let root = repo_root(); + let dir = temp_dir("panel-manifest"); + let variants_dir = dir.join("variants"); + fs::create_dir_all(&variants_dir).unwrap(); + fs::write( + variants_dir.join("rs73885319.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs73885319" +tags: + - "type:trait" +identifiers: + rsids: + - "rs73885319" +coordinates: + grch38: + chrom: "22" + pos: 36265860 +alleles: + kind: "snv" + ref: "A" + alts: + - "G" +"#, + ) + .unwrap(); + fs::write( + variants_dir.join("rs60910145.yaml"), + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "example-rs60910145" +tags: + - "type:trait" +identifiers: + rsids: + - "rs60910145" +coordinates: + grch38: + chrom: "22" + pos: 36265988 +alleles: + kind: "snv" + ref: "T" + alts: + - "G" +"#, + ) + .unwrap(); + let panel = dir.join("panel.yaml"); + fs::write( + &panel, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "example-panel" +tags: + - "type:trait" +members: + - kind: "variant" + path: "variants/rs73885319.yaml" + version: "1.0" + - kind: "variant" + path: "variants/rs60910145.yaml" + version: "1.0" +"#, + ) + .unwrap(); + + let output = Command::new(env!("CARGO_BIN_EXE_bioscript")) + .current_dir(&root) + .arg("--input-file") + .arg("old/examples/apol1/test_snps.txt") + .arg("--filter") + .arg("name=rs73885319") + .arg(&panel) + .output() + .unwrap(); + + assert!( + output.status.success(), + "stderr: {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("example-rs73885319")); + assert!(!stdout.contains("example-rs60910145")); +} diff --git a/rust/bioscript-formats/src/prepare.rs b/rust/bioscript-formats/src/prepare.rs index 53de556..3af9ca6 100644 --- a/rust/bioscript-formats/src/prepare.rs +++ b/rust/bioscript-formats/src/prepare.rs @@ -83,22 +83,39 @@ fn canonical_dir(path: &Path) -> Result { fn resolve_rooted_path(root: &Path, raw: &str) -> Result { let raw_path = Path::new(raw); let resolved = if raw_path.is_absolute() { + ensure_path_within_root(root, raw_path)?; raw_path.to_path_buf() } else { + ensure_relative_path_safe(raw_path)?; root.join(raw_path) }; let canonical = resolved .canonicalize() .map_err(|err| format!("failed to resolve {}: {err}", resolved.display()))?; - if !canonical.starts_with(root) { - return Err(format!( - "path escapes bioscript root: {}", - canonical.display() - )); - } Ok(canonical) } +fn ensure_relative_path_safe(path: &Path) -> Result<(), String> { + for component in path.components() { + match component { + std::path::Component::ParentDir + | std::path::Component::RootDir + | std::path::Component::Prefix(_) => { + return Err(format!("path escapes bioscript root: {}", path.display())); + } + std::path::Component::CurDir | std::path::Component::Normal(_) => {} + } + } + Ok(()) +} + +fn ensure_path_within_root(root: &Path, path: &Path) -> Result<(), String> { + let relative = path + .strip_prefix(root) + .map_err(|_| format!("path escapes bioscript root: {}", path.display()))?; + ensure_relative_path_safe(relative) +} + fn resolve_cache_dir(cwd: &Path, cache_dir: &Path) -> PathBuf { if cache_dir.is_absolute() { cache_dir.to_path_buf() diff --git a/rust/bioscript-schema/Cargo.toml b/rust/bioscript-schema/Cargo.toml index fd869a2..3c9542d 100644 --- a/rust/bioscript-schema/Cargo.toml +++ b/rust/bioscript-schema/Cargo.toml @@ -4,7 +4,9 @@ version = "0.1.0" edition = "2024" [dependencies] +bioscript-core = { path = "../bioscript-core" } serde_yaml = "0.9.34" +url = "2.5" [lints.clippy] pedantic = { level = "warn", priority = -1 } diff --git a/rust/bioscript-schema/src/lib.rs b/rust/bioscript-schema/src/lib.rs index 157187c..4e82ec3 100644 --- a/rust/bioscript-schema/src/lib.rs +++ b/rust/bioscript-schema/src/lib.rs @@ -1,3 +1,7 @@ mod validator; -pub use validator::{FileReport, Issue, Severity, ValidationReport, validate_variants_path}; +pub use validator::{ + Download, FileReport, Issue, PanelManifest, PanelMember, Permissions, Severity, + ValidationReport, VariantManifest, load_panel_manifest, load_variant_manifest, + validate_panels_path, validate_variants_path, +}; diff --git a/rust/bioscript-schema/src/validator.rs b/rust/bioscript-schema/src/validator.rs index 8e6094d..15f3a96 100644 --- a/rust/bioscript-schema/src/validator.rs +++ b/rust/bioscript-schema/src/validator.rs @@ -1,10 +1,13 @@ use std::{ + collections::BTreeSet, fmt::{self, Write as _}, fs, path::{Path, PathBuf}, }; -use serde_yaml::Value; +use bioscript_core::{GenomicLocus, VariantKind, VariantSpec}; +use serde_yaml::{Mapping, Value}; +use url::Url; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Severity { @@ -94,6 +97,47 @@ impl ValidationReport { } } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct VariantManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub spec: VariantSpec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelManifest { + pub path: PathBuf, + pub name: String, + pub tags: Vec, + pub permissions: Permissions, + pub downloads: Vec, + pub members: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct Permissions { + pub domains: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Download { + pub id: String, + pub url: String, + pub origin: String, + pub sha256: String, + pub version: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PanelMember { + pub kind: String, + pub path: Option, + pub download: Option, + pub sha256: Option, + pub version: Option, +} + /// Validate a variant file or directory of variant files. /// /// # Errors @@ -101,10 +145,88 @@ impl ValidationReport { /// Returns an error when the input path cannot be read, traversed, or parsed /// as YAML. pub fn validate_variants_path(path: &Path) -> Result { - let files = collect_variant_files(path)?; + validate_manifest_path(path, ManifestSelector::Variant) +} + +/// Validate a panel file or directory of panel files. +/// +/// # Errors +/// +/// Returns an error when the input path cannot be read, traversed, or parsed +/// as YAML. +pub fn validate_panels_path(path: &Path) -> Result { + validate_manifest_path(path, ManifestSelector::Panel) +} + +/// Load a single variant manifest from YAML. +/// +/// # Errors +/// +/// Returns an error when the file does not parse or is not a valid variant +/// manifest. +pub fn load_variant_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + let mut issues = Vec::new(); + validate_variant_root(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + Ok(VariantManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + spec: variant_spec_from_root(&value)?, + }) +} + +/// Load a single panel manifest from YAML. +/// +/// # Errors +/// +/// Returns an error when the file does not parse or is not a valid panel +/// manifest. +pub fn load_panel_manifest(path: &Path) -> Result { + let value = load_yaml(path)?; + let mut issues = Vec::new(); + validate_panel_root(&value, &mut issues); + if issues.iter().any(|issue| issue.severity == Severity::Error) { + return Err(render_single_manifest_errors(path, &issues)); + } + + let permissions = Permissions { + domains: seq_of_strings(&value, &["permissions", "domains"]).unwrap_or_default(), + }; + let downloads = parse_downloads(&value)?; + let members = parse_panel_members(&value)?; + + Ok(PanelManifest { + path: path.to_path_buf(), + name: required_non_empty_string(&value, &["name"])?, + tags: seq_of_strings(&value, &["tags"]).unwrap_or_default(), + permissions, + downloads, + members, + }) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ManifestSelector { + Variant, + Panel, +} + +fn validate_manifest_path( + path: &Path, + selector: ManifestSelector, +) -> Result { + let files = collect_yaml_files(path)?; let mut reports = Vec::new(); for file in &files { - let report = validate_variant_file(file)?; + let report = match selector { + ManifestSelector::Variant => validate_variant_file(file)?, + ManifestSelector::Panel => validate_panel_file(file)?, + }; if !report.issues.is_empty() { reports.push(report); } @@ -115,31 +237,31 @@ pub fn validate_variants_path(path: &Path) -> Result { }) } -fn collect_variant_files(path: &Path) -> Result, String> { +fn collect_yaml_files(path: &Path) -> Result, String> { if path.is_file() { return Ok(vec![path.to_path_buf()]); } let mut files = Vec::new(); - collect_variant_files_recursive(path, &mut files)?; + collect_yaml_files_recursive(path, &mut files)?; files.sort(); Ok(files) } -fn collect_variant_files_recursive(path: &Path, files: &mut Vec) -> Result<(), String> { +fn collect_yaml_files_recursive(path: &Path, files: &mut Vec) -> Result<(), String> { let entries = fs::read_dir(path) .map_err(|err| format!("failed to read directory {}: {err}", path.display()))?; for entry in entries { let entry = entry.map_err(|err| format!("failed to read directory entry: {err}"))?; let entry_path = entry.path(); if entry_path.is_dir() { - collect_variant_files_recursive(&entry_path, files)?; + collect_yaml_files_recursive(&entry_path, files)?; continue; } let Some(file_name) = entry_path.file_name().and_then(|name| name.to_str()) else { continue; }; - if matches!(file_name, "variant.yaml" | "variant.yml") { + if file_name.ends_with(".yaml") || file_name.ends_with(".yml") { files.push(entry_path); } } @@ -147,45 +269,74 @@ fn collect_variant_files_recursive(path: &Path, files: &mut Vec) -> Res } fn validate_variant_file(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read {}: {err}", path.display()))?; - let value: Value = serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], + }); + }; + if !schema.contains("variant") { + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), + }); + } let mut issues = Vec::new(); - validate_required_shape(&value, &mut issues); - validate_kind_vs_tags(&value, &mut issues); - validate_pgx_shape(&value, &mut issues); - + validate_variant_root(&value, &mut issues); Ok(FileReport { file: path.to_path_buf(), issues, }) } -fn validate_required_shape(root: &Value, issues: &mut Vec) { - require_const(root, &["schema"], "bioscript:variant", issues); - require_const(root, &["version"], "1.0", issues); - require_path(root, &["variant_id"], issues); - require_path(root, &["alleles"], issues); - require_path(root, &["alleles", "kind"], issues); - require_path(root, &["alleles", "ref"], issues); - if value_at(root, &["alleles", "alts"]).is_none() { - issues.push(Issue { - severity: Severity::Error, - path: "alleles".to_owned(), - message: "missing allele definition; expected alleles.alts".to_owned(), +fn validate_panel_file(path: &Path) -> Result { + let value = load_yaml(path)?; + let Some(schema) = scalar_at(&value, &["schema"]) else { + return Ok(FileReport { + file: path.to_path_buf(), + issues: vec![Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: "missing schema".to_owned(), + }], }); - } - - if value_at(root, &["alleles", "alt"]).is_some() { - issues.push(Issue { - severity: Severity::Warning, - path: "alleles.alt".to_owned(), - message: "alleles.alt is legacy shape; prefer alleles.alts and optional alleles.canonical_alt".to_owned(), + }; + if !schema.contains("panel") { + return Ok(FileReport { + file: path.to_path_buf(), + issues: Vec::new(), }); } + let mut issues = Vec::new(); + validate_panel_root(&value, &mut issues); + Ok(FileReport { + file: path.to_path_buf(), + issues, + }) +} + +fn validate_variant_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity( + root, + "bioscript:variant:1.0", + Some("bioscript:variant"), + issues, + ); + validate_optional_strings(root, &["name", "label", "gene", "summary"], issues); + validate_tags(root, issues); + validate_identifiers(root, issues); + validate_coordinates(root, issues); + validate_alleles(root, issues); + validate_findings(root, issues); + validate_provenance(root, issues); + let has_identifiers = value_at(root, &["identifiers"]) .and_then(Value::as_mapping) .is_some_and(|mapping| !mapping.is_empty()); @@ -199,76 +350,953 @@ fn validate_required_shape(root: &Value, issues: &mut Vec) { message: "expected at least one identifier block or one coordinate block".to_owned(), }); } - if let Some(canonical_alt) = scalar_at(root, &["alleles", "canonical_alt"]) { - let alts = seq_at(root, &["alleles", "alts"]).unwrap_or_default(); - if !alts.iter().any(|alt| alt == &canonical_alt) { +} + +fn validate_panel_root(root: &Value, issues: &mut Vec) { + validate_schema_and_identity(root, "bioscript:panel:1.0", None, issues); + validate_optional_strings(root, &["name", "label", "summary"], issues); + validate_tags(root, issues); + validate_permissions(root, issues); + validate_downloads(root, issues); + validate_panel_members(root, issues); +} + +fn validate_schema_and_identity( + root: &Value, + canonical_schema: &str, + legacy_schema: Option<&str>, + issues: &mut Vec, +) { + let schema = scalar_at(root, &["schema"]); + let valid_schema = schema + .as_deref() + .is_some_and(|value| value == canonical_schema || legacy_schema == Some(value)); + if !valid_schema { + issues.push(Issue { + severity: Severity::Error, + path: "schema".to_owned(), + message: format!("expected schema to be '{canonical_schema}'"), + }); + } + if let Some(legacy_schema) = legacy_schema + && matches!(schema.as_deref(), Some(value) if value == legacy_schema) + { + issues.push(Issue { + severity: Severity::Warning, + path: "schema".to_owned(), + message: format!( + "legacy schema value '{legacy_schema}'; prefer '{canonical_schema}'" + ), + }); + } + require_const(root, &["version"], "1.0", issues); + match scalar_at(root, &["name"]) { + Some(name) if !name.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: "name".to_owned(), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: "name".to_owned(), + message: "missing required field".to_owned(), + }), + } + if value_at(root, &["variant_id"]).is_some() { + issues.push(Issue { + severity: Severity::Warning, + path: "variant_id".to_owned(), + message: "variant_id is legacy; prefer name".to_owned(), + }); + } +} + +fn validate_optional_strings(root: &Value, fields: &[&str], issues: &mut Vec) { + for field in fields { + if let Some(value) = value_at(root, &[*field]) { + match value.as_str() { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Warning, + path: (*field).to_owned(), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: (*field).to_owned(), + message: "expected string".to_owned(), + }), + } + } + } +} + +fn validate_tags(root: &Value, issues: &mut Vec) { + let Some(value) = value_at(root, &["tags"]) else { + return; + }; + let Some(items) = value.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "tags".to_owned(), + message: "expected a sequence of strings".to_owned(), + }); + return; + }; + + for (idx, item) in items.iter().enumerate() { + let Some(tag) = item.as_str() else { issues.push(Issue { severity: Severity::Error, - path: "alleles.canonical_alt".to_owned(), - message: format!( - "canonical_alt '{canonical_alt}' is not present in alleles.alts {alts:?}" - ), + path: format!("tags[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if tag.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("tags[{idx}]"), + message: "empty tag string".to_owned(), }); } } } -fn validate_kind_vs_tags(root: &Value, issues: &mut Vec) { +fn validate_identifiers(root: &Value, issues: &mut Vec) { + for field in ["rsids", "aliases"] { + let Some(values) = value_at(root, &["identifiers", field]) else { + continue; + }; + let Some(items) = values.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}"), + message: "expected a sequence of strings".to_owned(), + }); + continue; + }; + let mut seen = BTreeSet::new(); + for (idx, item) in items.iter().enumerate() { + let Some(value) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if !is_rsid(value) { + issues.push(Issue { + severity: Severity::Error, + path: format!("identifiers.{field}[{idx}]"), + message: format!("expected rsid like rs123, found '{value}'"), + }); + } + if !seen.insert(value.to_owned()) { + issues.push(Issue { + severity: Severity::Warning, + path: format!("identifiers.{field}[{idx}]"), + message: format!("duplicate identifier '{value}'"), + }); + } + } + } +} + +fn validate_coordinates(root: &Value, issues: &mut Vec) { + for assembly in ["grch37", "grch38"] { + let Some(coord) = mapping_at(root, &["coordinates", assembly]) else { + continue; + }; + + let Some(chrom) = coord + .get(Value::String("chrom".to_owned())) + .and_then(Value::as_str) + else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.chrom"), + message: "missing chrom".to_owned(), + }); + continue; + }; + if !is_allowed_chromosome(chrom) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.chrom"), + message: format!("invalid chromosome '{chrom}'; expected 1-22, X, Y, or MT"), + }); + } + + let has_pos = coord.contains_key(Value::String("pos".to_owned())); + let has_start = coord.contains_key(Value::String("start".to_owned())); + let has_end = coord.contains_key(Value::String("end".to_owned())); + if has_pos && (has_start || has_end) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "use either pos or start/end, not both".to_owned(), + }); + continue; + } + if !has_pos && !(has_start && has_end) { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "expected either pos or start/end".to_owned(), + }); + continue; + } + + if has_pos { + if let Some(pos) = i64_at_mapping(coord, "pos") { + if pos < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.pos"), + message: "expected integer >= 1".to_owned(), + }); + } + } else { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.pos"), + message: "expected integer".to_owned(), + }); + } + } else { + let start = i64_at_mapping(coord, "start"); + let end = i64_at_mapping(coord, "end"); + match (start, end) { + (Some(start), Some(end)) => { + if start < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.start"), + message: "expected integer >= 1".to_owned(), + }); + } + if end < 1 { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.end"), + message: "expected integer >= 1".to_owned(), + }); + } + if end < start { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}.end"), + message: "expected end >= start".to_owned(), + }); + } + if start == end { + issues.push(Issue { + severity: Severity::Warning, + path: format!("coordinates.{assembly}"), + message: "single-position coordinate uses start/end; prefer pos".to_owned(), + }); + } + } + _ => { + issues.push(Issue { + severity: Severity::Error, + path: format!("coordinates.{assembly}"), + message: "expected integer start/end".to_owned(), + }); + } + } + } + } +} + +fn validate_alleles(root: &Value, issues: &mut Vec) { + require_path(root, &["alleles"], issues); + require_path(root, &["alleles", "kind"], issues); + require_path(root, &["alleles", "ref"], issues); + require_path(root, &["alleles", "alts"], issues); + let Some(kind) = scalar_at(root, &["alleles", "kind"]) else { return; }; - let Some(tags) = seq_at(root, &["research", "tags"]) else { + if !matches!(kind.as_str(), "snv" | "deletion" | "insertion" | "indel") { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.kind".to_owned(), + message: "expected one of snv, deletion, insertion, indel".to_owned(), + }); + } + + if value_at(root, &["alleles", "canonical_alt"]).is_some() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.canonical_alt".to_owned(), + message: "canonical_alt is not part of the current schema".to_owned(), + }); + } + + let Some(reference) = scalar_at(root, &["alleles", "ref"]) else { return; }; + if reference.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "empty string".to_owned(), + }); + } - let has_legacy_snp_tag = tags.iter().any(|tag| tag == "snp"); - let has_preferred_snv_tag = tags.iter().any(|tag| tag == "snv"); - if kind == "snv" && has_legacy_snp_tag && !has_preferred_snv_tag { + let Some(alts_value) = value_at(root, &["alleles", "alts"]) else { + return; + }; + let Some(alts_seq) = alts_value.as_sequence() else { issues.push(Issue { - severity: Severity::Warning, - path: "research.tags".to_owned(), - message: "alleles.kind is 'snv' but research.tags uses 'snp'; pick one vocabulary and use it consistently".to_owned(), + severity: Severity::Error, + path: "alleles.alts".to_owned(), + message: "expected a non-empty sequence of strings".to_owned(), + }); + return; + }; + if alts_seq.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.alts".to_owned(), + message: "expected at least one alternate allele".to_owned(), + }); + return; + } + + let mut alts = Vec::new(); + for (idx, item) in alts_seq.iter().enumerate() { + let Some(alt) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + if alt.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "empty string".to_owned(), + }); + continue; + } + if alt == "I" || alt == "D" { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "symbolic I/D alleles are not allowed in stored YAML; use biological alleles".to_owned(), + }); + } + alts.push(alt.to_owned()); + } + if reference == "I" || reference == "D" { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "symbolic I/D alleles are not allowed in stored YAML; use biological alleles".to_owned(), }); } + + if kind == "snv" { + if !is_base_allele(&reference) { + issues.push(Issue { + severity: Severity::Error, + path: "alleles.ref".to_owned(), + message: "snv ref must be one of A/C/G/T".to_owned(), + }); + } + for (idx, alt) in alts.iter().enumerate() { + if !is_base_allele(alt) { + issues.push(Issue { + severity: Severity::Error, + path: format!("alleles.alts[{idx}]"), + message: "snv alt must be one of A/C/G/T".to_owned(), + }); + } + } + } } -fn validate_pgx_shape(root: &Value, issues: &mut Vec) { - let Some(pgx) = mapping_at(root, &["clinical", "pgx"]) else { +fn validate_findings(root: &Value, issues: &mut Vec) { + let alts = seq_of_strings(root, &["alleles", "alts"]).unwrap_or_default(); + let Some(findings) = value_at(root, &["findings"]).and_then(Value::as_sequence) else { return; }; - for key in ["drug_labels", "annotations", "clinical_annotations"] { - let Some(items) = pgx - .get(Value::String(key.to_owned())) - .and_then(Value::as_sequence) + for (idx, finding) in findings.iter().enumerate() { + let Some(mapping) = finding.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + + let Some(schema) = mapping + .get(Value::String("schema".to_owned())) + .and_then(Value::as_str) else { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].schema"), + message: "missing schema".to_owned(), + }); continue; }; - for (idx, item) in items.iter().enumerate() { - let Some(mapping) = item.as_mapping() else { + if schema.trim().is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].schema"), + message: "empty string".to_owned(), + }); + } + if let Some(alt) = mapping + .get(Value::String("alt".to_owned())) + .and_then(Value::as_str) + && !alts.iter().any(|item| item == alt) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("findings[{idx}].alt"), + message: format!("finding alt '{alt}' is not present in alleles.alts {alts:?}"), + }); + } + let has_summary = mapping + .get(Value::String("summary".to_owned())) + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + let has_notes = mapping + .get(Value::String("notes".to_owned())) + .and_then(Value::as_str) + .is_some_and(|value| !value.trim().is_empty()); + if !has_summary && !has_notes { + issues.push(Issue { + severity: Severity::Warning, + path: format!("findings[{idx}]"), + message: "finding has neither summary nor notes".to_owned(), + }); + } + } +} + +fn validate_provenance(root: &Value, issues: &mut Vec) { + let Some(sources) = value_at(root, &["provenance", "sources"]).and_then(Value::as_sequence) + else { + return; + }; + for (idx, source) in sources.iter().enumerate() { + let Some(mapping) = source.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["kind", "label", "url"] { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("provenance.sources[{idx}].{field}"), + message: "missing required field".to_owned(), + }), + } + } + if let Some(url) = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + validate_url_string(url, &format!("provenance.sources[{idx}].url"), false, issues); + } + } +} + +fn validate_permissions(root: &Value, issues: &mut Vec) { + let Some(domains) = value_at(root, &["permissions", "domains"]) else { + return; + }; + let Some(items) = domains.as_sequence() else { + issues.push(Issue { + severity: Severity::Error, + path: "permissions.domains".to_owned(), + message: "expected a sequence of origins".to_owned(), + }); + return; + }; + let mut seen = BTreeSet::new(); + for (idx, item) in items.iter().enumerate() { + let Some(value) = item.as_str() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("permissions.domains[{idx}]"), + message: "expected string".to_owned(), + }); + continue; + }; + match normalize_origin(value) { + Ok(origin) => { + if !seen.insert(origin.clone()) { + issues.push(Issue { + severity: Severity::Warning, + path: format!("permissions.domains[{idx}]"), + message: format!("duplicate origin '{origin}'"), + }); + } + } + Err(message) => issues.push(Issue { + severity: Severity::Error, + path: format!("permissions.domains[{idx}]"), + message, + }), + } + } +} + +fn validate_downloads(root: &Value, issues: &mut Vec) { + let allowed_origins: BTreeSet = seq_of_strings(root, &["permissions", "domains"]) + .unwrap_or_default() + .into_iter() + .filter_map(|domain| normalize_origin(&domain).ok()) + .collect(); + let Some(downloads) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { + return; + }; + let mut ids = BTreeSet::new(); + for (idx, item) in downloads.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + for field in ["id", "url", "sha256", "version"] { + match mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + { + Some(text) if !text.trim().is_empty() => {} + Some(_) => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].{field}"), + message: "empty string".to_owned(), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].{field}"), + message: "missing required field".to_owned(), + }), + } + } + + if let Some(id) = mapping + .get(Value::String("id".to_owned())) + .and_then(Value::as_str) + { + if !ids.insert(id.to_owned()) { issues.push(Issue { - severity: Severity::Warning, - path: format!("clinical.pgx.{key}[{idx}]"), - message: "expected mapping".to_owned(), + severity: Severity::Error, + path: format!("downloads[{idx}].id"), + message: format!("duplicate download id '{id}'"), }); - continue; - }; + } + } + if let Some(sha) = mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + && !is_sha256(sha) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].sha256"), + message: "expected 64 lowercase hex characters".to_owned(), + }); + } + if let Some(url) = mapping + .get(Value::String("url".to_owned())) + .and_then(Value::as_str) + { + match normalize_download_url(url) { + Ok(origin) => { + if !allowed_origins.is_empty() && !allowed_origins.contains(&origin) { + issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].url"), + message: format!( + "download origin '{origin}' is not listed in permissions.domains" + ), + }); + } + } + Err(message) => issues.push(Issue { + severity: Severity::Error, + path: format!("downloads[{idx}].url"), + message, + }), + } + } + } +} - if let Some(level) = mapping - .get(Value::String("pgx_level".to_owned())) +fn validate_panel_members(root: &Value, issues: &mut Vec) { + let Some(members) = value_at(root, &["members"]).and_then(Value::as_sequence) else { + issues.push(Issue { + severity: Severity::Error, + path: "members".to_owned(), + message: "missing required field".to_owned(), + }); + return; + }; + if members.is_empty() { + issues.push(Issue { + severity: Severity::Error, + path: "members".to_owned(), + message: "expected at least one member".to_owned(), + }); + return; + } + + let download_ids: BTreeSet = value_at(root, &["downloads"]) + .and_then(Value::as_sequence) + .into_iter() + .flatten() + .filter_map(|item| { + item.as_mapping()? + .get(Value::String("id".to_owned())) .and_then(Value::as_str) - && level.trim().is_empty() - { + .map(ToOwned::to_owned) + }) + .collect(); + + for (idx, item) in members.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}]"), + message: "expected mapping".to_owned(), + }); + continue; + }; + let kind = mapping + .get(Value::String("kind".to_owned())) + .and_then(Value::as_str); + match kind { + Some("variant") => {} + Some(other) => issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].kind"), + message: format!("unsupported member kind '{other}'; panel support is currently variant-only"), + }), + None => issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].kind"), + message: "missing required field".to_owned(), + }), + } + + let path_value = mapping + .get(Value::String("path".to_owned())) + .and_then(Value::as_str); + let download_value = mapping + .get(Value::String("download".to_owned())) + .and_then(Value::as_str); + if path_value.is_some() == download_value.is_some() { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}]"), + message: "expected exactly one of path or download".to_owned(), + }); + } + if let Some(path) = path_value + && path.trim().is_empty() + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].path"), + message: "empty string".to_owned(), + }); + } + if let Some(download) = download_value { + if download.trim().is_empty() { issues.push(Issue { - severity: Severity::Warning, - path: format!("clinical.pgx.{key}[{idx}].pgx_level"), - message: "empty pgx_level string; prefer null/omitted or a normalized controlled value".to_owned(), + severity: Severity::Error, + path: format!("members[{idx}].download"), + message: "empty string".to_owned(), + }); + } else if !download_ids.contains(download) { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].download"), + message: format!("unknown download id '{download}'"), }); } } + if let Some(version) = mapping + .get(Value::String("version".to_owned())) + .and_then(Value::as_str) + && version.trim().is_empty() + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].version"), + message: "empty string".to_owned(), + }); + } + if let Some(sha) = mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + && !is_sha256(sha) + { + issues.push(Issue { + severity: Severity::Error, + path: format!("members[{idx}].sha256"), + message: "expected 64 lowercase hex characters".to_owned(), + }); + } } } +fn variant_spec_from_root(root: &Value) -> Result { + let rsids = seq_of_strings(root, &["identifiers", "rsids"]).unwrap_or_default(); + let grch37 = locus_from_root(root, "grch37")?; + let grch38 = locus_from_root(root, "grch38")?; + let reference = scalar_at(root, &["alleles", "ref"]); + let alternate = seq_of_strings(root, &["alleles", "alts"]) + .and_then(|alts| alts.first().cloned()); + let deletion_length = value_at(root, &["alleles", "deletion_length"]) + .and_then(Value::as_u64) + .map(|value| value as usize); + let motifs = seq_of_strings(root, &["alleles", "motifs"]).unwrap_or_default(); + let kind = scalar_at(root, &["alleles", "kind"]).map(|kind| match kind.as_str() { + "snv" => VariantKind::Snp, + "deletion" => VariantKind::Deletion, + "insertion" => VariantKind::Insertion, + "indel" => VariantKind::Indel, + _ => VariantKind::Other, + }); + + Ok(VariantSpec { + rsids, + grch37, + grch38, + reference, + alternate, + kind, + deletion_length, + motifs, + }) +} + +fn locus_from_root(root: &Value, assembly: &str) -> Result, String> { + let Some(mapping) = mapping_at(root, &["coordinates", assembly]) else { + return Ok(None); + }; + let chrom = mapping + .get(Value::String("chrom".to_owned())) + .and_then(Value::as_str) + .ok_or_else(|| format!("coordinates.{assembly}.chrom missing"))?; + let (start, end) = if let Some(pos) = i64_at_mapping(mapping, "pos") { + (pos, pos) + } else { + let start = i64_at_mapping(mapping, "start") + .ok_or_else(|| format!("coordinates.{assembly}.start missing"))?; + let end = i64_at_mapping(mapping, "end") + .ok_or_else(|| format!("coordinates.{assembly}.end missing"))?; + (start, end) + }; + Ok(Some(GenomicLocus { + chrom: chrom.to_owned(), + start, + end, + })) +} + +fn parse_downloads(root: &Value) -> Result, String> { + let mut downloads = Vec::new(); + let Some(items) = value_at(root, &["downloads"]).and_then(Value::as_sequence) else { + return Ok(downloads); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("downloads[{idx}] must be a mapping")); + }; + let id = mapping_required_string(mapping, "id", idx, "downloads")?; + let url = mapping_required_string(mapping, "url", idx, "downloads")?; + let sha256 = mapping_required_string(mapping, "sha256", idx, "downloads")?; + let version = mapping_required_string(mapping, "version", idx, "downloads")?; + let origin = normalize_download_url(&url)?; + downloads.push(Download { + id, + url, + origin, + sha256, + version, + }); + } + Ok(downloads) +} + +fn parse_panel_members(root: &Value) -> Result, String> { + let mut members = Vec::new(); + let Some(items) = value_at(root, &["members"]).and_then(Value::as_sequence) else { + return Ok(members); + }; + for (idx, item) in items.iter().enumerate() { + let Some(mapping) = item.as_mapping() else { + return Err(format!("members[{idx}] must be a mapping")); + }; + members.push(PanelMember { + kind: mapping_required_string(mapping, "kind", idx, "members")?, + path: mapping + .get(Value::String("path".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + download: mapping + .get(Value::String("download".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + sha256: mapping + .get(Value::String("sha256".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + version: mapping + .get(Value::String("version".to_owned())) + .and_then(Value::as_str) + .map(ToOwned::to_owned), + }); + } + Ok(members) +} + +fn mapping_required_string( + mapping: &Mapping, + field: &str, + idx: usize, + parent: &str, +) -> Result { + mapping + .get(Value::String(field.to_owned())) + .and_then(Value::as_str) + .filter(|value| !value.trim().is_empty()) + .map(ToOwned::to_owned) + .ok_or_else(|| format!("{parent}[{idx}].{field} missing or empty")) +} + +fn validate_url_string( + value: &str, + path: &str, + require_origin_only: bool, + issues: &mut Vec, +) { + let normalized = if require_origin_only { + normalize_origin(value) + } else { + normalize_download_url(value) + }; + if let Err(message) = normalized { + issues.push(Issue { + severity: Severity::Error, + path: path.to_owned(), + message, + }); + } +} + +fn normalize_origin(value: &str) -> Result { + let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; + if !matches!(url.scheme(), "http" | "https") { + return Err("expected http or https origin".to_owned()); + } + if url.host_str().is_none() { + return Err("origin is missing host".to_owned()); + } + if url.path() != "/" || url.query().is_some() || url.fragment().is_some() { + return Err("expected origin only, without path, query, or fragment".to_owned()); + } + let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); + if let Some(port) = url.port() { + let _ = write!(origin, ":{port}"); + } + Ok(origin) +} + +fn normalize_download_url(value: &str) -> Result { + let url = Url::parse(value).map_err(|err| format!("invalid URL: {err}"))?; + if !matches!(url.scheme(), "http" | "https") { + return Err("expected http or https URL".to_owned()); + } + if url.host_str().is_none() { + return Err("URL is missing host".to_owned()); + } + let mut origin = format!("{}://{}", url.scheme(), url.host_str().unwrap_or_default()); + if let Some(port) = url.port() { + let _ = write!(origin, ":{port}"); + } + Ok(origin) +} + +fn is_allowed_chromosome(value: &str) -> bool { + matches!(value, "X" | "Y" | "MT") + || value + .parse::() + .is_ok_and(|chrom| (1..=22).contains(&chrom)) +} + +fn is_base_allele(value: &str) -> bool { + matches!(value, "A" | "C" | "G" | "T") +} + +fn is_rsid(value: &str) -> bool { + value.starts_with("rs") && value[2..].chars().all(|ch| ch.is_ascii_digit()) +} + +fn is_sha256(value: &str) -> bool { + value.len() == 64 && value.chars().all(|ch| ch.is_ascii_hexdigit() && !ch.is_ascii_uppercase()) +} + +fn i64_at_mapping(mapping: &Mapping, key: &str) -> Option { + mapping + .get(Value::String(key.to_owned())) + .and_then(Value::as_i64) +} + +fn required_non_empty_string(root: &Value, path: &[&str]) -> Result { + scalar_at(root, path) + .filter(|value| !value.trim().is_empty()) + .ok_or_else(|| format!("{} missing or empty", path.join("."))) +} + +fn render_single_manifest_errors(path: &Path, issues: &[Issue]) -> String { + let mut out = format!("invalid manifest {}:\n", path.display()); + for issue in issues { + let _ = writeln!(out, " - [{}] {}: {}", issue.severity, issue.path, issue.message); + } + out +} + +fn load_yaml(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read {}: {err}", path.display()))?; + serde_yaml::from_str(&text).map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) +} + fn require_const(root: &Value, path: &[&str], expected: &str, issues: &mut Vec) { match scalar_at(root, path) { Some(actual) if actual == expected => {} @@ -304,7 +1332,7 @@ fn value_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Value> { Some(current) } -fn mapping_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a serde_yaml::Mapping> { +fn mapping_at<'a>(root: &'a Value, path: &[&str]) -> Option<&'a Mapping> { value_at(root, path)?.as_mapping() } @@ -316,7 +1344,7 @@ fn scalar_at(root: &Value, path: &[&str]) -> Option { }) } -fn seq_at(root: &Value, path: &[&str]) -> Option> { +fn seq_of_strings(root: &Value, path: &[&str]) -> Option> { value_at(root, path)?.as_sequence().map(|items| { items .iter() diff --git a/rust/bioscript-schema/tests/validate_variants.rs b/rust/bioscript-schema/tests/validate_variants.rs index a529717..74109ed 100644 --- a/rust/bioscript-schema/tests/validate_variants.rs +++ b/rust/bioscript-schema/tests/validate_variants.rs @@ -4,7 +4,7 @@ use std::{ time::{SystemTime, UNIX_EPOCH}, }; -use bioscript_schema::validate_variants_path; +use bioscript_schema::{validate_panels_path, validate_variants_path}; fn temp_dir(label: &str) -> PathBuf { let nanos = SystemTime::now() @@ -20,7 +20,7 @@ fn temp_dir(label: &str) -> PathBuf { } #[test] -fn validate_variants_reports_known_shape_issues() { +fn validate_variants_reports_shape_issues() { let dir = temp_dir("validate"); let fixture = dir.join("variant.yaml"); fs::write( @@ -29,6 +29,47 @@ fn validate_variants_reports_known_shape_issues() { schema: "bioscript:variant" version: "1.0" variant_id: "TEST_rs1" +name: "test-rs1" +identifiers: + rsids: + - "bad-rsid" +coordinates: + grch38: + chrom: "HG7_PATCH" + pos: 0 +alleles: + kind: "snv" + ref: "I" + alts: ["D"] +findings: + - schema: "" + alt: "A" +"#, + ) + .unwrap(); + + let report = validate_variants_path(&fixture).unwrap(); + let text = report.render_text(); + + assert_eq!(report.total_errors(), 9); + assert_eq!(report.total_warnings(), 3); + assert!(text.contains("legacy schema value")); + assert!(text.contains("invalid chromosome")); + assert!(text.contains("symbolic I/D alleles are not allowed")); +} + +#[test] +fn validate_variants_accepts_current_shape() { + let dir = temp_dir("validate-tags"); + let fixture = dir.join("rs1.yaml"); + fs::write( + &fixture, + r#" +schema: "bioscript:variant:1.0" +version: "1.0" +name: "test-rs1" +tags: + - "type:trait" identifiers: rsids: - "rs1" @@ -40,27 +81,77 @@ alleles: kind: "snv" ref: "G" alts: ["A"] -research: - tags: - - "snp" -clinical: - pgx: - drug_labels: - - source: "FDA" - title: "Example" - genes: ["GENE1"] - drugs: ["drug1"] - pgx_level: "" - actionable: false +findings: + - schema: "bioscript:trait:1.0" + alt: "A" + summary: "Example finding" +provenance: + sources: + - kind: "database" + label: "dbSNP" + url: "https://example.org/rs1" "#, ) .unwrap(); let report = validate_variants_path(&fixture).unwrap(); + assert_eq!(report.total_errors(), 0); + assert_eq!(report.total_warnings(), 0); +} + +#[test] +fn validate_panels_checks_permissions_and_download_origins() { + let dir = temp_dir("validate-panel"); + let fixture = dir.join("panel.yaml"); + fs::write( + &fixture, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "traits-common" +permissions: + domains: + - "https://example.org" +downloads: + - id: "remote-rs1" + url: "https://cdn.example.org/variants/rs1.yaml" + sha256: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + version: "1.0" +members: + - kind: "variant" + download: "remote-rs1" +"#, + ) + .unwrap(); + + let report = validate_panels_path(&fixture).unwrap(); let text = report.render_text(); + assert_eq!(report.total_errors(), 1); + assert!(text.contains("not listed in permissions.domains")); +} + +#[test] +fn validate_panels_accepts_local_variant_members() { + let dir = temp_dir("validate-panel-ok"); + let fixture = dir.join("panel.yaml"); + fs::write( + &fixture, + r#" +schema: "bioscript:panel:1.0" +version: "1.0" +name: "traits-common" +tags: + - "type:trait" +members: + - kind: "variant" + path: "variants/rs671.yaml" + version: "1.0" +"#, + ) + .unwrap(); + + let report = validate_panels_path(&fixture).unwrap(); assert_eq!(report.total_errors(), 0); - assert_eq!(report.total_warnings(), 2); - assert!(text.contains("research.tags")); - assert!(text.contains("empty pgx_level string")); + assert_eq!(report.total_warnings(), 0); }