diff --git a/src/mavedb/scripts/export_public_data.py b/src/mavedb/scripts/export_public_data.py index 63400aeb8..77d9ac4a8 100644 --- a/src/mavedb/scripts/export_public_data.py +++ b/src/mavedb/scripts/export_public_data.py @@ -6,22 +6,8 @@ python3 -m mavedb.scripts.export_public_data ``` -This generates a ZIP archive named `mavedb-dump.zip` in the working directory. the ZIP file has the following contents: -- main.json: A JSON file providing metadata for all of the published experiment sets, experiments, and score sets -- LICENSE.txt: The text of the Creative Commons Zero license, which applies to all data included in the dump. -- variants/ - - [URN].counts.csv (for each variant URN): The score set's variant count columns, - sorted by variant number - - [URN].scores.csv (for each variant URN): The score set's variant count columns, - sorted by variant number - - [URN].annotations.csv (for each variant URN with mapped variants): The score set's variant annotations, sorted by - variant number. This file is only included for score sets with mapped variants, and includes VEP, gnomAD, and ClinGen annotations. - -In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets. -Experiments are nested in their parent experiment sets, and score sets in their parent experiments. - -The variant URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like -`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance. +This generates a ZIP archive named `mavedb-dump.YYYYMMDDHHMMSS.zip` in the working directory. +See `src/mavedb/scripts/resources/README.md` for a full description of the archive contents and file formats. Unpublished data and data sets licensed other than under the Creative Commons Zero license are not included in the dump, and user details are limited to ORCID IDs and names of contributors to published data sets. 
@@ -37,7 +23,7 @@ from fastapi.encoders import jsonable_encoder from sqlalchemy import select -from sqlalchemy.orm import Session, lazyload +from sqlalchemy.orm import Session, joinedload, lazyload from mavedb.lib.score_sets import get_score_set_variants_as_csv from mavedb.models.experiment import Experiment @@ -47,6 +33,7 @@ from mavedb.models.score_set import ScoreSet from mavedb.models.variant import Variant from mavedb.scripts.environment import script_environment, with_database_session +from mavedb.view_models import mapped_variant as mapped_variant_vm from mavedb.view_models.experiment_set import ExperimentSetPublicDump logger = logging.getLogger(__name__) @@ -114,6 +101,7 @@ def export_public_data(db: Session): # Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score # sets. experiment_sets = list(filter_experiment_sets(experiment_sets_query.all())) + logger.info(f"Found {len(experiment_sets)} published experiment sets with CC0-licensed score sets.") # TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator. # Issue: https://github.com/VariantEffect/mavedb-api/issues/192 @@ -129,7 +117,7 @@ def export_public_data(db: Session): timestamp_format = "%Y%m%d%H%M%S" zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip" - logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json") + logger.info(f"Writing {zip_file_name} with {len(score_set_ids)} score sets.") json_data = { "title": "MaveDB public data", "asOf": datetime.now(timezone.utc).isoformat(), @@ -140,21 +128,23 @@ def export_public_data(db: Session): # Write metadata for all data sets to a single JSON file. zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data))) - # Copy the CC0 license. - zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt") + # Copy the CC0 license and README. 
+ resources_dir = os.path.join(os.path.dirname(__file__), "resources") + zipfile.write(os.path.join(resources_dir, "CC0_license.txt"), "LICENSE.txt") + zipfile.write(os.path.join(resources_dir, "README.md"), "README.md") # Write score and count files for each score set. num_score_sets = len(score_set_ids) for i, score_set_id in enumerate(score_set_ids): score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none() if score_set is not None and score_set.urn is not None: - logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}") + logger.info(f"[{i + 1}/{num_score_sets}] Exporting score set {score_set.urn}") csv_filename_base = score_set.urn.replace(":", "-") csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"], namespaced=True) zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str) - # Only generate the annotations CSV if mapped variants exist in the score set. + # Only generate annotation files if mapped variants exist in the score set. has_annotations = ( db.scalars( select(ScoreSet).where(ScoreSet.id == score_set_id).join(Variant).join(MappedVariant).limit(1) @@ -167,12 +157,32 @@ def export_public_data(db: Session): ) zipfile.writestr(f"csv/{csv_filename_base}.annotations.csv", csv_str) + # Write mapped variants JSON — mirrors GET /api/v1/score-sets/{urn}/mapped-variants. 
+ mapped_variants = db.scalars( + select(MappedVariant) + .join(Variant, Variant.id == MappedVariant.variant_id) + .options(joinedload(MappedVariant.variant)) + .where(Variant.score_set_id == score_set_id) + ).all() + mapped_variant_views = [ + mapped_variant_vm.MappedVariant.model_validate(mv) for mv in mapped_variants + ] + zipfile.writestr( + f"mapped/{csv_filename_base}.mapped-variants.json", + json.dumps(jsonable_encoder(mapped_variant_views)), + ) + logger.info( + f"[{i + 1}/{num_score_sets}] Wrote annotations + {len(mapped_variants)} mapped variants" + ) + # Only generate the counts CSV if count columns are present. count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None if count_columns and len(count_columns) > 0: csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"], namespaced=True) zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str) + logger.info(f"Export complete: {zip_file_name}") + if __name__ == "__main__": export_public_data() diff --git a/src/mavedb/scripts/resources/README.md b/src/mavedb/scripts/resources/README.md new file mode 100644 index 000000000..3dcd0247d --- /dev/null +++ b/src/mavedb/scripts/resources/README.md @@ -0,0 +1,236 @@ +# MaveDB Public Data Dump + +This archive contains a snapshot of publicly accessible variant effect data from MaveDB. +The `asOf` field in `main.json` records the exact date and time this dump was generated. 
+ +### Useful links +- **MaveDB website:** https://www.mavedb.org +- **API documentation:** https://api.mavedb.org/docs +- **MaveDB documentation:** https://mavedb.org/docs/mavedb/index.html +- **Source code:** + - https://github.com/VariantEffect/mavedb-api + - https://github.com/VariantEffect/mavedb-ui + - https://github.com/VariantEffect/dcd_mapping2 + +--- + +## What's Included + +This dump includes only data that is: + +- **Published** — publicly released on MaveDB +- **CC0-licensed** — released under the Creative Commons CC0 1.0 Public Domain Dedication + +Unpublished data, private datasets, and datasets published under other licenses are excluded. + +--- + +## Archive Structure + +``` +mavedb-dump.YYYYMMDDHHMMSS.zip +├── README.md # This file +├── LICENSE.txt # Creative Commons CC0 1.0 license text +├── main.json # Metadata for all included datasets +├── csv/ +│ ├── {urn}.scores.csv # Variant effect scores (all score sets) +│ ├── {urn}.counts.csv # Variant counts (score sets with count data only) +│ └── {urn}.annotations.csv # Variant annotations from VEP, gnomAD, and ClinGen +│ # (score sets that have completed mapping only) +└── mapped/ + └── {urn}.mapped-variants.json # Mapped variant data including VRS alleles and HGVS + # (score sets that have completed mapping only) +``` + +`{urn}` is the score set URN with colons replaced by hyphens, e.g., `urn-mavedb-00000001-a-1`. + +--- + +## File Descriptions + +### `main.json` + +A JSON object containing MaveDB metadata with three top-level fields: + +- `title` — `"MaveDB public data"` +- `asOf` — ISO 8601 UTC timestamp indicating when this dump was generated +- `experimentSets` — Array of experiment set objects, each containing nested experiments and score + sets with full metadata (targets, publications, licenses, contributors, etc.) + +The hierarchy mirrors the MaveDB data model: each **ExperimentSet** contains one or more +**Experiments**, each of which contains one or more **ScoreSets**. 
+ +Score set metadata includes the `datasetColumns` field, which lists the names of the per-score-set +score and count columns that appear in the corresponding CSV files. + +### CSV column namespacing + +All CSV files exported from MaveDB use a namespaced column naming scheme. The namespace prefix +identifies which data source a column belongs to and is separated from the column name by a dot: + +| Prefix | Source | +|--------|--------| +| *(no prefix)* | Core identifiers — `accession`, `hgvs_nt`, `hgvs_pro`, `hgvs_splice` | +| `scores.` | Score columns defined by the score set author (e.g. `scores.score`) | +| `counts.` | Count columns defined by the score set author | +| `mavedb.` | Columns computed by the MaveDB mapping pipeline (post-mapped HGVS, VRS digest) | +| `vep.` | Ensembl Variant Effect Predictor annotations | +| `gnomad.` | gnomAD population frequency data | +| `clingen.` | ClinGen Allele Registry linkage | + +Missing or inapplicable values in all CSV files are represented as the string `NA`. + +### `csv/{urn}.scores.csv` + +Comma-separated file with variant effect scores. Contains the following fixed columns, followed by +score columns defined by each individual score set: + +| Column | Description | +|--------|-------------| +| `accession` | Full variant URN (e.g., `urn:mavedb:00000001-a-1#1`) | +| `hgvs_nt` | Assay-level nucleotide HGVS string in MAVE-HGVS format, if applicable | +| `hgvs_pro` | Assay-level protein HGVS string in MAVE-HGVS format, if applicable | +| `hgvs_splice` | Assay-level splice HGVS string in MAVE-HGVS format, if applicable | +| `scores.score` | The primary score column — always present | +| `scores.*` | Additional score columns defined by the score set author | + +The `hgvs_nt`, `hgvs_pro`, and `hgvs_splice` columns use **MAVE-HGVS format** — a constrained +subset of HGVS notation used by MaveDB. 
These strings are often expressed relative to the +assay's reference sequence (a transcript or protein), not the genome, and may not validate against +a standard HGVS parser. Score values are not normalized across score sets; each score set defines +its own scale and units. Refer to the score set's entry in `main.json` for the meaning of each +score column. + +### `csv/{urn}.counts.csv` + +Same structure as `scores.csv`, but with `counts.*` columns in place of score columns. Only +present for score sets that have count data. The count column names are listed in +`datasetColumns.countColumns` in `main.json`. + +### `csv/{urn}.annotations.csv` + +Variant annotation data from external databases, joined with post-mapped HGVS and VRS identifiers +produced by the MaveDB variant mapping pipeline. **Only present for score sets that have completed +the MaveDB mapping pipeline.** Exact columns: + +| Column | Description | +|--------|-------------| +| `accession` | Full variant URN — use this to join with `scores.csv` | +| `hgvs_nt` | Assay-level nucleotide HGVS (MAVE-HGVS format) | +| `hgvs_pro` | Assay-level protein HGVS (MAVE-HGVS format) | +| `hgvs_splice` | Assay-level splice HGVS (MAVE-HGVS format) | +| `mavedb.post_mapped_hgvs_g` | Post-mapped genomic HGVS on GRCh38 (g. notation) | +| `mavedb.post_mapped_hgvs_c` | Post-mapped coding HGVS (c. notation) | +| `mavedb.post_mapped_hgvs_p` | Post-mapped protein HGVS (p. notation) | +| `mavedb.post_mapped_hgvs_at_assay_level` | Post-mapped HGVS at the assay reference level (transcript or protein) | +| `mavedb.post_mapped_vrs_digest` | GA4GH VRS digest identifier for the post-mapped allele | +| `vep.vep_functional_consequence` | VEP functional consequence term (e.g. `missense_variant`) | +| `gnomad.gnomad_af` | gnomAD v4.1 allele frequency | +| `clingen.clingen_allele_id` | ClinGen Allele Registry CA identifier (e.g. 
`CA12345`) | + +Variants that could not be mapped, or for which a specific annotation is unavailable, will have +`NA` in the corresponding column. For multi-allelic variants (haplotypes), `mavedb.*` HGVS columns +will be `NA` because a single combined HGVS string cannot currently be derived. This may be updated in +a future release. + +### `mapped/{urn}.mapped-variants.json` + +A JSON array of mapped variant records. Each record corresponds to a single variant and contains +the same fields returned by `GET /api/v1/score-sets/{urn}/mapped-variants`: + +| Field | Description | +|-------|-------------| +| `variantUrn` | URN of the source variant — use this to join with `accession` in the CSV files | +| `preMapped` | VRS allele or haplotype using coordinates on the assay's reference sequence (transcript or protein accession) | +| `postMapped` | VRS allele or haplotype lifted over to GRCh38 genomic coordinates | +| `vrsVersion` | VRS schema version used to encode these objects (e.g., `"1.3"`, `"2.0"`) | +| `mappingApiVersion` | Version of the dcd_mapping service that produced this result | +| `mappedDate` | Date the mapping was produced | +| `modificationDate` | Date this mapping record was last modified | +| `current` | `true` if this is the active mapping for the variant; `false` for superseded mappings | +| `errorMessage` | Diagnostic message if mapping failed; `null` on success | +| `clingenAlleleId` | ClinGen Allele Registry identifier, if the variant has been registered | + +`preMapped` and `postMapped` are raw GA4GH VRS objects (JSON). The `type` field within them may be +`"Allele"`, `"Haplotype"`, or `"CisPhasedBlock"` depending on the variant. Records where mapping +failed will have `preMapped: null`, `postMapped: null`, and a non-null `errorMessage`. 
**Only +present for score sets that have completed the MaveDB mapping pipeline.** + +--- + +## Working with This Data + +### Joining files for a single score set + +All files for a given score set share the same variant identifier: + +- In CSV files: the `accession` column (e.g. `urn:mavedb:00000001-a-1#42`) +- In `mapped-variants.json`: the `variantUrn` field + +To combine scores with annotations or with VRS data, join on `accession` = `variantUrn`. + +### Linking files back to metadata + +A filename like `urn-mavedb-00000001-a-1.scores.csv` corresponds to the score set with +`"urn": "urn:mavedb:00000001-a-1"` in `main.json`. The filename prefix is the score set URN with +every colon (`:`) replaced by a hyphen (`-`). + +### Reconstructing score set metadata from `main.json` + +`main.json` contains the full metadata hierarchy. Score sets are nested inside experiments, which +are nested inside experiment sets. To find the metadata for a specific score set: + +```python +import json + +with open("main.json") as f: +    data = json.load(f) + +target_urn = "urn:mavedb:00000001-a-1" +score_set = next( +    ss +    for es in data["experimentSets"] +    for exp in es["experiments"] +    for ss in exp["scoreSets"] +    if ss["urn"] == target_urn +) +``` + +--- + +## Caveats + +- Only **published**, **CC0-licensed** data is included. Datasets with other licenses are not + present in this dump even if they are publicly visible on MaveDB. +- Annotation files (`.annotations.csv`) and mapped variant files (`.mapped-variants.json`) are + **only present for score sets that have been processed by the MaveDB variant mapping pipeline**. + Score sets that have not yet been mapped, or for which mapping failed entirely, will not have + these files. +- Mapping is applied per variant within a score set. A score set that has completed the mapping + pipeline may still contain individual variants with failed mappings.
Those variants have `NA` in + all `mavedb.*`, `vep.*`, `gnomad.*`, and `clingen.*` columns in the annotations CSV, and + `preMapped: null` / `postMapped: null` in the JSON. +- The `mapped/` JSON files include **all** mapping records, not only the most recent ones. When a + score set is remapped, the previous records are retained with `current: false`. For most use + cases, filter to records where `current` is `true`. Annotations are always reported with respect + to the current mapping object. +- gnomAD allele frequencies in `annotations.csv` are sourced from **gnomAD v4.1** specifically. +- `preMapped` VRS objects reference the assay's input sequence (a transcript or protein accession). + `postMapped` VRS objects are remapped to the **GRCh38** reference genome. Do not compare + coordinates between `preMapped` and `postMapped` directly. +- Assay-level HGVS strings (`hgvs_nt`, `hgvs_pro`, `hgvs_splice`) are in **MAVE-HGVS format**, a + constrained community convention that may not parse with a standard HGVS library. +- Score values are **not normalized** across score sets. Each score set defines its own scale, + range, and interpretation. A score of `1.0` in one score set has no defined relationship to a + score of `1.0` in another. +- The data in this dump reflects the state of MaveDB at the time of export, as recorded in the + `asOf` UTC timestamp in `main.json`. It may not reflect changes made after that time. + +--- + +## License + +All data in this archive is released under the +[Creative Commons CC0 1.0 Universal (CC0 1.0) Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/). + +See `LICENSE.txt` for the full license text.