Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 32 additions & 22 deletions src/mavedb/scripts/export_public_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,8 @@
python3 -m mavedb.scripts.export_public_data
```

This generates a ZIP archive named `mavedb-dump.zip` in the working directory. The ZIP file has the following contents:
- main.json: A JSON file providing metadata for all of the published experiment sets, experiments, and score sets
- LICENSE.txt: The text of the Creative Commons Zero license, which applies to all data included in the dump.
- variants/
- [URN].counts.csv (for each variant URN): The score set's variant count columns,
sorted by variant number
- [URN].scores.csv (for each variant URN): The score set's variant score columns,
sorted by variant number
- [URN].annotations.csv (for each variant URN with mapped variants): The score set's variant annotations, sorted by
variant number. This file is only included for score sets with mapped variants, and includes VEP, gnomAD, and ClinGen annotations.

In the exported JSON metadata, the root object's `experimentSets` property gives an array of experiment sets.
Experiments are nested in their parent experiment sets, and score sets in their parent experiments.

The variant URNs used in filenames do not include the `urn:mavedb:` scheme identifier, so they look like
`00000001-a-1.counts.csv` and `00000001-a-1.scores.csv`, for instance.
This generates a ZIP archive named `mavedb-dump.YYYYMMDDHHMMSS.zip` in the working directory.
See `src/mavedb/scripts/resources/README.md` for a full description of the archive contents and file formats.

Unpublished data and data sets licensed other than under the Creative Commons Zero license are not included in the dump,
and user details are limited to ORCID IDs and names of contributors to published data sets.
Expand All @@ -37,7 +23,7 @@

from fastapi.encoders import jsonable_encoder
from sqlalchemy import select
from sqlalchemy.orm import Session, lazyload
from sqlalchemy.orm import Session, joinedload, lazyload

from mavedb.lib.score_sets import get_score_set_variants_as_csv
from mavedb.models.experiment import Experiment
Expand All @@ -47,6 +33,7 @@
from mavedb.models.score_set import ScoreSet
from mavedb.models.variant import Variant
from mavedb.scripts.environment import script_environment, with_database_session
from mavedb.view_models import mapped_variant as mapped_variant_vm
from mavedb.view_models.experiment_set import ExperimentSetPublicDump

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -114,6 +101,7 @@ def export_public_data(db: Session):
# Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score
# sets.
experiment_sets = list(filter_experiment_sets(experiment_sets_query.all()))
logger.info(f"Found {len(experiment_sets)} published experiment sets with CC0-licensed score sets.")

# TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator.
# Issue: https://github.com/VariantEffect/mavedb-api/issues/192
Expand All @@ -129,7 +117,7 @@ def export_public_data(db: Session):
timestamp_format = "%Y%m%d%H%M%S"
zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip"

logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json")
logger.info(f"Writing {zip_file_name} with {len(score_set_ids)} score sets.")
json_data = {
"title": "MaveDB public data",
"asOf": datetime.now(timezone.utc).isoformat(),
Expand All @@ -140,21 +128,23 @@ def export_public_data(db: Session):
# Write metadata for all data sets to a single JSON file.
zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data)))

# Copy the CC0 license.
zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt")
# Copy the CC0 license and README.
resources_dir = os.path.join(os.path.dirname(__file__), "resources")
zipfile.write(os.path.join(resources_dir, "CC0_license.txt"), "LICENSE.txt")
zipfile.write(os.path.join(resources_dir, "README.md"), "README.md")

# Write score and count files for each score set.
num_score_sets = len(score_set_ids)
for i, score_set_id in enumerate(score_set_ids):
score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none()
if score_set is not None and score_set.urn is not None:
logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
logger.info(f"[{i + 1}/{num_score_sets}] Exporting score set {score_set.urn}")
csv_filename_base = score_set.urn.replace(":", "-")

csv_str = get_score_set_variants_as_csv(db, score_set, ["scores"], namespaced=True)
zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)

# Only generate the annotations CSV if mapped variants exist in the score set.
# Only generate annotation files if mapped variants exist in the score set.
has_annotations = (
db.scalars(
select(ScoreSet).where(ScoreSet.id == score_set_id).join(Variant).join(MappedVariant).limit(1)
Expand All @@ -167,12 +157,32 @@ def export_public_data(db: Session):
)
zipfile.writestr(f"csv/{csv_filename_base}.annotations.csv", csv_str)

# Write mapped variants JSON — mirrors GET /api/v1/score-sets/{urn}/mapped-variants.
mapped_variants = db.scalars(
select(MappedVariant)
.join(Variant, Variant.id == MappedVariant.variant_id)
.options(joinedload(MappedVariant.variant))
.where(Variant.score_set_id == score_set_id)
).all()
mapped_variant_views = [
mapped_variant_vm.MappedVariant.model_validate(mv) for mv in mapped_variants
]
zipfile.writestr(
f"mapped/{csv_filename_base}.mapped-variants.json",
json.dumps(jsonable_encoder(mapped_variant_views)),
)
logger.info(
f"[{i + 1}/{num_score_sets}] Wrote annotations + {len(mapped_variants)} mapped variants"
)

# Only generate the counts CSV if count columns are present.
count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
if count_columns and len(count_columns) > 0:
csv_str = get_score_set_variants_as_csv(db, score_set, ["counts"], namespaced=True)
zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)

logger.info(f"Export complete: {zip_file_name}")


if __name__ == "__main__":
export_public_data()
236 changes: 236 additions & 0 deletions src/mavedb/scripts/resources/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# MaveDB Public Data Dump

This archive contains a snapshot of publicly accessible variant effect data from MaveDB.
The `asOf` field in `main.json` records the exact date and time this dump was generated.

### Useful links
- **MaveDB website:** https://www.mavedb.org
- **API documentation:** https://api.mavedb.org/docs
- **MaveDB documentation:** https://mavedb.org/docs/mavedb/index.html
- **Source code:**
- https://github.com/VariantEffect/mavedb-api
- https://github.com/VariantEffect/mavedb-ui
- https://github.com/VariantEffect/dcd_mapping2

---

## What's Included

This dump includes only data that is:

- **Published** — publicly released on MaveDB
- **CC0-licensed** — released under the Creative Commons CC0 1.0 Public Domain Dedication

Unpublished data, private datasets, and datasets published under other licenses are excluded.

---

## Archive Structure

```
mavedb-dump.YYYYMMDDHHMMSS.zip
├── README.md # This file
├── LICENSE.txt # Creative Commons CC0 1.0 license text
├── main.json # Metadata for all included datasets
├── csv/
│ ├── {urn}.scores.csv # Variant effect scores (all score sets)
│ ├── {urn}.counts.csv # Variant counts (score sets with count data only)
│ └── {urn}.annotations.csv # Variant annotations from VEP, gnomAD, and ClinGen
│ # (score sets that have completed mapping only)
└── mapped/
└── {urn}.mapped-variants.json # Mapped variant data including VRS alleles and HGVS
# (score sets that have completed mapping only)
```

`{urn}` is the score set URN with colons replaced by hyphens, e.g., `urn-mavedb-00000001-a-1`.

---

## File Descriptions

### `main.json`

A JSON object containing MaveDB metadata with three top-level fields:

- `title` — `"MaveDB public data"`
- `asOf` — ISO 8601 UTC timestamp indicating when this dump was generated
- `experimentSets` — Array of experiment set objects, each containing nested experiments and score
sets with full metadata (targets, publications, licenses, contributors, etc.)

The hierarchy mirrors the MaveDB data model: each **ExperimentSet** contains one or more
**Experiments**, each of which contains one or more **ScoreSets**.

Score set metadata includes the `datasetColumns` field, which lists the names of the per-score-set
score and count columns that appear in the corresponding CSV files.

### CSV column namespacing

All CSV files exported from MaveDB use a namespaced column naming scheme. The namespace prefix
identifies which data source a column belongs to and is separated from the column name by a dot:

| Prefix | Source |
|--------|--------|
| *(no prefix)* | Core identifiers — `accession`, `hgvs_nt`, `hgvs_pro`, `hgvs_splice` |
| `scores.` | Score columns defined by the score set author (e.g. `scores.score`) |
| `counts.` | Count columns defined by the score set author |
| `mavedb.` | Columns computed by the MaveDB mapping pipeline (post-mapped HGVS, VRS digest) |
| `vep.` | Ensembl Variant Effect Predictor annotations |
| `gnomad.` | gnomAD population frequency data |
| `clingen.` | ClinGen Allele Registry linkage |

Missing or inapplicable values in all CSV files are represented as the string `NA`.

### `csv/{urn}.scores.csv`

Comma-separated file with variant effect scores. Contains the following fixed columns, followed by
score columns defined by each individual score set:

| Column | Description |
|--------|-------------|
| `accession` | Full variant URN (e.g., `urn:mavedb:00000001-a-1#1`) |
| `hgvs_nt` | Assay-level nucleotide HGVS string in MAVE-HGVS format, if applicable |
| `hgvs_pro` | Assay-level protein HGVS string in MAVE-HGVS format, if applicable |
| `hgvs_splice` | Assay-level splice HGVS string in MAVE-HGVS format, if applicable |
| `scores.score` | The primary score column — always present |
| `scores.*` | Additional score columns defined by the score set author |

The `hgvs_nt`, `hgvs_pro`, and `hgvs_splice` columns use **MAVE-HGVS format** — a constrained
subset of HGVS notation used by MaveDB. These strings are often expressed relative to the
assay's reference sequence (a transcript or protein), not the genome, and may not validate against
a standard HGVS parser. Score values are not normalized across score sets; each score set defines
its own scale and units. Refer to the score set's entry in `main.json` for the meaning of each
score column.

### `csv/{urn}.counts.csv`

Same structure as `scores.csv`, but with `counts.*` columns in place of score columns. Only
present for score sets that have count data. The count column names are listed in
`datasetColumns.countColumns` in `main.json`.

### `csv/{urn}.annotations.csv`

Variant annotation data from external databases, joined with post-mapped HGVS and VRS identifiers
produced by the MaveDB variant mapping pipeline. **Only present for score sets that have completed
the MaveDB mapping pipeline.** Exact columns:

| Column | Description |
|--------|-------------|
| `accession` | Full variant URN — use this to join with `scores.csv` |
| `hgvs_nt` | Assay-level nucleotide HGVS (MAVE-HGVS format) |
| `hgvs_pro` | Assay-level protein HGVS (MAVE-HGVS format) |
| `hgvs_splice` | Assay-level splice HGVS (MAVE-HGVS format) |
| `mavedb.post_mapped_hgvs_g` | Post-mapped genomic HGVS on GRCh38 (g. notation) |
| `mavedb.post_mapped_hgvs_c` | Post-mapped coding HGVS (c. notation) |
| `mavedb.post_mapped_hgvs_p` | Post-mapped protein HGVS (p. notation) |
| `mavedb.post_mapped_hgvs_at_assay_level` | Post-mapped HGVS at the assay reference level (transcript or protein) |
| `mavedb.post_mapped_vrs_digest` | GA4GH VRS digest identifier for the post-mapped allele |
| `vep.vep_functional_consequence` | VEP functional consequence term (e.g. `missense_variant`) |
| `gnomad.gnomad_af` | gnomAD v4.1 allele frequency |
| `clingen.clingen_allele_id` | ClinGen Allele Registry CA identifier (e.g. `CA12345`) |

Variants that could not be mapped, or for which a specific annotation is unavailable, will have
`NA` in the corresponding column. For multi-allelic variants (haplotypes), `mavedb.*` HGVS columns
will be `NA` because a single combined HGVS string cannot currently be derived. This may be updated in
a future release.

### `mapped/{urn}.mapped-variants.json`

A JSON array of mapped variant records. Each record corresponds to a single variant and contains
the same fields returned by `GET /api/v1/score-sets/{urn}/mapped-variants`:

| Field | Description |
|-------|-------------|
| `variantUrn` | URN of the source variant — use this to join with `accession` in the CSV files |
| `preMapped` | VRS allele or haplotype using coordinates on the assay's reference sequence (transcript or protein accession) |
| `postMapped` | VRS allele or haplotype lifted over to GRCh38 genomic coordinates |
| `vrsVersion` | VRS schema version used to encode these objects (e.g., `"1.3"`, `"2.0"`) |
| `mappingApiVersion` | Version of the dcd_mapping service that produced this result |
| `mappedDate` | Date the mapping was produced |
| `modificationDate` | Date this mapping record was last modified |
| `current` | `true` if this is the active mapping for the variant; `false` for superseded mappings |
| `errorMessage` | Diagnostic message if mapping failed; `null` on success |
| `clingenAlleleId` | ClinGen Allele Registry identifier, if the variant has been registered |

`preMapped` and `postMapped` are raw GA4GH VRS objects (JSON). The `type` field within them may be
`"Allele"`, `"Haplotype"`, or `"CisPhasedBlock"` depending on the variant. Records where mapping
failed will have `preMapped: null`, `postMapped: null`, and a non-null `errorMessage`. **Only
present for score sets that have completed the MaveDB mapping pipeline.**

---

## Working with this data

### Joining files for a single score set

All files for a given score set share the same variant identifier:

- In CSV files: the `accession` column (e.g. `urn:mavedb:00000001-a-1#42`)
- In `mapped-variants.json`: the `variantUrn` field

To combine scores with annotations or with VRS data, join on `accession` = `variantUrn`.

### Linking files back to metadata

A filename like `urn-mavedb-00000001-a-1.scores.csv` corresponds to the score set with
`"urn": "urn:mavedb:00000001-a-1"` in `main.json`. The filename prefix is the score set URN with
every colon (`:`) replaced by a hyphen (`-`).

### Reconstructing score set metadata from `main.json`

`main.json` contains the full metadata hierarchy. Score sets are nested inside experiments, which
are nested inside experiment sets. To find the metadata for a specific score set:

```python
import json

with open("main.json") as f:
data = json.load(f)

target_urn = "urn:mavedb:00000001-a-1"
score_set = next(
ss
for es in data["experimentSets"]
for exp in es["experiments"]
for ss in exp["scoreSets"]
if ss["urn"] == target_urn
)
```

---

## Caveats

- Only **published**, **CC0-licensed** data is included. Datasets with other licenses are not
present in this dump even if they are publicly visible on MaveDB.
- Annotation files (`.annotations.csv`) and mapped variant files (`.mapped-variants.json`) are
**only present for score sets that have been processed by the MaveDB variant mapping pipeline**.
Score sets that have not yet been mapped, or for which mapping failed entirely, will not have
these files.
- Mapping is applied per variant within a score set. A score set that has completed the mapping
pipeline may still contain individual variants with failed mappings. Those variants have `NA` in
all `mavedb.*`, `vep.*`, `gnomad.*`, and `clingen.*` columns in the annotations CSV, and
`preMapped: null` / `postMapped: null` in the JSON.
- The `mapped/` JSON files include **all** mapping records, not only the most recent ones. When a
score set is remapped, the previous records are retained with `current: false`. For most use
cases, filter to records where `current` is `true`. Annotations are always reported with respect
to the current mapping object.
- gnomAD allele frequencies in `annotations.csv` are sourced from **gnomAD v4.1** specifically.
- `preMapped` VRS objects reference the assay's input sequence (a transcript or protein accession).
`postMapped` VRS objects are remapped to the **GRCh38** reference genome. Do not compare
coordinates between `preMapped` and `postMapped` directly.
- Assay-level HGVS strings (`hgvs_nt`, `hgvs_pro`, `hgvs_splice`) are in **MAVE-HGVS format**, a
constrained community convention that may not parse with a standard HGVS library.
- Score values are **not normalized** across score sets. Each score set defines its own scale,
range, and interpretation. A score of `1.0` in one score set has no defined relationship to a
score of `1.0` in another.
- The data in this dump reflects the state of MaveDB at the time of export, as recorded in the
`asOf` UTC timestamp in `main.json`. It may not reflect changes made after that time.

---

## License

All data in this archive is released under the
[Creative Commons CC0 1.0 Universal (CC0 1.0) Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/).

See `LICENSE.txt` for the full license text.
Loading