Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 42 additions & 20 deletions src/dcd_mapping/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
Expression,
Haplotype,
LiteralSequenceExpression,
ReferenceLengthExpression,
Syntax,
)

Expand Down Expand Up @@ -511,6 +512,12 @@ def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]:
else:
syntax = Syntax.HGVS_G
syntax_value = "g"

# Reference-identical variants are represented as simple Alleles with a ReferenceLengthExpression state rather than being translated from HGVS.
# We can generate a simplified HGVS string for these variants without needing to perform a full translation.
if isinstance(allele.state, ReferenceLengthExpression):
return f"{accession}:{syntax_value}.=", syntax

start: int = allele.location.start
end: int = allele.location.end

Expand Down Expand Up @@ -587,12 +594,16 @@ def _annotate_allele_mapping(
pre_mapped: Allele = mapped_score.pre_mapped
post_mapped: Allele = mapped_score.post_mapped

# get vrs_ref_allele_seq for pre-mapped variants
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
pre_mapped, metadata, urn, tx_results
)
if ref_allele_seq_extension is not None:
pre_mapped.extensions = [ref_allele_seq_extension]
# get vrs_ref_allele_seq for pre-mapped variants if they aren't reference-identical variants, which have a ReferenceLengthExpression state
# and for which the vrs_ref_allele_seq would be redundant with the length and sequence reference information already present in the allele.
# We also want to avoid fetching the reference sequence for long reference-identical variants, as this can cause performance issues and the
# vrs_ref_allele_seq doesn't add much value in these cases.
if not isinstance(pre_mapped.state, ReferenceLengthExpression):
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
pre_mapped, metadata, urn, tx_results
)
if ref_allele_seq_extension is not None:
pre_mapped.extensions = [ref_allele_seq_extension]

if post_mapped:
# Determine reference sequence
Expand All @@ -614,10 +625,14 @@ def _annotate_allele_mapping(
sr = get_seqrepo()
loc = mapped_score.post_mapped.location
sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}"
ref = sr.get_sequence(sequence_id, loc.start, loc.end)
post_mapped.extensions = [
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
]

# Skip getting refereence sequence for RLE Alleles, see above for pre-mapped alleles.
if not isinstance(post_mapped.state, ReferenceLengthExpression):
ref = sr.get_sequence(sequence_id, loc.start, loc.end)
post_mapped.extensions = [
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
]

if accession:
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]
Expand Down Expand Up @@ -648,13 +663,14 @@ def _annotate_haplotype_mapping(
pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore
post_mapped: Haplotype = mapped_score.post_mapped # type: ignore

# get vrs_ref_allele_seq for pre-mapped variants
# see comment in _annotate_allele_mapping regarding why we skip getting vrs_ref_allele_seq for reference-identical variants.
for allele in pre_mapped.members:
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
allele, metadata, urn, tx_results
)
if ref_allele_seq_extension is not None:
allele.extensions = [ref_allele_seq_extension]
if not isinstance(allele.state, ReferenceLengthExpression):
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
allele, metadata, urn, tx_results
)
if ref_allele_seq_extension is not None:
allele.extensions = [ref_allele_seq_extension]

if post_mapped:
# Determine reference sequence
Expand All @@ -678,10 +694,16 @@ def _annotate_haplotype_mapping(
for allele in post_mapped.members:
loc = allele.location
sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}"
ref = sr.get_sequence(sequence_id, loc.start, loc.end) # TODO type issues??
allele.extensions = [
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
]

# Again, skip getting reference sequence for RLE Alleles.
if not isinstance(allele.state, ReferenceLengthExpression):
ref = sr.get_sequence(
sequence_id, loc.start, loc.end
) # TODO type issues??
allele.extensions = [
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
]

if accession:
hgvs, syntax = _get_hgvs_string(allele, accession)
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
Expand Down
63 changes: 63 additions & 0 deletions src/dcd_mapping/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@
from ga4gh.vrs._internal.models import (
Allele,
LiteralSequenceExpression,
ReferenceLengthExpression,
SequenceLocation,
SequenceReference,
)
from ga4gh.vrs.dataproxy import SeqRepoDataProxy, coerce_namespace
from ga4gh.vrs.extras.translator import AlleleTranslator
Expand Down Expand Up @@ -70,7 +72,9 @@
"get_ucsc_chromosome_name",
"get_chromosome_identifier_from_vrs_id",
"get_sequence",
"build_ref_identical_allele",
"translate_hgvs_to_vrs",
"translate_ref_identical_to_vrs",
"get_mane_transcripts",
"get_uniprot_sequence",
]
Expand Down Expand Up @@ -590,6 +594,65 @@ def translate_hgvs_to_vrs(hgvs: str) -> Allele:
return allele


def build_ref_identical_allele(sequence_id: str) -> Allele:
"""Build a whole-sequence reference-identical VRS Allele from a sequence identifier.

Accepts either a GA4GH SQ digest (``SQ.xxx``, without the ``ga4gh:`` prefix) or a
named accession such as ``NP_``, ``NM_``, or ``NC_``.

:param sequence_id: GA4GH SQ digest or named accession
:return: VRS Allele spanning the full sequence with a ReferenceLengthExpression state
:raises DataLookupError: if the sequence identifier or metadata lookup fails
"""
if sequence_id.startswith("SQ."):
ga4gh_id = f"ga4gh:{sequence_id}"
else:
try:
ga4gh_id = get_vrs_id_from_identifier(sequence_id)
except KeyError as e:
msg = f"Could not retrieve GA4GH identifier for accession {sequence_id!r}"
_logger.error(msg)
raise DataLookupError(msg) from e

sr = get_seqrepo()
try:
metadata = sr.get_metadata(ga4gh_id)
except KeyError as e:
msg = f"Could not retrieve metadata for sequence {ga4gh_id!r}"
_logger.error(msg)
raise DataLookupError(msg) from e

length = metadata["length"]
refget_accession = ga4gh_id.split("ga4gh:")[-1]

seq_ref = SequenceReference(refgetAccession=refget_accession)
location = SequenceLocation(sequenceReference=seq_ref, start=0, end=length)
state = ReferenceLengthExpression(length=length, repeatSubunitLength=length)

return Allele(location=location, state=state)


def translate_ref_identical_to_vrs(hgvs_string: str) -> Allele:
"""Convert a reference-identical HGVS variant to a VRS Allele.

Handles reference-identical variants such as ``NM_001234.1:c.=``,
``NP_001234.1:p.=``, and ``NC_000001.11:g.=``, which regular VRS
translation does not support. Returns an Allele with a
``ReferenceLengthExpression`` state spanning the full reference sequence.

:param hgvs_string: HGVS reference-identical variant string (e.g. ``NM_001234.1:c.=``)
:return: VRS Allele spanning the full reference sequence
:raises ValueError: if ``hgvs_string`` is not a valid reference-identical HGVS expression
:raises DataLookupError: if the sequence identifier or metadata lookup fails
"""
if ":" not in hgvs_string or not hgvs_string.endswith(".="):
msg = f"Not a reference-identical HGVS expression: {hgvs_string!r}"
raise ValueError(msg)

accession = hgvs_string.split(":")[0]
return build_ref_identical_allele(accession)


# ----------------------------------- MANE ----------------------------------- #


Expand Down
64 changes: 49 additions & 15 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@
UnsupportedReferenceSequencePrefixError,
)
from dcd_mapping.lookup import (
build_ref_identical_allele,
cdot_rest,
get_chromosome_identifier,
get_seqrepo,
translate_hgvs_to_vrs,
translate_ref_identical_to_vrs,
)
from dcd_mapping.resource_utils import is_missing_value, request_with_backoff
from dcd_mapping.schemas import (
Expand Down Expand Up @@ -80,6 +82,10 @@ def is_intronic_variant(variant: Variant) -> bool:
"""Return True if given Variant is intronic, otherwise return False.
Supports single or multi-position variants.
"""
# ref identical variants with no positions should not be considered intronic
if variant.positions is None:
return False

if isinstance(variant.positions, Iterable):
if any(position.is_intronic() for position in variant.positions):
return True
Expand Down Expand Up @@ -218,6 +224,27 @@ def _create_post_mapped_hgvs_strings(
msg = f"Variant is intronic and cannot be processed: {variant}"
raise ValueError(msg)

# Reference-identical variants require no positional adjustment
if str(variant).endswith(".="):
if layer is AnnotationLayer.PROTEIN:
assert tx # noqa: S101. mypy help
hgvs_strings.append(tx.np + ":" + str(variant))

elif layer is AnnotationLayer.GENOMIC:
if accession_id:
hgvs_strings.append(accession_id + ":" + str(variant))
else:
assert alignment # noqa: S101. mypy help
hgvs_strings.append(
get_chromosome_identifier(alignment.chrom) + ":" + str(variant)
)

else:
msg = f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}"
raise ValueError(msg)

continue

if layer is AnnotationLayer.PROTEIN:
assert tx # noqa: S101. mypy help

Expand Down Expand Up @@ -417,11 +444,7 @@ def _map_protein_coding_pro(
:param protein_align_result: The protein-protein alignment result for a target against its selected protein reference
:return: VRS mapping object if mapping succeeds
"""
if (
row.hgvs_pro in {"_wt", "_sy"}
or is_missing_value(row.hgvs_pro)
or len(row.hgvs_pro) == 3
):
if row.hgvs_pro in {"_wt", "_sy"} or is_missing_value(row.hgvs_pro):
_logger.warning(
"Can't process variant syntax %s for %s", row.hgvs_pro, row.accession
)
Expand All @@ -443,7 +466,6 @@ def _map_protein_coding_pro(
error_message="Protein frameshift variants are not supported",
)

# TODO: Handle edge cases without hardcoding URNs.
# Special case for experiment set urn:mavedb:0000097
if row.hgvs_pro.startswith("NP_009225.1:p."):
vrs_variation = translate_hgvs_to_vrs(row.hgvs_pro)
Expand Down Expand Up @@ -545,7 +567,6 @@ def _map_genomic(
row.hgvs_nt in {"_wt", "_sy", "="}
or "fs"
in row.hgvs_nt # TODO I think this line can be taken out, I don't think fs nomenclature can be used for nt anyway
or len(row.hgvs_nt) == 3
):
_logger.warning(
"Can't process variant syntax %s for %s", row.hgvs_nt, row.accession
Expand Down Expand Up @@ -743,11 +764,7 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool:
:param hgvs_nt: MAVE_HGVS nucleotide expression
:return: True if expression appears populated and valid
"""
return (
(not is_missing_value(hgvs_nt))
and (hgvs_nt not in {"_wt", "_sy", "="})
and (len(hgvs_nt) != 3)
)
return (not is_missing_value(hgvs_nt)) and (hgvs_nt not in {"_wt", "_sy", "="})


def _hgvs_pro_is_valid(hgvs_pro: str) -> bool:
Expand All @@ -759,7 +776,6 @@ def _hgvs_pro_is_valid(hgvs_pro: str) -> bool:
return (
(hgvs_pro not in {"_wt", "_sy"})
and (not is_missing_value(hgvs_pro))
and (len(hgvs_pro) != 3)
and ("fs" not in hgvs_pro)
)

Expand Down Expand Up @@ -939,7 +955,26 @@ def _construct_vrs_allele(
) -> Allele | Haplotype:
alleles: list[Allele] = []
for hgvs_string in hgvs_strings:
_logger.info("Processing HGVS string: %s", hgvs_string)
_logger.debug("Processing HGVS string: %s", hgvs_string)

# Special handling for reference-identical variants, which must be represented as simple Alleles with a
# ReferenceLengthExpression state rather than beeing translated from HGVS. This translation is
# currently unsupported by ga4gh hgvs_tools and will raise an error if attempted.
if hgvs_string.endswith(".="):
if pre_map:
if sequence_id is None:
msg = "Must provide sequence id to construct pre-mapped ref-identical VRS allele"
raise ValueError(msg)
allele = build_ref_identical_allele(sequence_id)
else:
allele = translate_ref_identical_to_vrs(hgvs_string)

normalize(allele, data_proxy=get_seqrepo())

allele.id = ga4gh_identify(allele)
alleles.append(allele)
continue

allele = translate_hgvs_to_vrs(hgvs_string)

if pre_map:
Expand All @@ -953,7 +988,6 @@ def _construct_vrs_allele(
if "dup" in hgvs_string:
allele.state.sequence = SequenceString(2 * _get_allele_sequence(allele))

# TODO check assumption that c.= leads to an "N" in the sequence.root
if allele.state.sequence.root == "N" and layer == AnnotationLayer.GENOMIC:
allele.state.sequence = SequenceString(_get_allele_sequence(allele))

Expand Down
Loading