diff --git a/src/dcd_mapping/annotate.py b/src/dcd_mapping/annotate.py index b37e238..941f45f 100644 --- a/src/dcd_mapping/annotate.py +++ b/src/dcd_mapping/annotate.py @@ -19,6 +19,7 @@ Expression, Haplotype, LiteralSequenceExpression, + ReferenceLengthExpression, Syntax, ) @@ -511,6 +512,12 @@ def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]: else: syntax = Syntax.HGVS_G syntax_value = "g" + + # Reference-identical variants are represented as simple Alleles with a ReferenceLengthExpression state rather than being translated from HGVS. + # We can generate a simplified HGVS string for these variants without needing to perform a full translation. + if isinstance(allele.state, ReferenceLengthExpression): + return f"{accession}:{syntax_value}.=", syntax + start: int = allele.location.start end: int = allele.location.end @@ -587,12 +594,16 @@ def _annotate_allele_mapping( pre_mapped: Allele = mapped_score.pre_mapped post_mapped: Allele = mapped_score.post_mapped - # get vrs_ref_allele_seq for pre-mapped variants - ref_allele_seq_extension = _get_vrs_ref_allele_seq( - pre_mapped, metadata, urn, tx_results - ) - if ref_allele_seq_extension is not None: - pre_mapped.extensions = [ref_allele_seq_extension] + # get vrs_ref_allele_seq for pre-mapped variants if they aren't reference-identical variants, which have a ReferenceLengthExpression state + # and for which the vrs_ref_allele_seq would be redundant with the length and sequence reference information already present in the allele. + # We also want to avoid fetching the reference sequence for long reference-identical variants, as this can cause performance issues and the + # vrs_ref_allele_seq doesn't add much value in these cases. + if not isinstance(pre_mapped.state, ReferenceLengthExpression): + ref_allele_seq_extension = _get_vrs_ref_allele_seq( + pre_mapped, metadata, urn, tx_results + ) + if ref_allele_seq_extension is not None: + pre_mapped.extensions = [ref_allele_seq_extension] if post_mapped: # Determine reference sequence @@ -614,10 +625,14 @@ def _annotate_allele_mapping( sr = get_seqrepo() loc = mapped_score.post_mapped.location sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" - ref = sr.get_sequence(sequence_id, loc.start, loc.end) - post_mapped.extensions = [ - Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) - ] + + # Skip getting refereence sequence for RLE Alleles, see above for pre-mapped alleles. + if not isinstance(post_mapped.state, ReferenceLengthExpression): + ref = sr.get_sequence(sequence_id, loc.start, loc.end) + post_mapped.extensions = [ + Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) + ] + if accession: hgvs_string, syntax = _get_hgvs_string(post_mapped, accession) post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)] @@ -648,13 +663,14 @@ def _annotate_haplotype_mapping( pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore post_mapped: Haplotype = mapped_score.post_mapped # type: ignore - # get vrs_ref_allele_seq for pre-mapped variants + # see comment in _annotate_allele_mapping regarding why we skip getting vrs_ref_allele_seq for reference-identical variants. for allele in pre_mapped.members: - ref_allele_seq_extension = _get_vrs_ref_allele_seq( - allele, metadata, urn, tx_results - ) - if ref_allele_seq_extension is not None: - allele.extensions = [ref_allele_seq_extension] + if not isinstance(allele.state, ReferenceLengthExpression): + ref_allele_seq_extension = _get_vrs_ref_allele_seq( + allele, metadata, urn, tx_results + ) + if ref_allele_seq_extension is not None: + allele.extensions = [ref_allele_seq_extension] if post_mapped: # Determine reference sequence @@ -678,10 +694,16 @@ def _annotate_haplotype_mapping( for allele in post_mapped.members: loc = allele.location sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}" - ref = sr.get_sequence(sequence_id, loc.start, loc.end) # TODO type issues?? - allele.extensions = [ - Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) - ] + + # Again, skip getting reference sequence for RLE Alleles. + if not isinstance(allele.state, ReferenceLengthExpression): + ref = sr.get_sequence( + sequence_id, loc.start, loc.end + ) # TODO type issues?? + allele.extensions = [ + Extension(type="Extension", name="vrs_ref_allele_seq", value=ref) + ] + if accession: hgvs, syntax = _get_hgvs_string(allele, accession) allele.expressions = [Expression(syntax=syntax, value=hgvs)] diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py index 17b9cc3..f4fd2fa 100644 --- a/src/dcd_mapping/lookup.py +++ b/src/dcd_mapping/lookup.py @@ -41,7 +41,9 @@ from ga4gh.vrs._internal.models import ( Allele, LiteralSequenceExpression, + ReferenceLengthExpression, SequenceLocation, + SequenceReference, ) from ga4gh.vrs.dataproxy import SeqRepoDataProxy, coerce_namespace from ga4gh.vrs.extras.translator import AlleleTranslator @@ -70,7 +72,9 @@ "get_ucsc_chromosome_name", "get_chromosome_identifier_from_vrs_id", "get_sequence", + "build_ref_identical_allele", "translate_hgvs_to_vrs", + "translate_ref_identical_to_vrs", "get_mane_transcripts", "get_uniprot_sequence", ] @@ -590,6 +594,65 @@ def translate_hgvs_to_vrs(hgvs: str) -> Allele: return allele +def build_ref_identical_allele(sequence_id: str) -> Allele: + """Build a whole-sequence reference-identical VRS Allele from a sequence identifier. + + Accepts either a GA4GH SQ digest (``SQ.xxx``, without the ``ga4gh:`` prefix) or a + named accession such as ``NP_``, ``NM_``, or ``NC_``. + + :param sequence_id: GA4GH SQ digest or named accession + :return: VRS Allele spanning the full sequence with a ReferenceLengthExpression state + :raises DataLookupError: if the sequence identifier or metadata lookup fails + """ + if sequence_id.startswith("SQ."): + ga4gh_id = f"ga4gh:{sequence_id}" + else: + try: + ga4gh_id = get_vrs_id_from_identifier(sequence_id) + except KeyError as e: + msg = f"Could not retrieve GA4GH identifier for accession {sequence_id!r}" + _logger.error(msg) + raise DataLookupError(msg) from e + + sr = get_seqrepo() + try: + metadata = sr.get_metadata(ga4gh_id) + except KeyError as e: + msg = f"Could not retrieve metadata for sequence {ga4gh_id!r}" + _logger.error(msg) + raise DataLookupError(msg) from e + + length = metadata["length"] + refget_accession = ga4gh_id.split("ga4gh:")[-1] + + seq_ref = SequenceReference(refgetAccession=refget_accession) + location = SequenceLocation(sequenceReference=seq_ref, start=0, end=length) + state = ReferenceLengthExpression(length=length, repeatSubunitLength=length) + + return Allele(location=location, state=state) + + +def translate_ref_identical_to_vrs(hgvs_string: str) -> Allele: + """Convert a reference-identical HGVS variant to a VRS Allele. + + Handles reference-identical variants such as ``NM_001234.1:c.=``, + ``NP_001234.1:p.=``, and ``NC_000001.11:g.=``, which regular VRS + translation does not support. Returns an Allele with a + ``ReferenceLengthExpression`` state spanning the full reference sequence. + + :param hgvs_string: HGVS reference-identical variant string (e.g. ``NM_001234.1:c.=``) + :return: VRS Allele spanning the full reference sequence + :raises ValueError: if ``hgvs_string`` is not a valid reference-identical HGVS expression + :raises DataLookupError: if the sequence identifier or metadata lookup fails + """ + if ":" not in hgvs_string or not hgvs_string.endswith(".="): + msg = f"Not a reference-identical HGVS expression: {hgvs_string!r}" + raise ValueError(msg) + + accession = hgvs_string.split(":")[0] + return build_ref_identical_allele(accession) + + # ----------------------------------- MANE ----------------------------------- # diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 4d4be1c..f2bf22a 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -28,10 +28,12 @@ UnsupportedReferenceSequencePrefixError, ) from dcd_mapping.lookup import ( + build_ref_identical_allele, cdot_rest, get_chromosome_identifier, get_seqrepo, translate_hgvs_to_vrs, + translate_ref_identical_to_vrs, ) from dcd_mapping.resource_utils import is_missing_value, request_with_backoff from dcd_mapping.schemas import ( @@ -80,6 +82,10 @@ def is_intronic_variant(variant: Variant) -> bool: """Return True if given Variant is intronic, otherwise return False. Supports single or multi-position variants. """ + # ref identical variants with no positions should not be considered intronic + if variant.positions is None: + return False + if isinstance(variant.positions, Iterable): if any(position.is_intronic() for position in variant.positions): return True @@ -218,6 +224,27 @@ def _create_post_mapped_hgvs_strings( msg = f"Variant is intronic and cannot be processed: {variant}" raise ValueError(msg) + # Reference-identical variants require no positional adjustment + if str(variant).endswith(".="): + if layer is AnnotationLayer.PROTEIN: + assert tx # noqa: S101. mypy help + hgvs_strings.append(tx.np + ":" + str(variant)) + + elif layer is AnnotationLayer.GENOMIC: + if accession_id: + hgvs_strings.append(accession_id + ":" + str(variant)) + else: + assert alignment # noqa: S101. mypy help + hgvs_strings.append( + get_chromosome_identifier(alignment.chrom) + ":" + str(variant) + ) + + else: + msg = f"Could not generate HGVS strings for invalid AnnotationLayer: {layer}" + raise ValueError(msg) + + continue + if layer is AnnotationLayer.PROTEIN: assert tx # noqa: S101. mypy help @@ -417,11 +444,7 @@ def _map_protein_coding_pro( :param protein_align_result: The protein-protein alignment result for a target against its selected protein reference :return: VRS mapping object if mapping succeeds """ - if ( - row.hgvs_pro in {"_wt", "_sy"} - or is_missing_value(row.hgvs_pro) - or len(row.hgvs_pro) == 3 - ): + if row.hgvs_pro in {"_wt", "_sy"} or is_missing_value(row.hgvs_pro): _logger.warning( "Can't process variant syntax %s for %s", row.hgvs_pro, row.accession ) @@ -443,7 +466,6 @@ def _map_protein_coding_pro( error_message="Protein frameshift variants are not supported", ) - # TODO: Handle edge cases without hardcoding URNs. # Special case for experiment set urn:mavedb:0000097 if row.hgvs_pro.startswith("NP_009225.1:p."): vrs_variation = translate_hgvs_to_vrs(row.hgvs_pro) @@ -545,7 +567,6 @@ def _map_genomic( row.hgvs_nt in {"_wt", "_sy", "="} or "fs" in row.hgvs_nt # TODO I think this line can be taken out, I don't think fs nomenclature can be used for nt anyway - or len(row.hgvs_nt) == 3 ): _logger.warning( "Can't process variant syntax %s for %s", row.hgvs_nt, row.accession @@ -743,11 +764,7 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool: :param hgvs_nt: MAVE_HGVS nucleotide expression :return: True if expression appears populated and valid """ - return ( - (not is_missing_value(hgvs_nt)) - and (hgvs_nt not in {"_wt", "_sy", "="}) - and (len(hgvs_nt) != 3) - ) + return (not is_missing_value(hgvs_nt)) and (hgvs_nt not in {"_wt", "_sy", "="}) def _hgvs_pro_is_valid(hgvs_pro: str) -> bool: @@ -759,7 +776,6 @@ def _hgvs_pro_is_valid(hgvs_pro: str) -> bool: return ( (hgvs_pro not in {"_wt", "_sy"}) and (not is_missing_value(hgvs_pro)) - and (len(hgvs_pro) != 3) and ("fs" not in hgvs_pro) ) @@ -939,7 +955,26 @@ def _construct_vrs_allele( ) -> Allele | Haplotype: alleles: list[Allele] = [] for hgvs_string in hgvs_strings: - _logger.info("Processing HGVS string: %s", hgvs_string) + _logger.debug("Processing HGVS string: %s", hgvs_string) + + # Special handling for reference-identical variants, which must be represented as simple Alleles with a + # ReferenceLengthExpression state rather than beeing translated from HGVS. This translation is + # currently unsupported by ga4gh hgvs_tools and will raise an error if attempted. + if hgvs_string.endswith(".="): + if pre_map: + if sequence_id is None: + msg = "Must provide sequence id to construct pre-mapped ref-identical VRS allele" + raise ValueError(msg) + allele = build_ref_identical_allele(sequence_id) + else: + allele = translate_ref_identical_to_vrs(hgvs_string) + + normalize(allele, data_proxy=get_seqrepo()) + + allele.id = ga4gh_identify(allele) + alleles.append(allele) + continue + allele = translate_hgvs_to_vrs(hgvs_string) if pre_map: @@ -953,7 +988,6 @@ def _construct_vrs_allele( if "dup" in hgvs_string: allele.state.sequence = SequenceString(2 * _get_allele_sequence(allele)) - # TODO check assumption that c.= leads to an "N" in the sequence.root if allele.state.sequence.root == "N" and layer == AnnotationLayer.GENOMIC: allele.state.sequence = SequenceString(_get_allele_sequence(allele))