diff --git a/benchmarks/benchmarks/analysis/dssp.py b/benchmarks/benchmarks/analysis/dssp.py new file mode 100644 index 0000000000..db3f511272 --- /dev/null +++ b/benchmarks/benchmarks/analysis/dssp.py @@ -0,0 +1,48 @@ +import MDAnalysis as mda + +try: + from MDAnalysisTests.datafiles import TPR, XTC, PDB_janin +except: + pass + +try: + from MDAnalysis.analysis.dssp import DSSP + from MDAnalysis.analysis.dssp.pydssp_numpy import assign +except: + pass + + +class DsspBench(object): + """Benchmarks for MDAnalysis.analysis.dssp.DSSP over a trajectory.""" + + params = ([1, 5, 10], [True, False]) + param_names = ["n_frames", "guess_hydrogens"] + + def setup(self, n_frames, guess_hydrogens): + self.u = mda.Universe(TPR, XTC) + self.dssp = DSSP(self.u, guess_hydrogens=guess_hydrogens) + self.n_frames = n_frames + + def time_dssp_run(self, n_frames, guess_hydrogens): + """Benchmark DSSP.run() over n_frames of a trajectory.""" + self.dssp.run(stop=self.n_frames) + + +class DsspAssignBench(object): + """Benchmarks for the core DSSP assign() function on a single frame. + + Uses PDB_janin as a large test system for number of residues + """ + + params = [50, 100, 214, 500] + param_names = ["n_residues"] + + def setup(self, n_residues): + self.u = mda.Universe(PDB_janin) + dssp = DSSP(self.u) + coords = dssp._get_coords() + self.coords = coords[:n_residues] + + def time_assign(self, n_residues): + """Benchmark single-frame assign() call.""" + assign(self.coords) diff --git a/package/AUTHORS b/package/AUTHORS index 81ada5e07c..f3d6f7abda 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -192,7 +192,7 @@ Chronological list of authors - Mingyi Xue - Meghan Osato - Anirvinya G - - Rishabh Shukla + - Rishabh Shukla - Manish Kumar - Aditi Tripathi - Sukeerti T @@ -247,14 +247,14 @@ Chronological list of authors - Jia-Xin Zhu - Tanish Yelgoe 2025 - - Joshua Raphael Uy + - Joshua Raphael Uy - Namir Oues - - Pavel Buslaev + - Pavel Buslaev - Lexi Xu - - BHM-Bob G + - BHM-Bob G - Yu-Yuan (Stuart) Yang - James Rowe - - Debasish Mohanty + - Debasish Mohanty - Abdulrahman Elbanna - Tulga-Erdene Sodjargal - Gareth Elliott @@ -262,7 +262,7 @@ Chronological list of authors - Sirsha Ganguly - Amruthesh Thirumalaiswamy - Ch Zhang - - Raúl Lois-Cuns + - Raúl Lois-Cuns - Pranay Pelapkar - Shreejan Dolai - Tanisha Dubey diff --git a/package/CHANGELOG b/package/CHANGELOG index 8a62681c30..1d56314862 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -22,6 +22,9 @@ The rules for this file: * 2.11.0 Fixes + * DSSP now correctly handles periodic boundary conditions in hydrogen bond + detection; previously, hydrogen bonds across periodic boundaries were + missed (PR #5182) * Fix mixed-case atom types in guess_bonds (Issue #5342, PR #5343) * Fixes msd for non-linear frames, when non_linear is not explicitly provided (Issue #5100, PR #5254) @@ -46,6 +49,9 @@ Fixes DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913) Enhancements + * DSSP uses ``capped_distance`` and ``calc_bonds`` for faster distance + calculations and supports ``backend`` selection (e.g. ``"distopia"``) + (PR #5182) * Added documentation for all keyword in select_atoms() and selections.rst (Issue #5317, PR #5325) * Added HydrogenBondAnalysis benchmark for performance tracking (PR #5309) @@ -55,7 +61,7 @@ Enhancements * Reduces duplication of code in _apply() function (Issue #5247, PR #5294) * Added new top-level `MDAnalysis.fetch` module (PR #4943) * Added new function `MDAnalysis.fetch.from_PDB` to download structure files from wwPDB - using `pooch` as optional dependency (Issue #4907, PR #4943) + using `pooch` as optional dependency (Issue #4907, PR #4943) * Added benchmarks for package.MDAnalysis.analysis.msd.EinsteinMSD (PR #5277) * Improved performance of `AtomGroup.wrap()` with compounds (PR #5220) * Adds support for parsing `.tpr` files produced by GROMACS 2026.0 diff --git a/package/MDAnalysis/analysis/dssp/dssp.py b/package/MDAnalysis/analysis/dssp/dssp.py index a3082d9315..ea39d78caa 100644 --- a/package/MDAnalysis/analysis/dssp/dssp.py +++ b/package/MDAnalysis/analysis/dssp/dssp.py @@ -144,6 +144,7 @@ .. autofunction:: translate """ +import warnings from typing import Optional, Union import numpy as np @@ -236,6 +237,14 @@ class DSSP(AnalysisBase): (e.g., a :exc:`ValueError` is raised) you must customize `hydrogen_name` for your specific case. + backend : str, optional + :ref:`Backend for distance calculations`, by default ``"serial"``. + Can be set to ``"distopia"`` if `distopia`_ is installed. + + .. versionadded:: 2.11.0 + + .. _distopia: https://www.mdanalysis.org/distopia/ + Raises ------ @@ -308,8 +317,10 @@ def __init__( *, heavyatom_names: tuple[str] = ("N", "CA", "C", "O O1 OT1"), hydrogen_name: str = "H HN HT1 HT2 HT3", + backend: str = "serial", ): self._guess_hydrogens = guess_hydrogens + self._backend = backend ag: AtomGroup = atoms.select_atoms("protein") super().__init__(ag.universe.trajectory) @@ -325,6 +336,15 @@ def __init__( for t in heavyatom_names } self._donor_mask: Optional[np.ndarray] = ag.residues.resnames != "PRO" + self._box = self._trajectory.ts.dimensions + if self._box is not None and np.allclose( + self._box, [1, 1, 1, 90, 90, 90] + ): + self._box = None + warnings.warn( + "Box dimensions are (1, 1, 1, 90, 90, 90), not using " + "periodic boundary conditions in DSSP calculations" + ) self._hydrogens: list["AtomGroup"] = [ res.atoms.select_atoms(f"name {hydrogen_name}") for res in ag.residues @@ -407,7 +427,12 @@ def _get_coords(self) -> np.ndarray: def _single_frame(self): coords = self._get_coords() - dssp = assign(coords, donor_mask=self._donor_mask) + dssp = assign( + coords, + donor_mask=self._donor_mask, + box=self._box, + backend=self._backend, + ) self.results.dssp_ndarray.append(dssp) def _conclude(self): diff --git a/package/MDAnalysis/analysis/dssp/pydssp_numpy.py b/package/MDAnalysis/analysis/dssp/pydssp_numpy.py index 643e510f18..dbbecd7934 100644 --- a/package/MDAnalysis/analysis/dssp/pydssp_numpy.py +++ b/package/MDAnalysis/analysis/dssp/pydssp_numpy.py @@ -11,10 +11,13 @@ import numpy as np +from MDAnalysis.lib.distances import calc_bonds, capped_distance + CONST_Q1Q2 = 0.084 CONST_F = 332 DEFAULT_CUTOFF = -0.5 DEFAULT_MARGIN = 1.0 +HBOND_SEARCH_CUTOFF = 5.0 def _upsample(a: np.ndarray, window: int) -> np.ndarray: @@ -118,10 +121,12 @@ def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray: def get_hbond_map( coord: np.ndarray, - donor_mask: np.ndarray = None, + donor_mask: np.ndarray | None = None, cutoff: float = DEFAULT_CUTOFF, margin: float = DEFAULT_MARGIN, return_e: bool = False, + box: np.ndarray | None = None, + backend: str = "serial", ) -> np.ndarray: """Returns hydrogen bond map @@ -145,6 +150,20 @@ def get_hbond_map( return_e : bool, optional if to return energy instead of hbond map, by default False + box : array_like, optional + The unitcell dimensions of the system, which can be orthogonal or + triclinic and must be provided in the same format as returned by + :attr:`MDAnalysis.coordinates.timestep.Timestep.dimensions`: + ``[lx, ly, lz, alpha, beta, gamma]``. + + .. versionadded:: 2.11.0 + + backend : str, optional + Backend for distance calculations, by default ``"serial"``. + Can be set to ``"distopia"`` if :mod:`distopia` is installed. + + .. versionadded:: 2.11.0 + Returns ------- np.ndarray @@ -156,6 +175,9 @@ def get_hbond_map( .. versionchanged:: 2.10.0 Support masking of hydrogen donors via `donor_mask` (especially needed for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1. + .. versionchanged:: 2.11.0 + Speedup with ``capped_distance`` and support periodic boundary checking + via `box` param. """ n_residues, n_atom_types, _xyz = coord.shape assert n_atom_types in ( @@ -176,58 +198,69 @@ def get_hbond_map( # h.shape == (n_residues, 3) # coord.shape == (n_residues, 4, 3) - # distance matrix - n_1, c_0, o_0 = coord[1:, 0], coord[0:-1, 2], coord[0:-1, 3] - - n = n_residues - 1 - cmap = np.tile(c_0, (n, 1, 1)) - omap = np.tile(o_0, (n, 1, 1)) - nmap = np.tile(n_1, (1, 1, n)).reshape(n, n, 3) - hmap = np.tile(h_1, (1, 1, n)).reshape(n, n, 3) + n_atoms, c_atoms, o_atoms = coord[1:, 0], coord[:-1, 2], coord[:-1, 3] + + # We can reduce the need for a complete pairwise distance matrix by first searching for + # candidate pairs within a certain distance cutoff and only computing the energy + # for these relevant pairs rather than "potential" HBonds between far apart residues + pairs, d_on = capped_distance( + n_atoms, + o_atoms, + max_cutoff=HBOND_SEARCH_CUTOFF, + box=box, + backend=backend, + ) - d_on = np.linalg.norm(omap - nmap, axis=-1) - d_ch = np.linalg.norm(cmap - hmap, axis=-1) - d_oh = np.linalg.norm(omap - hmap, axis=-1) - d_cn = np.linalg.norm(cmap - nmap, axis=-1) + # Exclude local pairs (i, i), (i, i+1), (i, i+2) that are too close for SS HBonds + local_mask = abs(pairs[:, 0] - pairs[:, 1]) >= 2 + pairs = pairs[local_mask] + d_on = d_on[local_mask] + + # Exclude donor H absence (Proline) + if donor_mask is not None: + donor_indices = np.where(np.array(donor_mask) == 0)[0] + mask = ~np.isin(pairs[:, 0], donor_indices - 1) + pairs = pairs[mask] + d_on = d_on[mask] + + # compute distances and energy as previously but only for the potential pairs + # still returning the same energy matrix that would have otherwise been made + o_indices = pairs[:, 1] + n_indices = pairs[:, 0] + + d_ch = calc_bonds( + c_atoms[o_indices], h_1[n_indices], box=box, backend=backend + ) + d_oh = calc_bonds( + o_atoms[o_indices], h_1[n_indices], box=box, backend=backend + ) + d_cn = calc_bonds( + c_atoms[o_indices], n_atoms[n_indices], box=box, backend=backend + ) # electrostatic interaction energy # e[i, j] = e(CO_i) - e(NH_j) - e = np.pad( + e = np.zeros((n_residues, n_residues)) + e[n_indices + 1, o_indices] = ( CONST_Q1Q2 * (1.0 / d_on + 1.0 / d_ch - 1.0 / d_oh - 1.0 / d_cn) - * CONST_F, - [[1, 0], [0, 1]], + * CONST_F ) if return_e: # pragma: no cover return e - # mask for local pairs (i,i), (i,i+1), (i,i+2) - local_mask = ~np.eye(n_residues, dtype=bool) - local_mask *= ~np.diag(np.ones(n_residues - 1, dtype=bool), k=-1) - local_mask *= ~np.diag(np.ones(n_residues - 2, dtype=bool), k=-2) - # mask for donor H absence (Proline) - donor_mask = ( - np.array(donor_mask).astype(float) - if donor_mask is not None - else np.ones(n_residues, dtype=float) - ) - donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_residues)) - # hydrogen bond map (continuous value extension of original definition) hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin) hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2 - assert hbond_map.shape == local_mask.shape == donor_mask.shape - - hbond_map *= local_mask - hbond_map *= donor_mask - return hbond_map def assign( coord: np.ndarray, - donor_mask: np.ndarray = None, + donor_mask: np.ndarray | None = None, + box: np.ndarray | None = None, + backend: str = "serial", ) -> np.ndarray: """Assigns secondary structure for a given coordinate array, either with or without assigned hydrogens @@ -240,12 +273,26 @@ def assign( (N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates (when k=5). - donor_mask : np.array + donor_mask : np.array | None, optional Mask out any hydrogens that should not be considered (in particular HN in PRO). If ``None`` then all H will be used (behavior up to 2.9.0). .. versionadded:: 2.10.0 + box : array_like, optional + The unitcell dimensions of the system, which can be orthogonal or + triclinic and must be provided in the same format as returned by + :attr:`MDAnalysis.coordinates.timestep.Timestep.dimensions`: + ``[lx, ly, lz, alpha, beta, gamma]``. + + .. versionadded:: 2.11.0 + + backend : str, optional + Backend for distance calculations, by default ``"serial"``. + Can be set to ``"distopia"`` if `distopia`_ is installed. + + .. versionadded:: 2.11.0 + Returns ------- np.ndarray @@ -257,10 +304,15 @@ def assign( .. versionchanged:: 2.10.0 Support masking of donors. + .. versionchanged:: 2.11.0 + Speedup with ``capped_distance`` and support periodic boundary checking + via `box` param. """ # get hydrogen bond map - hbmap = get_hbond_map(coord, donor_mask=donor_mask) + hbmap = get_hbond_map( + coord, donor_mask=donor_mask, box=box, backend=backend + ) hbmap = np.swapaxes(hbmap, -1, -2) # convert into "i:C=O, j:N-H" form # identify turn 3, 4, 5 diff --git a/testsuite/MDAnalysisTests/analysis/test_dssp.py b/testsuite/MDAnalysisTests/analysis/test_dssp.py index 93596ebdea..b01b79168e 100644 --- a/testsuite/MDAnalysisTests/analysis/test_dssp.py +++ b/testsuite/MDAnalysisTests/analysis/test_dssp.py @@ -1,8 +1,11 @@ import glob +import warnings +from pathlib import Path import MDAnalysis as mda +import numpy as np import pytest -from MDAnalysis.analysis.dssp import DSSP, translate +from MDAnalysis.analysis.dssp import DSSP, assign, translate from MDAnalysisTests.datafiles import DSSP as DSSP_FOLDER from MDAnalysisTests.datafiles import TPR, XTC @@ -35,6 +38,65 @@ def test_trajectory(client_DSSP): ) +# ensure that we get different assigned results when using and not using the +# donor mask which filters out prolines from being potential HBond donors as they +# are missing hydrogens +def test_donor_mask(): + u = mda.Universe(TPR, XTC).select_atoms("protein").universe + dssp = DSSP(u) + coords = dssp._get_coords() + n_residues = coords.shape[0] + + result_no_mask = assign(coords) + result_with_mask = assign(coords, donor_mask=dssp._donor_mask) + + assert result_no_mask.shape == (n_residues, 3) + assert result_with_mask.shape == (n_residues, 3) + assert not np.array_equal(result_no_mask, result_with_mask) + + +def test_donor_mask_proline_indices(): + """Test that donor_mask correctly identifies proline residues.""" + u = mda.Universe(TPR, XTC).select_atoms("protein").universe + dssp = DSSP(u) + resnames = u.select_atoms("protein").residues.resnames + is_pro = resnames == "PRO" + + # donor_mask should be False at proline positions and True at non-proline positions + assert np.all(~dssp._donor_mask[is_pro]) + assert np.all(dssp._donor_mask[~is_pro]) + + +def test_donor_mask_none(): + """Test that assign works with donor_mask=None (pre-2.10.0 behavior).""" + u = mda.Universe(TPR, XTC).select_atoms("protein").universe + dssp = DSSP(u) + coords = dssp._get_coords() + n_residues = coords.shape[0] + + # None mask should work and return valid shape and each residue should have exactly one + # True in the one-hot encoding + result = assign(coords, donor_mask=None) + assert result.shape == (n_residues, 3) + assert np.all(result.sum(axis=-1) == 1) + + +def test_placeholder_box_warnings(): + """Test that placeholder (1,1,1,90,90,90) box dimensions trigger a warning + and disable PBC.""" + u = mda.Universe(f"{DSSP_FOLDER}/2va0A.pdb.gz") + u.trajectory.ts.dimensions = np.array([1.0, 1.0, 1.0, 90.0, 90.0, 90.0]) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + dssp = DSSP(u) + box_warnings = [ + x for x in w if "not using periodic boundary" in str(x.message) + ] + assert len(box_warnings) == 1 + assert dssp._box is None + + def test_atomgroup(client_DSSP): protein = mda.Universe(TPR, XTC).select_atoms("protein") run = DSSP(protein).run(**client_DSSP, stop=10)