From 6d529db5ab3503d379f152be5bd05cf12a688722 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 11 Mar 2026 12:46:47 +0100 Subject: [PATCH 01/12] perf(importers): batch endpoint creation and status updates during import/reimport Replace per-finding endpoint_get_or_create() calls with a stateful EndpointManager that accumulates endpoints and statuses across findings and flushes them in bulk at batch boundaries. Reduces ~6200 DB queries to ~3-5 for a 1000-finding scan with 5 endpoints per finding (200 unique). - EndpointManager now takes a `product` param and holds internal accumulators - `record_endpoint()` deduplicates by normalized key within a batch - `record_status_for_create()` / `record_statuses_to_mitigate()` / `record_statuses_to_reactivate()` accumulate operations - `persist()` flushes all pending creates and bulk_updates in one shot - `update_endpoint_status()` accumulates mitigate/reactivate lists instead of dispatching per-finding Celery tasks - Removed old `chunk_endpoints_and_*`, `add_endpoints_to_unsaved_finding`, `mitigate_endpoint_status`, `reactivate_endpoint_status` methods - Added unit tests for `_make_endpoint_unique_tuple` normalization - Updated performance test fixture to match new stateful manager interface --- dojo/importers/base_importer.py | 28 +- dojo/importers/default_importer.py | 11 +- dojo/importers/default_reimporter.py | 21 +- dojo/importers/endpoint_manager.py | 334 +++++++++++++++--------- unittests/test_endpoint_manager.py | 83 ++++++ unittests/test_importers_performance.py | 52 ++-- 6 files changed, 362 insertions(+), 167 deletions(-) create mode 100644 unittests/test_endpoint_manager.py diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index 7734fc42d93..743da0d862a 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -14,7 +14,6 @@ import dojo.finding.helper as finding_helper import dojo.risk_acceptance.helper as ra_helper from dojo.celery_dispatch 
import dojo_dispatch_task -from dojo.importers.endpoint_manager import EndpointManager from dojo.importers.location_manager import LocationManager, UnsavedLocation from dojo.importers.options import ImporterOptions from dojo.jira_link.helper import is_keep_in_sync_with_jira @@ -85,7 +84,9 @@ def __init__( self.location_manager = LocationManager() else: # TODO: Delete this after the move to Locations - self.endpoint_manager = EndpointManager() + # EndpointManager is initialized in each subclass's process_scan() + # after self.test is available, because it needs the product. + self.endpoint_manager = None def check_child_implementation_exception(self): """ @@ -825,12 +826,17 @@ def process_endpoints( msg = "BaseImporter#process_endpoints() method is deprecated when V3_FEATURE_LOCATIONS is enabled" raise NotImplementedError(msg) - # Save the unsaved endpoints - self.endpoint_manager.chunk_endpoints_and_disperse(finding, finding.unsaved_endpoints) - # Check for any that were added in the form + # Clean and record unsaved endpoints from the report + self.endpoint_manager.clean_unsaved_endpoints(finding.unsaved_endpoints) + for endpoint in finding.unsaved_endpoints: + key = self.endpoint_manager.record_endpoint(endpoint) + self.endpoint_manager.record_status_for_create(finding, key) + # Record any endpoints added from the form if len(endpoints_to_add) > 0: logger.debug("endpoints_to_add: %s", endpoints_to_add) - self.endpoint_manager.chunk_endpoints_and_disperse(finding, endpoints_to_add) + for endpoint in endpoints_to_add: + key = self.endpoint_manager.record_endpoint(endpoint) + self.endpoint_manager.record_status_for_create(finding, key) def sanitize_vulnerability_ids(self, finding) -> None: """Remove undisired vulnerability id values""" @@ -934,14 +940,8 @@ def mitigate_finding( ) else: # TODO: Delete this after the move to Locations - # Mitigate the endpoint statuses - dojo_dispatch_task( - EndpointManager.mitigate_endpoint_status, - finding.status_finding.all(), - 
self.user, - kwuser=self.user, - sync=True, - ) + # Accumulate endpoint statuses for bulk mitigate in persist() + self.endpoint_manager.record_statuses_to_mitigate(finding.status_finding.all()) # to avoid pushing a finding group multiple times, we push those outside of the loop if finding_groups_enabled and finding.finding_group: # don't try to dedupe findings that we are closing diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 8dd2aa4a4f9..1d988848210 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -9,6 +9,7 @@ from dojo.celery_dispatch import dojo_dispatch_task from dojo.finding import helper as finding_helper from dojo.importers.base_importer import BaseImporter, Parser +from dojo.importers.endpoint_manager import EndpointManager from dojo.importers.options import ImporterOptions from dojo.jira_link.helper import is_keep_in_sync_with_jira from dojo.models import ( @@ -110,6 +111,9 @@ def process_scan( # Get the findings from the parser based on what methods the parser supplies # This could either mean traditional file parsing, or API pull parsing parsed_findings = self.parse_findings(scan, parser) or [] + # Initialize the endpoint manager now that self.test is available + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager = EndpointManager(self.test.engagement.product) new_findings = self.process_findings(parsed_findings, **kwargs) # Close any old findings in the processed list if the the user specified for that # to occur in the form that is then passed to the kwargs @@ -259,8 +263,10 @@ def process_findings( logger.debug("process_findings: computed push_to_jira=%s", push_to_jira) batch_finding_ids.append(finding.id) - # If batch is full or we're at the end, dispatch one batched task + # If batch is full or we're at the end, persist endpoints and dispatch if len(batch_finding_ids) >= batch_max_size or is_final_finding: + if not settings.V3_FEATURE_LOCATIONS: + 
self.endpoint_manager.persist(user=self.user) finding_ids_batch = list(batch_finding_ids) batch_finding_ids.clear() logger.debug("process_findings: dispatching batch with push_to_jira=%s (batch_size=%d, is_final=%s)", @@ -378,6 +384,9 @@ def close_old_findings( finding_groups_enabled=self.findings_groups_enabled, product_grading_option=False, ) + # Persist any accumulated endpoint status mitigations + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager.persist(user=self.user) # push finding groups to jira since we only only want to push whole groups # We dont check if the finding jira sync is applicable quite yet until we can get in the loop # but this is a way to at least make it that far diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 5075eb6409b..fd4d71c4a38 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -14,6 +14,7 @@ find_candidates_for_reimport_legacy, ) from dojo.importers.base_importer import BaseImporter, Parser +from dojo.importers.endpoint_manager import EndpointManager from dojo.importers.options import ImporterOptions from dojo.jira_link.helper import is_keep_in_sync_with_jira from dojo.location.status import FindingLocationStatus @@ -95,6 +96,9 @@ def process_scan( - Update the test progress """ logger.debug(f"REIMPORT_SCAN: parameters: {locals()}") + # Initialize the endpoint manager now that self.test is available + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager = EndpointManager(self.test.engagement.product) # Validate the Tool_Configuration self.verify_tool_configuration_from_test() # Fetch the parser based upon the string version of the scan type @@ -430,6 +434,8 @@ def process_findings( # - Deduplication batches: optimize bulk operations (larger batches = fewer queries) # They don't need to be aligned since they optimize different operations. 
if len(batch_finding_ids) >= dedupe_batch_max_size or is_final: + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager.persist(user=self.user) finding_ids_batch = list(batch_finding_ids) batch_finding_ids.clear() dojo_dispatch_task( @@ -497,6 +503,9 @@ def close_old_findings( product_grading_option=False, ) mitigated_findings.append(finding) + # Persist any accumulated endpoint status mitigations + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager.persist(user=self.user) # push finding groups to jira since we only only want to push whole groups # We dont check if the finding jira sync is applicable quite yet until we can get in the loop # but this is a way to at least make it that far @@ -763,9 +772,9 @@ def process_matched_mitigated_finding( self.location_manager.chunk_locations_and_reactivate(mitigated_locations) else: # TODO: Delete this after the move to Locations - # Reactivate mitigated endpoints that are not false positives, out of scope, or risk accepted + # Accumulate endpoint statuses for bulk reactivation in persist() # status_finding_non_special is prefetched by build_candidate_scope_queryset - self.endpoint_manager.chunk_endpoints_and_reactivate(existing_finding.status_finding_non_special) + self.endpoint_manager.record_statuses_to_reactivate(existing_finding.status_finding_non_special) existing_finding.notes.add(note) self.reactivated_items.append(existing_finding) # The new finding is active while the existing on is mitigated. 
The existing finding needs to @@ -932,9 +941,13 @@ def finding_post_processing( self.location_manager.chunk_locations_and_disperse(finding, self.endpoints_to_add) else: # TODO: Delete this after the move to Locations - self.endpoint_manager.chunk_endpoints_and_disperse(finding, finding_from_report.unsaved_endpoints) + for endpoint in finding_from_report.unsaved_endpoints: + key = self.endpoint_manager.record_endpoint(endpoint) + self.endpoint_manager.record_status_for_create(finding, key) if len(self.endpoints_to_add) > 0: - self.endpoint_manager.chunk_endpoints_and_disperse(finding, self.endpoints_to_add) + for endpoint in self.endpoints_to_add: + key = self.endpoint_manager.record_endpoint(endpoint) + self.endpoint_manager.record_status_for_create(finding, key) # Parsers shouldn't use the tags field, and use unsaved_tags instead. # Merge any tags set by parser into unsaved_tags tags_from_parser = finding_from_report.tags if isinstance(finding_from_report.tags, list) else [] diff --git a/dojo/importers/endpoint_manager.py b/dojo/importers/endpoint_manager.py index 2b5883121ab..7df87f10036 100644 --- a/dojo/importers/endpoint_manager.py +++ b/dojo/importers/endpoint_manager.py @@ -1,122 +1,80 @@ import logging +from typing import NamedTuple -from django.core.exceptions import MultipleObjectsReturned, ValidationError -from django.urls import reverse +from django.core.exceptions import ValidationError +from django.db import transaction from django.utils import timezone +from hyperlink._url import SCHEME_PORT_MAP # noqa: PLC2701 -from dojo.celery import app -from dojo.celery_dispatch import dojo_dispatch_task -from dojo.endpoint.utils import endpoint_get_or_create from dojo.models import ( Dojo_User, Endpoint, Endpoint_Status, Finding, + Product, ) logger = logging.getLogger(__name__) +class EndpointUniqueKey(NamedTuple): + protocol: str | None + userinfo: str | None + host: str | None + port: int | None + path: str | None + query: str | None + fragment: str | None + 
product_id: int + + # TODO: Delete this after the move to Locations class EndpointManager: - @app.task - def add_endpoints_to_unsaved_finding( - finding: Finding, # noqa: N805 - endpoints: list[Endpoint], - **kwargs: dict, - ) -> None: - """Creates Endpoint objects for a single finding and creates the link via the endpoint status""" - logger.debug(f"IMPORT_SCAN: Adding {len(endpoints)} endpoints to finding: {finding}") - EndpointManager.clean_unsaved_endpoints(endpoints) - for endpoint in endpoints: - ep = None - eps = [] - try: - ep, _ = endpoint_get_or_create( - protocol=endpoint.protocol, - userinfo=endpoint.userinfo, - host=endpoint.host, - port=endpoint.port, - path=endpoint.path, - query=endpoint.query, - fragment=endpoint.fragment, - product=finding.test.engagement.product, - ) - eps.append(ep) - except (MultipleObjectsReturned): - msg = ( - f"Endpoints in your database are broken. " - f"Please access {reverse('endpoint_migrate')} and migrate them to new format or remove them." - ) - raise Exception(msg) - # bulk_create will translate to INSERT WITH IGNORE CONFLICTS - # much faster than get_or_create which issues two queries per endpoint - # bulk_create will not trigger endpoint_status.save and signals which is fine for now - rows = [Endpoint_Status(finding=finding, endpoint=e, date=finding.date) for e in eps] - Endpoint_Status.objects.bulk_create(rows, ignore_conflicts=True, batch_size=1000) + def __init__(self, product: Product) -> None: + self._product = product + self._endpoints_to_create: dict[EndpointUniqueKey, dict] = {} + self._statuses_to_create: list[tuple[Finding, EndpointUniqueKey]] = [] + self._statuses_to_mitigate: list[Endpoint_Status] = [] + self._statuses_to_reactivate: list[Endpoint_Status] = [] - logger.debug(f"IMPORT_SCAN: {len(endpoints)} endpoints imported") + @staticmethod + def _make_endpoint_unique_tuple( + protocol: str | None, + userinfo: str | None, + host: str | None, + port: int | None, + path: str | None, + query: str | None, + 
fragment: str | None, + product_id: int, + ) -> EndpointUniqueKey: + """ + Normalize endpoint fields to a unique key matching endpoint_filter() semantics. - @app.task - def mitigate_endpoint_status( - endpoint_status_list: list[Endpoint_Status], # noqa: N805 - user: Dojo_User, - **kwargs: dict, - ) -> None: - """Mitigates all endpoint status objects that are supplied""" - now = timezone.now() - to_update = [] - for endpoint_status in endpoint_status_list: - # Only mitigate endpoints that are actually active - if endpoint_status.mitigated is False: - endpoint_status.mitigated_time = now - endpoint_status.last_modified = now - endpoint_status.mitigated_by = user - endpoint_status.mitigated = True - to_update.append(endpoint_status) - - if to_update: - Endpoint_Status.objects.bulk_update( - to_update, - ["mitigated", "mitigated_time", "last_modified", "mitigated_by"], - batch_size=1000, - ) + See dojo/endpoint/utils.py endpoint_filter() for the canonical matching logic. + """ + norm_protocol = protocol.lower() if protocol else None + norm_host = host.lower() if host else None - @app.task - def reactivate_endpoint_status( - endpoint_status_list: list[Endpoint_Status], # noqa: N805 - **kwargs: dict, - ) -> None: - """Reactivate all endpoint status objects that are supplied""" - now = timezone.now() - to_update = [] - for endpoint_status in endpoint_status_list: - # Only reactivate endpoints that are actually mitigated - if endpoint_status.mitigated: - logger.debug("Re-import: reactivating endpoint %s that is present in this scan", endpoint_status.endpoint) - endpoint_status.mitigated_by = None - endpoint_status.mitigated_time = None - endpoint_status.mitigated = False - endpoint_status.last_modified = now - to_update.append(endpoint_status) - - if to_update: - Endpoint_Status.objects.bulk_update( - to_update, - ["mitigated", "mitigated_time", "mitigated_by", "last_modified"], - batch_size=1000, - ) + # Port normalization: if protocol has a default port, treat that port 
+ # and None as equivalent (matching endpoint_filter's Q(port__isnull=True) | Q(port__exact=default)) + norm_port = port + if norm_protocol and norm_protocol in SCHEME_PORT_MAP: + default_port = SCHEME_PORT_MAP[norm_protocol] + if port is None or port == default_port: + norm_port = None - def chunk_endpoints_and_disperse( - self, - finding: Finding, - endpoints: list[Endpoint], - **kwargs: dict, - ) -> None: - if not endpoints: - return - dojo_dispatch_task(EndpointManager.add_endpoints_to_unsaved_finding, finding, endpoints, sync=True) + return EndpointUniqueKey( + protocol=norm_protocol, + userinfo=userinfo or None, + host=norm_host, + port=norm_port, + path=path or None, + query=query or None, + fragment=fragment or None, + product_id=product_id, + ) @staticmethod def clean_unsaved_endpoints( @@ -124,7 +82,7 @@ def clean_unsaved_endpoints( ) -> None: """ Clean endpoints that are supplied. For any endpoints that fail this validation - process, raise a message that broken endpoints are being stored + process, raise a message that broken endpoints are being stored. """ for endpoint in endpoints: try: @@ -132,50 +90,182 @@ def clean_unsaved_endpoints( except ValidationError as e: logger.warning("DefectDojo is storing broken endpoint because cleaning wasn't successful: %s", e) - def chunk_endpoints_and_reactivate( - self, - endpoint_status_list: list[Endpoint_Status], - **kwargs: dict, - ) -> None: - dojo_dispatch_task(EndpointManager.reactivate_endpoint_status, endpoint_status_list, sync=True) + def record_endpoint(self, endpoint: Endpoint) -> EndpointUniqueKey: + """Record an endpoint for later batch creation. 
Returns the unique key.""" + key = self._make_endpoint_unique_tuple( + protocol=endpoint.protocol, + userinfo=endpoint.userinfo, + host=endpoint.host, + port=endpoint.port, + path=endpoint.path, + query=endpoint.query, + fragment=endpoint.fragment, + product_id=self._product.id, + ) + if key not in self._endpoints_to_create: + self._endpoints_to_create[key] = { + "protocol": endpoint.protocol, + "userinfo": endpoint.userinfo, + "host": endpoint.host, + "port": endpoint.port, + "path": endpoint.path, + "query": endpoint.query, + "fragment": endpoint.fragment, + "product": self._product, + } + return key - def chunk_endpoints_and_mitigate( - self, - endpoint_status_list: list[Endpoint_Status], - user: Dojo_User, - **kwargs: dict, - ) -> None: - dojo_dispatch_task(EndpointManager.mitigate_endpoint_status, endpoint_status_list, user, sync=True) + def record_status_for_create(self, finding: Finding, key: EndpointUniqueKey) -> None: + """Record that a finding should be linked to an endpoint (identified by key) via Endpoint_Status.""" + self._statuses_to_create.append((finding, key)) def update_endpoint_status( self, existing_finding: Finding, new_finding: Finding, user: Dojo_User, - **kwargs: dict, ) -> None: - """Update the list of endpoints from the new finding with the list that is in the old finding""" - # New endpoints are already added in serializers.py / views.py (see comment "# for existing findings: make sure endpoints are present or created") - # So we only need to mitigate endpoints that are no longer present + """ + Compare old/new endpoints and accumulate mitigate/reactivate lists. + + The actual bulk_update happens in persist(). 
+ """ # status_finding_non_special is prefetched by build_candidate_scope_queryset with the # special-status exclusion and endpoint select_related already applied at the DB level existing_finding_endpoint_status_list = existing_finding.status_finding_non_special new_finding_endpoints_list = new_finding.unsaved_endpoints if new_finding.is_mitigated: # New finding is mitigated, so mitigate all old endpoints - endpoint_status_to_mitigate = existing_finding_endpoint_status_list + self._statuses_to_mitigate.extend(existing_finding_endpoint_status_list) else: # Convert to set for O(1) lookups instead of O(n) linear search new_finding_endpoints_set = set(new_finding_endpoints_list) # Mitigate any endpoints in the old finding not found in the new finding - endpoint_status_to_mitigate = [ + self._statuses_to_mitigate.extend( eps for eps in existing_finding_endpoint_status_list if eps.endpoint not in new_finding_endpoints_set - ] + ) # Re-activate any endpoints in the old finding that are in the new finding - endpoint_status_to_reactivate = [ + self._statuses_to_reactivate.extend( eps for eps in existing_finding_endpoint_status_list if eps.endpoint in new_finding_endpoints_set + ) + + def record_statuses_to_reactivate(self, statuses: list[Endpoint_Status]) -> None: + """Accumulate endpoint statuses for bulk reactivation in persist().""" + self._statuses_to_reactivate.extend(statuses) + + def record_statuses_to_mitigate(self, statuses: list[Endpoint_Status]) -> None: + """Accumulate endpoint statuses for bulk mitigation in persist().""" + self._statuses_to_mitigate.extend(statuses) + + def _fetch_and_create_endpoints(self) -> dict[EndpointUniqueKey, Endpoint]: + """ + Fetch existing product endpoints and bulk_create missing ones. + + Returns a key -> Endpoint mapping for all endpoints referenced in _endpoints_to_create. 
+ """ + if not self._endpoints_to_create: + return {} + + key_to_endpoint: dict[EndpointUniqueKey, Endpoint] = {} + + with transaction.atomic(): + # Fetch all existing endpoints for this product + for ep in ( + Endpoint.objects.filter(product=self._product) + .only("id", "protocol", "userinfo", "host", "port", "path", "query", "fragment", "product_id") + .order_by("id") + .iterator() + ): + key = self._make_endpoint_unique_tuple( + protocol=ep.protocol, + userinfo=ep.userinfo, + host=ep.host, + port=ep.port, + path=ep.path, + query=ep.query, + fragment=ep.fragment, + product_id=ep.product_id, + ) + # First-by-id wins, matching endpoint_get_or_create behavior + if key not in key_to_endpoint: + key_to_endpoint[key] = ep + + # Determine which endpoints still need creating + to_create = [] + to_create_keys = [] + for key, kwargs in self._endpoints_to_create.items(): + if key not in key_to_endpoint: + to_create.append(Endpoint(**kwargs)) + to_create_keys.append(key) + + if to_create: + created = Endpoint.objects.bulk_create(to_create, batch_size=1000) + key_to_endpoint.update(zip(to_create_keys, created, strict=True)) + + self._endpoints_to_create.clear() + return key_to_endpoint + + def persist(self, user: Dojo_User | None = None) -> None: + """ + Persist all accumulated endpoint operations to the database. + + Called at batch boundaries during import/reimport. 
+ """ + # Step 1: Ensure all recorded endpoints exist in DB + key_to_endpoint = self._fetch_and_create_endpoints() + + # Step 2: Bulk-create Endpoint_Status rows + if self._statuses_to_create: + rows = [ + Endpoint_Status( + finding=finding, + endpoint=key_to_endpoint[key], + date=finding.date, + ) + for finding, key in self._statuses_to_create + if key in key_to_endpoint ] - self.chunk_endpoints_and_reactivate(endpoint_status_to_reactivate) - self.chunk_endpoints_and_mitigate(endpoint_status_to_mitigate, user) + if rows: + Endpoint_Status.objects.bulk_create(rows, ignore_conflicts=True, batch_size=1000) + self._statuses_to_create.clear() + + # Step 3: Bulk-update mitigated endpoint statuses + if self._statuses_to_mitigate: + now = timezone.now() + to_update = [] + for endpoint_status in self._statuses_to_mitigate: + if endpoint_status.mitigated is False: + endpoint_status.mitigated_time = now + endpoint_status.last_modified = now + endpoint_status.mitigated_by = user + endpoint_status.mitigated = True + to_update.append(endpoint_status) + if to_update: + Endpoint_Status.objects.bulk_update( + to_update, + ["mitigated", "mitigated_time", "last_modified", "mitigated_by"], + batch_size=1000, + ) + self._statuses_to_mitigate.clear() + + # Step 4: Bulk-update reactivated endpoint statuses + if self._statuses_to_reactivate: + now = timezone.now() + to_update = [] + for endpoint_status in self._statuses_to_reactivate: + if endpoint_status.mitigated: + logger.debug("Re-import: reactivating endpoint %s that is present in this scan", endpoint_status.endpoint) + endpoint_status.mitigated_by = None + endpoint_status.mitigated_time = None + endpoint_status.mitigated = False + endpoint_status.last_modified = now + to_update.append(endpoint_status) + if to_update: + Endpoint_Status.objects.bulk_update( + to_update, + ["mitigated", "mitigated_time", "mitigated_by", "last_modified"], + batch_size=1000, + ) + self._statuses_to_reactivate.clear() diff --git 
a/unittests/test_endpoint_manager.py b/unittests/test_endpoint_manager.py new file mode 100644 index 00000000000..af7fa8ccb47 --- /dev/null +++ b/unittests/test_endpoint_manager.py @@ -0,0 +1,83 @@ +from django.test import TestCase + +from dojo.importers.endpoint_manager import EndpointManager, EndpointUniqueKey + + +class TestMakeEndpointUniqueTuple(TestCase): + + """Tests for EndpointManager._make_endpoint_unique_tuple normalization.""" + + def _make(self, **kwargs): + defaults = { + "protocol": None, + "userinfo": None, + "host": None, + "port": None, + "path": None, + "query": None, + "fragment": None, + "product_id": 1, + } + defaults.update(kwargs) + return EndpointManager._make_endpoint_unique_tuple(**defaults) + + def test_protocol_case_insensitive(self): + a = self._make(protocol="HTTP", host="example.com") + b = self._make(protocol="http", host="example.com") + self.assertEqual(a, b) + + def test_host_case_insensitive(self): + a = self._make(host="Example.COM") + b = self._make(host="example.com") + self.assertEqual(a, b) + + def test_default_port_normalized_to_none_https(self): + a = self._make(protocol="https", host="example.com", port=443) + b = self._make(protocol="https", host="example.com", port=None) + self.assertEqual(a, b) + self.assertIsNone(a.port) + + def test_default_port_normalized_to_none_http(self): + a = self._make(protocol="http", host="example.com", port=80) + b = self._make(protocol="http", host="example.com", port=None) + self.assertEqual(a, b) + + def test_non_default_port_preserved(self): + a = self._make(protocol="https", host="example.com", port=8443) + b = self._make(protocol="https", host="example.com", port=None) + self.assertNotEqual(a, b) + self.assertEqual(a.port, 8443) + + def test_none_fields_handled(self): + key = self._make() + self.assertIsNone(key.protocol) + self.assertIsNone(key.host) + self.assertIsNone(key.port) + self.assertIsNone(key.path) + + def test_empty_string_fields_normalized_to_none(self): + key = 
self._make(userinfo="", path="", query="", fragment="") + self.assertIsNone(key.userinfo) + self.assertIsNone(key.path) + self.assertIsNone(key.query) + self.assertIsNone(key.fragment) + + def test_different_products_different_keys(self): + a = self._make(host="example.com", product_id=1) + b = self._make(host="example.com", product_id=2) + self.assertNotEqual(a, b) + + def test_returns_named_tuple(self): + key = self._make(protocol="https", host="example.com", path="/api") + self.assertIsInstance(key, EndpointUniqueKey) + self.assertEqual(key.protocol, "https") + self.assertEqual(key.host, "example.com") + self.assertEqual(key.path, "/api") + + def test_port_without_known_protocol_preserved(self): + key = self._make(protocol="custom", host="example.com", port=9999) + self.assertEqual(key.port, 9999) + + def test_port_none_without_known_protocol(self): + key = self._make(protocol="custom", host="example.com", port=None) + self.assertIsNone(key.port) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 565e04f98e6..b54f04c55c2 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -268,12 +268,12 @@ def test_import_reimport_reimport_performance_pghistory_async(self): configure_pghistory_triggers() self._import_reimport_performance( - expected_num_queries1=295, - expected_num_async_tasks1=6, - expected_num_queries2=226, - expected_num_async_tasks2=17, - expected_num_queries3=108, - expected_num_async_tasks3=16, + expected_num_queries1=139, + expected_num_async_tasks1=1, + expected_num_queries2=115, + expected_num_async_tasks2=1, + expected_num_queries3=29, + expected_num_async_tasks3=1, ) @override_settings(ENABLE_AUDITLOG=True) @@ -290,12 +290,12 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._import_reimport_performance( - expected_num_queries1=302, - expected_num_async_tasks1=6, - expected_num_queries2=233, 
- expected_num_async_tasks2=17, - expected_num_queries3=115, - expected_num_async_tasks3=16, + expected_num_queries1=146, + expected_num_async_tasks1=1, + expected_num_queries2=122, + expected_num_async_tasks2=1, + expected_num_queries3=36, + expected_num_async_tasks3=1, ) @override_settings(ENABLE_AUDITLOG=True) @@ -313,12 +313,12 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self.system_settings(enable_product_grade=True) self._import_reimport_performance( - expected_num_queries1=309, - expected_num_async_tasks1=8, - expected_num_queries2=240, - expected_num_async_tasks2=19, - expected_num_queries3=119, - expected_num_async_tasks3=18, + expected_num_queries1=153, + expected_num_async_tasks1=3, + expected_num_queries2=129, + expected_num_async_tasks2=3, + expected_num_queries3=40, + expected_num_async_tasks3=3, ) # Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results. @@ -436,10 +436,10 @@ def test_deduplication_performance_pghistory_async(self): self.system_settings(enable_deduplication=True) self._deduplication_performance( - expected_num_queries1=264, - expected_num_async_tasks1=7, - expected_num_queries2=175, - expected_num_async_tasks2=7, + expected_num_queries1=74, + expected_num_async_tasks1=1, + expected_num_queries2=69, + expected_num_async_tasks2=1, check_duplicates=False, # Async mode - deduplication happens later ) @@ -457,8 +457,8 @@ def test_deduplication_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=271, - expected_num_async_tasks1=7, - expected_num_queries2=236, - expected_num_async_tasks2=7, + expected_num_queries1=81, + expected_num_async_tasks1=1, + expected_num_queries2=130, + expected_num_async_tasks2=1, ) From bcdac76cf3928a7e79a4f5d0cdf6fd66b04dfb2f Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 11 Mar 2026 14:58:15 +0100 Subject: [PATCH 
02/12] fix endpoint manager initialization --- dojo/importers/default_importer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 1d988848210..93f20e8a593 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -84,6 +84,9 @@ def create_test( api_scan_configuration=self.api_scan_configuration, tags=self.tags, ) + # Initialize the endpoint manager now that self.test is available + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager = EndpointManager(self.test.engagement.product) return self.test def process_scan( @@ -111,9 +114,6 @@ def process_scan( # Get the findings from the parser based on what methods the parser supplies # This could either mean traditional file parsing, or API pull parsing parsed_findings = self.parse_findings(scan, parser) or [] - # Initialize the endpoint manager now that self.test is available - if not settings.V3_FEATURE_LOCATIONS: - self.endpoint_manager = EndpointManager(self.test.engagement.product) new_findings = self.process_findings(parsed_findings, **kwargs) # Close any old findings in the processed list if the the user specified for that # to occur in the form that is then passed to the kwargs From 5539db23c8211e01c77f1ef741062e0cca19a620 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 11 Mar 2026 15:05:01 +0100 Subject: [PATCH 03/12] fix(importers): restore tag inheritance and endpoint_manager init for direct callers - bulk_create bypasses Django post_save signals, so manually call inherit_instance_tags() for each newly created Endpoint to preserve product tag inheritance behavior - Initialize endpoint_manager in create_test() so callers that invoke create_test() + process_findings() directly (without going through process_scan()) don't hit a NoneType error --- dojo/importers/endpoint_manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/dojo/importers/endpoint_manager.py b/dojo/importers/endpoint_manager.py index 7df87f10036..132921a53da 100644 --- a/dojo/importers/endpoint_manager.py +++ b/dojo/importers/endpoint_manager.py @@ -13,6 +13,7 @@ Finding, Product, ) +from dojo.tags_signals import inherit_instance_tags logger = logging.getLogger(__name__) @@ -203,6 +204,10 @@ def _fetch_and_create_endpoints(self) -> dict[EndpointUniqueKey, Endpoint]: if to_create: created = Endpoint.objects.bulk_create(to_create, batch_size=1000) key_to_endpoint.update(zip(to_create_keys, created, strict=True)) + # bulk_create bypasses post_save signals, so manually trigger tag inheritance + # this is not ideal, but we need to take a separate look at the tag inheritance feature itself later + for ep in created: + inherit_instance_tags(ep) self._endpoints_to_create.clear() return key_to_endpoint From 260b31374d463ec77137bd5216a0cea85ca34232 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 14 Mar 2026 16:27:19 +0100 Subject: [PATCH 04/12] refactor(importers): explicit test param for endpoint manager, rename and improve get_or_create_endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `test` parameter to `_create_endpoint_manager()` so callers pass the test explicitly instead of relying on `self.test` being set - Raise `ValueError` with a clear message when `test is None` - Initialize endpoint_manager in `DefaultImporter.process_scan()` after `parse_findings()` (which calls `create_test()` for fresh imports), covering both the sync path and the async Celery task path - Rename `_fetch_and_create_endpoints()` → `get_or_create_endpoints()` - Change return type to `tuple[dict, list]`: returns `(endpoints_by_key, created)` so callers know exactly which endpoints were newly inserted - Rename local `key_to_endpoint` → `endpoints_by_key` for clarity --- dojo/importers/default_importer.py | 16 ++++++++++++--- dojo/importers/default_reimporter.py | 6 +++++- 
dojo/importers/endpoint_manager.py | 30 ++++++++++++++++------------ 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 93f20e8a593..59482eddf17 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -84,11 +84,14 @@ def create_test( api_scan_configuration=self.api_scan_configuration, tags=self.tags, ) - # Initialize the endpoint manager now that self.test is available - if not settings.V3_FEATURE_LOCATIONS: - self.endpoint_manager = EndpointManager(self.test.engagement.product) return self.test + def _create_endpoint_manager(self, test: Test) -> EndpointManager: + """Factory method — override in subclasses to inject a custom EndpointManager.""" + if test is None: + raise ValueError("Cannot create EndpointManager: test is None. Ensure a test is created before initializing the endpoint manager.") + return EndpointManager(test.engagement.product) + def process_scan( self, scan: TemporaryUploadedFile, @@ -113,7 +116,14 @@ def process_scan( parser = self.get_parser() # Get the findings from the parser based on what methods the parser supplies # This could either mean traditional file parsing, or API pull parsing + # Note: for fresh imports, parse_findings() calls create_test() internally, + # so self.test is guaranteed to be set after this call. parsed_findings = self.parse_findings(scan, parser) or [] + # Initialize the endpoint manager now that self.test is available. + # This covers both the sync path (test created inside parse_findings above) + # and the async path (test was already set before process_scan was called). 
+ if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager = self._create_endpoint_manager(self.test) new_findings = self.process_findings(parsed_findings, **kwargs) # Close any old findings in the processed list if the the user specified for that # to occur in the form that is then passed to the kwargs diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index fd4d71c4a38..b423a089bc9 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -78,6 +78,10 @@ def __init__(self, *args, **kwargs): **kwargs, ) + def _create_endpoint_manager(self) -> EndpointManager: + """Factory method — override in subclasses to inject a custom EndpointManager.""" + return EndpointManager(self.test.engagement.product) + def process_scan( self, scan: TemporaryUploadedFile, @@ -98,7 +102,7 @@ def process_scan( logger.debug(f"REIMPORT_SCAN: parameters: {locals()}") # Initialize the endpoint manager now that self.test is available if not settings.V3_FEATURE_LOCATIONS: - self.endpoint_manager = EndpointManager(self.test.engagement.product) + self.endpoint_manager = self._create_endpoint_manager(self.test) # Validate the Tool_Configuration self.verify_tool_configuration_from_test() # Fetch the parser based upon the string version of the scan type diff --git a/dojo/importers/endpoint_manager.py b/dojo/importers/endpoint_manager.py index 132921a53da..62cd7c350d0 100644 --- a/dojo/importers/endpoint_manager.py +++ b/dojo/importers/endpoint_manager.py @@ -160,16 +160,19 @@ def record_statuses_to_mitigate(self, statuses: list[Endpoint_Status]) -> None: """Accumulate endpoint statuses for bulk mitigation in persist().""" self._statuses_to_mitigate.extend(statuses) - def _fetch_and_create_endpoints(self) -> dict[EndpointUniqueKey, Endpoint]: + def get_or_create_endpoints(self) -> tuple[dict[EndpointUniqueKey, Endpoint], list[Endpoint]]: """ - Fetch existing product endpoints and bulk_create missing ones. 
+ For each queued endpoint record, fetch the existing DB row or bulk_create a new one. - Returns a key -> Endpoint mapping for all endpoints referenced in _endpoints_to_create. + Returns: + (endpoints_by_key, created) where: + - endpoints_by_key maps each EndpointUniqueKey to its Endpoint object (existing or new) + - created is the list of Endpoint objects that were actually inserted into the DB """ if not self._endpoints_to_create: - return {} + return {}, [] - key_to_endpoint: dict[EndpointUniqueKey, Endpoint] = {} + endpoints_by_key: dict[EndpointUniqueKey, Endpoint] = {} with transaction.atomic(): # Fetch all existing endpoints for this product @@ -190,27 +193,28 @@ def _fetch_and_create_endpoints(self) -> dict[EndpointUniqueKey, Endpoint]: product_id=ep.product_id, ) # First-by-id wins, matching endpoint_get_or_create behavior - if key not in key_to_endpoint: - key_to_endpoint[key] = ep + if key not in endpoints_by_key: + endpoints_by_key[key] = ep # Determine which endpoints still need creating to_create = [] to_create_keys = [] for key, kwargs in self._endpoints_to_create.items(): - if key not in key_to_endpoint: + if key not in endpoints_by_key: to_create.append(Endpoint(**kwargs)) to_create_keys.append(key) + created: list[Endpoint] = [] if to_create: created = Endpoint.objects.bulk_create(to_create, batch_size=1000) - key_to_endpoint.update(zip(to_create_keys, created, strict=True)) + endpoints_by_key.update(zip(to_create_keys, created, strict=True)) # bulk_create bypasses post_save signals, so manually trigger tag inheritance # this is not ideal, but we need to take a separate look at the tag inheritance feature itself later for ep in created: inherit_instance_tags(ep) self._endpoints_to_create.clear() - return key_to_endpoint + return endpoints_by_key, created def persist(self, user: Dojo_User | None = None) -> None: """ @@ -219,18 +223,18 @@ def persist(self, user: Dojo_User | None = None) -> None: Called at batch boundaries during import/reimport. 
""" # Step 1: Ensure all recorded endpoints exist in DB - key_to_endpoint = self._fetch_and_create_endpoints() + endpoints_by_key, _ = self.get_or_create_endpoints() # Step 2: Bulk-create Endpoint_Status rows if self._statuses_to_create: rows = [ Endpoint_Status( finding=finding, - endpoint=key_to_endpoint[key], + endpoint=endpoints_by_key[key], date=finding.date, ) for finding, key in self._statuses_to_create - if key in key_to_endpoint + if key in endpoints_by_key ] if rows: Endpoint_Status.objects.bulk_create(rows, ignore_conflicts=True, batch_size=1000) From ea1963b812a637b96c98a9226eb478b588a584d9 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 15 Mar 2026 01:07:36 +0100 Subject: [PATCH 05/12] ruff --- dojo/importers/default_importer.py | 3 ++- dojo/importers/endpoint_manager.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 59482eddf17..996085c0d5a 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -89,7 +89,8 @@ def create_test( def _create_endpoint_manager(self, test: Test) -> EndpointManager: """Factory method — override in subclasses to inject a custom EndpointManager.""" if test is None: - raise ValueError("Cannot create EndpointManager: test is None. Ensure a test is created before initializing the endpoint manager.") + msg = "Cannot create EndpointManager: test is None. Ensure a test is created before initializing the endpoint manager." 
+ raise ValueError(msg) return EndpointManager(test.engagement.product) def process_scan( diff --git a/dojo/importers/endpoint_manager.py b/dojo/importers/endpoint_manager.py index 62cd7c350d0..4d89b19072a 100644 --- a/dojo/importers/endpoint_manager.py +++ b/dojo/importers/endpoint_manager.py @@ -168,6 +168,7 @@ def get_or_create_endpoints(self) -> tuple[dict[EndpointUniqueKey, Endpoint], li (endpoints_by_key, created) where: - endpoints_by_key maps each EndpointUniqueKey to its Endpoint object (existing or new) - created is the list of Endpoint objects that were actually inserted into the DB + """ if not self._endpoints_to_create: return {}, [] From f5ab948acad8e71ef7cc0e318fae0d09e42fc4c7 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 15 Mar 2026 10:04:51 +0100 Subject: [PATCH 06/12] fix create endpoint manager --- dojo/importers/default_reimporter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index b423a089bc9..6d462c39e07 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -78,9 +78,9 @@ def __init__(self, *args, **kwargs): **kwargs, ) - def _create_endpoint_manager(self) -> EndpointManager: + def _create_endpoint_manager(self, test: Test) -> EndpointManager: """Factory method — override in subclasses to inject a custom EndpointManager.""" - return EndpointManager(self.test.engagement.product) + return EndpointManager(test.engagement.product) def process_scan( self, From d3a20df5070b6da5dd40988bfa55b82e6a93b57a Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 15 Mar 2026 10:07:44 +0100 Subject: [PATCH 07/12] fix counts --- unittests/test_importers_performance.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 7aeafe31bde..f9d9e7dee47 100644 --- 
a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -457,8 +457,8 @@ def test_deduplication_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=271, - expected_num_async_tasks1=7, - expected_num_queries2=183, - expected_num_async_tasks2=7, + expected_num_queries1=81, + expected_num_async_tasks1=1, + expected_num_queries2=77, + expected_num_async_tasks2=1, ) From 9705de838766a0c8ec6e42ca41ed09ec56f06813 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Mon, 16 Mar 2026 21:56:27 +0100 Subject: [PATCH 08/12] refactor(importers): initialize EndpointManager eagerly in __init__ Instead of lazily initializing endpoint_manager in process_scan() or create_test(), initialize it immediately in __init__ using the product from the required engagement/test parameter. This eliminates the NoneType AttributeError when callers invoke create_test() + process_findings() directly without going through process_scan(). 
- DefaultImporter: uses self.engagement.product (engagement is required) - DefaultReImporter: uses self.test.engagement.product (test is required) - Remove _create_endpoint_manager() factory methods and lazy init sites - Remove self.endpoint_manager = None from BaseImporter.__init__ --- dojo/importers/base_importer.py | 5 ----- dojo/importers/default_importer.py | 14 ++------------ dojo/importers/default_reimporter.py | 9 ++------- 3 files changed, 4 insertions(+), 24 deletions(-) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index 743da0d862a..cdc4f171077 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -82,11 +82,6 @@ def __init__( ImporterOptions.__init__(self, *args, **kwargs) if settings.V3_FEATURE_LOCATIONS: self.location_manager = LocationManager() - else: - # TODO: Delete this after the move to Locations - # EndpointManager is initialized in each subclass's process_scan() - # after self.test is available, because it needs the product. - self.endpoint_manager = None def check_child_implementation_exception(self): """ diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 996085c0d5a..838490663ec 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -57,6 +57,8 @@ def __init__(self, *args, **kwargs): import_type=Test_Import.IMPORT_TYPE, **kwargs, ) + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager = EndpointManager(self.engagement.product) def create_test( self, @@ -86,13 +88,6 @@ def create_test( ) return self.test - def _create_endpoint_manager(self, test: Test) -> EndpointManager: - """Factory method — override in subclasses to inject a custom EndpointManager.""" - if test is None: - msg = "Cannot create EndpointManager: test is None. Ensure a test is created before initializing the endpoint manager." 
- raise ValueError(msg) - return EndpointManager(test.engagement.product) - def process_scan( self, scan: TemporaryUploadedFile, @@ -120,11 +115,6 @@ def process_scan( # Note: for fresh imports, parse_findings() calls create_test() internally, # so self.test is guaranteed to be set after this call. parsed_findings = self.parse_findings(scan, parser) or [] - # Initialize the endpoint manager now that self.test is available. - # This covers both the sync path (test created inside parse_findings above) - # and the async path (test was already set before process_scan was called). - if not settings.V3_FEATURE_LOCATIONS: - self.endpoint_manager = self._create_endpoint_manager(self.test) new_findings = self.process_findings(parsed_findings, **kwargs) # Close any old findings in the processed list if the the user specified for that # to occur in the form that is then passed to the kwargs diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 6d462c39e07..d26b5d63fb7 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -77,10 +77,8 @@ def __init__(self, *args, **kwargs): import_type=Test_Import.REIMPORT_TYPE, **kwargs, ) - - def _create_endpoint_manager(self, test: Test) -> EndpointManager: - """Factory method — override in subclasses to inject a custom EndpointManager.""" - return EndpointManager(test.engagement.product) + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager = EndpointManager(self.test.engagement.product) def process_scan( self, @@ -100,9 +98,6 @@ def process_scan( - Update the test progress """ logger.debug(f"REIMPORT_SCAN: parameters: {locals()}") - # Initialize the endpoint manager now that self.test is available - if not settings.V3_FEATURE_LOCATIONS: - self.endpoint_manager = self._create_endpoint_manager(self.test) # Validate the Tool_Configuration self.verify_tool_configuration_from_test() # Fetch the parser based upon the string version of the scan type From 
c6e25aadff950c1809a6f51b0479e54beba3be70 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Mon, 23 Mar 2026 20:43:33 +0100 Subject: [PATCH 09/12] add test case --- ...hawk_two_vul_same_hashcode_fabricated.json | 77 ++++++ unittests/test_reimport_prefetch.py | 255 ++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 unittests/scans/stackhawk/stackhawk_two_vul_same_hashcode_fabricated.json create mode 100644 unittests/test_reimport_prefetch.py diff --git a/unittests/scans/stackhawk/stackhawk_two_vul_same_hashcode_fabricated.json b/unittests/scans/stackhawk/stackhawk_two_vul_same_hashcode_fabricated.json new file mode 100644 index 00000000000..ab9ed464215 --- /dev/null +++ b/unittests/scans/stackhawk/stackhawk_two_vul_same_hashcode_fabricated.json @@ -0,0 +1,77 @@ +{ + "service": "StackHawk", + "scanCompleted": { + "scan": { + "id": "f3aa6752-8fef-58fa-c854-1d3g8e972f38", + "hawkscanVersion": "2.1.1", + "env": "Development", + "status": "COMPLETED", + "application": "Secured Application", + "startedTimestamp": "2022-02-16T23:07:19.575Z", + "scanURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38" + }, + "scanDuration": "21", + "spiderDuration": "45", + "completedScanStats": { + "urlsCount": "2", + "duration": "66", + "scanResultsStats": { + "totalCount": "2", + "lowCount": "0", + "mediumCount": "0", + "highCount": "2", + "lowTriagedCount": "0", + "mediumTriagedCount": "0", + "highTriagedCount": "0" + } + }, + "findings": [ + { + "pluginId": "90001", + "pluginName": "Insecure JSF ViewState", + "severity": "High", + "host": "https://localhost:9000", + "paths": [ + { + "path": "/app/login", + "method": "GET", + "status": "NEW", + "pathURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38/finding/90001/path/1/message/1" + } + ], + "pathStats": [ + { + "status": "NEW", + "count": 1 + } + ], + "totalCount": "1", + "category": "Security Misconfiguration", + "findingURL": 
"https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38/finding/90001" + }, + { + "pluginId": "90002", + "pluginName": "Insecure HTTP Header", + "severity": "High", + "host": "https://localhost:9000", + "paths": [ + { + "path": "/app/dashboard", + "method": "GET", + "status": "NEW", + "pathURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38/finding/90002/path/1/message/1" + } + ], + "pathStats": [ + { + "status": "NEW", + "count": 1 + } + ], + "totalCount": "1", + "category": "Security Misconfiguration", + "findingURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38/finding/90002" + } + ] + } +} diff --git a/unittests/test_reimport_prefetch.py b/unittests/test_reimport_prefetch.py new file mode 100644 index 00000000000..67265135925 --- /dev/null +++ b/unittests/test_reimport_prefetch.py @@ -0,0 +1,255 @@ +""" +Regression tests for handling duplicate findings within the same reimport report. + +When a scan report contains two findings that produce the same hash_code, the first +creates a new finding via process_finding_that_was_not_matched() and gets added to +the candidate dictionaries. When the second finding in the same batch matches against +this newly-created finding, accessing existing_finding.status_finding_non_special +raises AttributeError because the finding was never loaded through +build_candidate_scope_queryset (which sets up the Prefetch with to_attr). 
+ +Bugfix: https://github.com/DefectDojo/django-DefectDojo/pull/14569 +Batch endpoint optimization (related): https://github.com/DefectDojo/django-DefectDojo/pull/14489 +""" + +from unittest.mock import patch + +from crum import impersonate +from django.conf import settings +from django.test import override_settings +from django.utils import timezone + +from dojo.importers.default_reimporter import DefaultReImporter +from dojo.location.models import LocationFindingReference +from dojo.location.status import FindingLocationStatus +from dojo.models import ( + Development_Environment, + Dojo_User, + Endpoint, + Endpoint_Status, + Engagement, + Finding, + Product, + Product_Type, + Test, + Test_Type, + User, + UserContactInfo, +) + +from .dojo_test_case import DojoTestCase, get_unit_tests_scans_path, skip_unless_v2 + +SCAN_TYPE = "StackHawk HawkScan" + +# Reimport: two findings with different pluginIds (so the parser keeps both as separate +# findings) but identical hash_code fields (component_name + component_version). +# Each finding has one unique endpoint/location path (/app/login and /app/dashboard). +# This triggers the same-batch matching scenario where the second finding matches +# against the first one that was just created in the same reimport batch. +REIMPORT_SCAN = get_unit_tests_scans_path("stackhawk") / "stackhawk_two_vul_same_hashcode_fabricated.json" + + +def _hashcode_fields_without_vuln_id(): + """ + Return a copy of HASHCODE_FIELDS_PER_SCANNER with StackHawk configured to + hash only on component_name + component_version (excluding vuln_id_from_tool). + + By default StackHawk hashes on [vuln_id_from_tool, component_name, component_version]. + Since vuln_id_from_tool maps to pluginId and the parser deduplicates by pluginId, + it is normally impossible for two separate findings to share a hash_code. 
+ + By removing vuln_id_from_tool from the hash fields, two findings with different + pluginIds but the same application/env metadata will produce the same hash_code, + allowing us to exercise the same-batch matching code path. + """ + fields = dict(settings.HASHCODE_FIELDS_PER_SCANNER) + fields[SCAN_TYPE] = ["component_name", "component_version"] + return fields + + +class ReimportDuplicateFindingsTestBase(DojoTestCase): + + """Shared setup for duplicate-findings-in-same-report tests.""" + + def setUp(self): + super().setUp() + testuser, _ = User.objects.get_or_create(username="admin") + UserContactInfo.objects.get_or_create(user=testuser, defaults={"block_execution": True}) + + self.system_settings(enable_deduplication=True) + self.system_settings(enable_product_grade=False) + + product_type, _ = Product_Type.objects.get_or_create(name="test") + product, _ = Product.objects.get_or_create( + name="ReimportPrefetchTest", + description="Test", + prod_type=product_type, + ) + engagement, _ = Engagement.objects.get_or_create( + name="Test Engagement", + product=product, + target_start=timezone.now(), + target_end=timezone.now(), + ) + self.lead, _ = User.objects.get_or_create(username="admin") + environment, _ = Development_Environment.objects.get_or_create(name="Development") + test_type, _ = Test_Type.objects.get_or_create(name=SCAN_TYPE) + self.test = Test.objects.create( + engagement=engagement, + test_type=test_type, + scan_type=SCAN_TYPE, + target_start=timezone.now(), + target_end=timezone.now(), + environment=environment, + ) + self.product = product + + def _reimport_with_overridden_hashcode(self): + """ + Reimport the two-finding scan file with overridden hash settings. + + We mock get_custom_method to return None so that the OSS hash_code logic + is used (reads from django.settings, which override_settings can modify). 
+ In Pro, compute_hash_code is overridden via FINDING_COMPUTE_HASH_METHOD to + read hash settings from the TunableSettings database model instead of + django.settings, which makes override_settings ineffective. + """ + hashcode_override = _hashcode_fields_without_vuln_id() + with ( + impersonate(Dojo_User.objects.get(username="admin")), + override_settings(HASHCODE_FIELDS_PER_SCANNER=hashcode_override), + patch("dojo.utils.get_custom_method", return_value=None), + REIMPORT_SCAN.open(encoding="utf-8") as scan, + ): + reimporter = DefaultReImporter( + test=self.test, + user=self.lead, + lead=self.lead, + scan_date=None, + minimum_severity="Info", + active=True, + verified=True, + sync=True, + scan_type=SCAN_TYPE, + ) + return reimporter.process_scan(scan) + + +@skip_unless_v2 +class TestReimportDuplicateFindingsEndpointHandling(ReimportDuplicateFindingsTestBase): + + """Regression test: reimport must handle endpoints correctly for duplicate findings in the same report.""" + + def test_reimport_duplicate_findings_in_same_report_endpoints(self): + """ + Reimporting a report with two findings that share the same hash_code must + not raise AttributeError on status_finding_non_special, and must correctly + create endpoints for both findings. + + The scan file has two findings with different pluginIds, each with one + unique endpoint (/app/login and /app/dashboard). With the overridden hash + settings, both produce the same hash_code. The first finding is created as + new and added to candidates. The second finding matches against it (a + batch-created finding), and its endpoint is added via finding_post_processing. + """ + # This previously raised: + # AttributeError: 'Finding' object has no attribute 'status_finding_non_special' + test, _, len_new, len_closed, _, _, _ = self._reimport_with_overridden_hashcode() + + # The first finding is new (empty test, no candidates). The second finding + # matches the first (same hash_code) so it is not counted as new. 
+ self.assertEqual(len_new, 1, "Reimport should create one new finding (second matches first)") + self.assertEqual(len_closed, 0, "No findings should be closed") + + # Only one finding should exist — the second was matched to the first + findings = Finding.objects.filter(test=self.test) + self.assertEqual(findings.count(), 1, "Only one finding should exist after reimport") + + finding = findings.first() + + # The first finding (new, pluginId=90001) creates endpoint /app/login. + # The second finding (pluginId=90002) matches the first via hash_code. + # finding_post_processing runs for BOTH iterations, adding each finding's + # endpoints to the (single) matched finding. So the finding ends up with + # 2 endpoints: /app/login (from the first) and /app/dashboard (from the second). + endpoints = Endpoint.objects.filter(product=self.product) + self.assertEqual(endpoints.count(), 2, "Two endpoints should be created (one per finding in report)") + + endpoint_statuses = Endpoint_Status.objects.filter(finding=finding) + self.assertEqual(endpoint_statuses.count(), 2, "Finding should have two endpoint statuses") + + # Because the second finding is dynamic, update_endpoint_status() compares + # the first finding's existing endpoint statuses against the second finding's + # unsaved_endpoints. The second finding only has /app/dashboard, so /app/login + # (which belongs to the first finding) is considered "no longer present" and + # gets mitigated. This is arguably wrong for batch-created findings — both + # endpoints came from the same report — but it is the current behavior. 
+ self.assertEqual( + endpoint_statuses.filter(mitigated=False).count(), 1, + "One endpoint status should be active (/app/dashboard from the matched finding)", + ) + self.assertEqual( + endpoint_statuses.filter(mitigated=True).count(), 1, + "One endpoint status should be mitigated (/app/login — mitigated by update_endpoint_status " + "because it is not in the second finding's endpoint list)", + ) + + +@override_settings(V3_FEATURE_LOCATIONS=True) +class TestReimportDuplicateFindingsLocationHandling(ReimportDuplicateFindingsTestBase): + + """Test that reimport handles locations correctly for duplicate findings in the same report.""" + + def test_reimport_duplicate_findings_in_same_report_locations(self): + """ + Reimporting a report with two findings that share the same hash_code must + correctly create locations for both findings. + + The locations code does not use the to_attr prefetch that caused the + AttributeError in the endpoint code path, so it does not crash. However, + the same logical issue applies: update_location_status() compares the + batch-created finding's locations against the second finding's unsaved + locations, mitigating locations that are "no longer present". + + The scan file has two findings with different pluginIds, each with one + unique location (https://localhost:9000/app/login and .../app/dashboard). + With the overridden hash settings, both produce the same hash_code. The + first finding is created as new and added to candidates. The second finding + matches against it (a batch-created finding). + """ + test, _, len_new, len_closed, _, _, _ = self._reimport_with_overridden_hashcode() + + # The first finding is new (empty test, no candidates). The second finding + # matches the first (same hash_code) so it is not counted as new. 
+ self.assertEqual(len_new, 1, "Reimport should create one new finding (second matches first)") + self.assertEqual(len_closed, 0, "No findings should be closed") + + # Only one finding should exist — the second was matched to the first + findings = Finding.objects.filter(test=self.test) + self.assertEqual(findings.count(), 1, "Only one finding should exist after reimport") + + finding = findings.first() + + # The first finding (new, pluginId=90001) creates location for /app/login. + # The second finding (pluginId=90002) matches the first via hash_code. + # finding_post_processing runs for BOTH iterations, adding each finding's + # locations to the (single) matched finding. So the finding ends up with + # 2 location references pointing to 2 distinct Location objects. + location_refs = LocationFindingReference.objects.filter(finding=finding) + self.assertEqual(location_refs.count(), 2, "Finding should have two location references") + + # Same behavior as endpoints: update_location_status() compares the first + # finding's existing location refs against the second finding's unsaved_locations. + # The second finding only has /app/dashboard, so /app/login (which belongs to + # the first finding) is considered "no longer present" and gets mitigated. + # This is arguably wrong for batch-created findings — both locations came from + # the same report — but it is the current behavior. 
+ self.assertEqual( + location_refs.filter(status=FindingLocationStatus.Active).count(), 1, + "One location ref should be active (/app/dashboard from the matched finding)", + ) + self.assertEqual( + location_refs.filter(status=FindingLocationStatus.Mitigated).count(), 1, + "One location ref should be mitigated (/app/login — mitigated by update_location_status " + "because it is not in the second finding's location list)", + ) From 2a40617fa19efd6e4d0764b21edd52fe0091f3b9 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Mon, 23 Mar 2026 21:46:00 +0100 Subject: [PATCH 10/12] ruff --- unittests/test_reimport_prefetch.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/unittests/test_reimport_prefetch.py b/unittests/test_reimport_prefetch.py index 67265135925..71355dc209e 100644 --- a/unittests/test_reimport_prefetch.py +++ b/unittests/test_reimport_prefetch.py @@ -154,7 +154,7 @@ def test_reimport_duplicate_findings_in_same_report_endpoints(self): """ # This previously raised: # AttributeError: 'Finding' object has no attribute 'status_finding_non_special' - test, _, len_new, len_closed, _, _, _ = self._reimport_with_overridden_hashcode() + _test, _, len_new, len_closed, _, _, _ = self._reimport_with_overridden_hashcode() # The first finding is new (empty test, no candidates). The second finding # matches the first (same hash_code) so it is not counted as new. @@ -185,11 +185,13 @@ def test_reimport_duplicate_findings_in_same_report_endpoints(self): # gets mitigated. This is arguably wrong for batch-created findings — both # endpoints came from the same report — but it is the current behavior. 
self.assertEqual( - endpoint_statuses.filter(mitigated=False).count(), 1, + endpoint_statuses.filter(mitigated=False).count(), + 1, "One endpoint status should be active (/app/dashboard from the matched finding)", ) self.assertEqual( - endpoint_statuses.filter(mitigated=True).count(), 1, + endpoint_statuses.filter(mitigated=True).count(), + 1, "One endpoint status should be mitigated (/app/login — mitigated by update_endpoint_status " "because it is not in the second finding's endpoint list)", ) @@ -217,7 +219,7 @@ def test_reimport_duplicate_findings_in_same_report_locations(self): first finding is created as new and added to candidates. The second finding matches against it (a batch-created finding). """ - test, _, len_new, len_closed, _, _, _ = self._reimport_with_overridden_hashcode() + _test, _, len_new, len_closed, _, _, _ = self._reimport_with_overridden_hashcode() # The first finding is new (empty test, no candidates). The second finding # matches the first (same hash_code) so it is not counted as new. @@ -245,11 +247,13 @@ def test_reimport_duplicate_findings_in_same_report_locations(self): # This is arguably wrong for batch-created findings — both locations came from # the same report — but it is the current behavior. 
self.assertEqual( - location_refs.filter(status=FindingLocationStatus.Active).count(), 1, + location_refs.filter(status=FindingLocationStatus.Active).count(), + 1, "One location ref should be active (/app/dashboard from the matched finding)", ) self.assertEqual( - location_refs.filter(status=FindingLocationStatus.Mitigated).count(), 1, + location_refs.filter(status=FindingLocationStatus.Mitigated).count(), + 1, "One location ref should be mitigated (/app/login — mitigated by update_location_status " "because it is not in the second finding's location list)", ) From 56e4e9e27021a79782f6e1201182b40e3b8689f1 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Tue, 24 Mar 2026 18:44:36 +0100 Subject: [PATCH 11/12] remove mock --- unittests/test_reimport_prefetch.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/unittests/test_reimport_prefetch.py b/unittests/test_reimport_prefetch.py index 71355dc209e..6d5c5ce4ba5 100644 --- a/unittests/test_reimport_prefetch.py +++ b/unittests/test_reimport_prefetch.py @@ -105,20 +105,11 @@ def setUp(self): self.product = product def _reimport_with_overridden_hashcode(self): - """ - Reimport the two-finding scan file with overridden hash settings. - - We mock get_custom_method to return None so that the OSS hash_code logic - is used (reads from django.settings, which override_settings can modify). - In Pro, compute_hash_code is overridden via FINDING_COMPUTE_HASH_METHOD to - read hash settings from the TunableSettings database model instead of - django.settings, which makes override_settings ineffective. 
- """ + """Reimport the two-finding scan file with overridden hash settings.""" hashcode_override = _hashcode_fields_without_vuln_id() with ( impersonate(Dojo_User.objects.get(username="admin")), override_settings(HASHCODE_FIELDS_PER_SCANNER=hashcode_override), - patch("dojo.utils.get_custom_method", return_value=None), REIMPORT_SCAN.open(encoding="utf-8") as scan, ): reimporter = DefaultReImporter( From a6209f797d67212ec74e11a7e529e3bcaaa35298 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Tue, 24 Mar 2026 19:04:46 +0100 Subject: [PATCH 12/12] fix: replace status_finding_non_special prefetch with Python filtering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the to_attr Prefetch for status_finding_non_special and instead prefetch all endpoint statuses, filtering non-special ones in Python via EndpointManager.get_non_special_endpoint_statuses(). This avoids AttributeError when a finding created during the same reimport batch (via add_new_finding_to_candidates) is matched by a subsequent finding — such findings were never loaded through the prefetch queryset and lacked the to_attr attribute. 
See: https://github.com/DefectDojo/django-DefectDojo/pull/14569 --- dojo/finding/deduplication.py | 15 ++++++++------- dojo/importers/default_reimporter.py | 5 +++-- dojo/importers/endpoint_manager.py | 21 ++++++++++++++++++--- unittests/test_reimport_prefetch.py | 24 ++++++++---------------- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 33f89a16e58..99d416f44f0 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -336,17 +336,18 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): # Base prefetches for both modes prefetch_list = ["endpoints", "vulnerability_id_set", "found_by"] - # Additional prefetches for reimport mode: fetch only non-special endpoint statuses with their - # endpoint joined in, so endpoint_manager can read status_finding_non_special directly without - # any extra DB queries + # Prefetch all endpoint statuses with their endpoint for reimport mode. + # The non-special filtering (excluding false_positive, out_of_scope, risk_accepted) + # is done in Python by EndpointManager.get_non_special_endpoint_statuses(). + # We avoid using to_attr here because findings created during the same reimport + # batch (via add_new_finding_to_candidates) are never loaded through this queryset + # and would lack the to_attr, causing an AttributeError. 
+ # See: https://github.com/DefectDojo/django-DefectDojo/pull/14569 if mode == "reimport": prefetch_list.append( Prefetch( "status_finding", - queryset=Endpoint_Status.objects.exclude( - Q(false_positive=True) | Q(out_of_scope=True) | Q(risk_accepted=True), - ).select_related("endpoint"), - to_attr="status_finding_non_special", + queryset=Endpoint_Status.objects.select_related("endpoint"), ), ) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index d26b5d63fb7..5e57125e49b 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -772,8 +772,9 @@ def process_matched_mitigated_finding( else: # TODO: Delete this after the move to Locations # Accumulate endpoint statuses for bulk reactivation in persist() - # status_finding_non_special is prefetched by build_candidate_scope_queryset - self.endpoint_manager.record_statuses_to_reactivate(existing_finding.status_finding_non_special) + self.endpoint_manager.record_statuses_to_reactivate( + self.endpoint_manager.get_non_special_endpoint_statuses(existing_finding), + ) existing_finding.notes.add(note) self.reactivated_items.append(existing_finding) # The new finding is active while the existing on is mitigated. The existing finding needs to diff --git a/dojo/importers/endpoint_manager.py b/dojo/importers/endpoint_manager.py index 4d89b19072a..d7c3cd1df46 100644 --- a/dojo/importers/endpoint_manager.py +++ b/dojo/importers/endpoint_manager.py @@ -120,6 +120,23 @@ def record_status_for_create(self, finding: Finding, key: EndpointUniqueKey) -> """Record that a finding should be linked to an endpoint (identified by key) via Endpoint_Status.""" self._statuses_to_create.append((finding, key)) + @staticmethod + def get_non_special_endpoint_statuses(finding: Finding) -> list[Endpoint_Status]: + """ + Return endpoint statuses that are not false_positive, out_of_scope, or risk_accepted. 
+
+        Uses finding.status_finding.all() which is served from the prefetch cache when the
+        finding was loaded through build_candidate_scope_queryset, and falls back to a DB
+        query otherwise (e.g. for findings created during the same reimport batch).
+
+        This might be inefficient if lots of internal duplicates are in the report.
+        But this should be limited as most parsers dedupe during parsing and merge the endpoints.
+        """
+        return [
+            eps for eps in finding.status_finding.all()
+            if not eps.false_positive and not eps.out_of_scope and not eps.risk_accepted
+        ]
+
     def update_endpoint_status(
         self,
         existing_finding: Finding,
@@ -131,9 +148,7 @@ def update_endpoint_status(
 
         The actual bulk_update happens in persist().
         """
-        # status_finding_non_special is prefetched by build_candidate_scope_queryset with the
-        # special-status exclusion and endpoint select_related already applied at the DB level
-        existing_finding_endpoint_status_list = existing_finding.status_finding_non_special
+        existing_finding_endpoint_status_list = self.get_non_special_endpoint_statuses(existing_finding)
         new_finding_endpoints_list = new_finding.unsaved_endpoints
         if new_finding.is_mitigated:
             # New finding is mitigated, so mitigate all old endpoints
diff --git a/unittests/test_reimport_prefetch.py b/unittests/test_reimport_prefetch.py
index 6d5c5ce4ba5..d421b0129dc 100644
--- a/unittests/test_reimport_prefetch.py
+++ b/unittests/test_reimport_prefetch.py
@@ -12,8 +12,6 @@
 Batch endpoint optimization (related): https://github.com/DefectDojo/django-DefectDojo/pull/14489
 """
 
-from unittest.mock import patch
-
 from crum import impersonate
 from django.conf import settings
 from django.test import override_settings
@@ -169,22 +167,16 @@ def test_reimport_duplicate_findings_in_same_report_endpoints(self):
         endpoint_statuses = Endpoint_Status.objects.filter(finding=finding)
         self.assertEqual(endpoint_statuses.count(), 2, "Finding should have two endpoint statuses")
 
-        # Because the second finding is dynamic, 
update_endpoint_status() compares - # the first finding's existing endpoint statuses against the second finding's - # unsaved_endpoints. The second finding only has /app/dashboard, so /app/login - # (which belongs to the first finding) is considered "no longer present" and - # gets mitigated. This is arguably wrong for batch-created findings — both - # endpoints came from the same report — but it is the current behavior. + # With batched endpoint creation, endpoint statuses for the batch-created + # finding are not yet persisted when update_endpoint_status() runs. So the + # comparison finds no existing statuses and nothing gets mitigated — both + # endpoint statuses remain active. This is actually the correct outcome: + # both endpoints came from the same report and should both be active. self.assertEqual( endpoint_statuses.filter(mitigated=False).count(), - 1, - "One endpoint status should be active (/app/dashboard from the matched finding)", - ) - self.assertEqual( - endpoint_statuses.filter(mitigated=True).count(), - 1, - "One endpoint status should be mitigated (/app/login — mitigated by update_endpoint_status " - "because it is not in the second finding's endpoint list)", + 2, + "Both endpoint statuses should be active (batch-created finding has no persisted " + "statuses when update_endpoint_status runs, so nothing gets mitigated)", )