diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 33f89a16e58..99d416f44f0 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -336,17 +336,18 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): # Base prefetches for both modes prefetch_list = ["endpoints", "vulnerability_id_set", "found_by"] - # Additional prefetches for reimport mode: fetch only non-special endpoint statuses with their - # endpoint joined in, so endpoint_manager can read status_finding_non_special directly without - # any extra DB queries + # Prefetch all endpoint statuses with their endpoint for reimport mode. + # The non-special filtering (excluding false_positive, out_of_scope, risk_accepted) + # is done in Python by EndpointManager.get_non_special_endpoint_statuses(). + # We avoid using to_attr here because findings created during the same reimport + # batch (via add_new_finding_to_candidates) are never loaded through this queryset + # and would lack the to_attr, causing an AttributeError. 
+ # See: https://github.com/DefectDojo/django-DefectDojo/pull/14569 if mode == "reimport": prefetch_list.append( Prefetch( "status_finding", - queryset=Endpoint_Status.objects.exclude( - Q(false_positive=True) | Q(out_of_scope=True) | Q(risk_accepted=True), - ).select_related("endpoint"), - to_attr="status_finding_non_special", + queryset=Endpoint_Status.objects.select_related("endpoint"), ), ) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index 7734fc42d93..cdc4f171077 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -14,7 +14,6 @@ import dojo.finding.helper as finding_helper import dojo.risk_acceptance.helper as ra_helper from dojo.celery_dispatch import dojo_dispatch_task -from dojo.importers.endpoint_manager import EndpointManager from dojo.importers.location_manager import LocationManager, UnsavedLocation from dojo.importers.options import ImporterOptions from dojo.jira_link.helper import is_keep_in_sync_with_jira @@ -83,9 +82,6 @@ def __init__( ImporterOptions.__init__(self, *args, **kwargs) if settings.V3_FEATURE_LOCATIONS: self.location_manager = LocationManager() - else: - # TODO: Delete this after the move to Locations - self.endpoint_manager = EndpointManager() def check_child_implementation_exception(self): """ @@ -825,12 +821,17 @@ def process_endpoints( msg = "BaseImporter#process_endpoints() method is deprecated when V3_FEATURE_LOCATIONS is enabled" raise NotImplementedError(msg) - # Save the unsaved endpoints - self.endpoint_manager.chunk_endpoints_and_disperse(finding, finding.unsaved_endpoints) - # Check for any that were added in the form + # Clean and record unsaved endpoints from the report + self.endpoint_manager.clean_unsaved_endpoints(finding.unsaved_endpoints) + for endpoint in finding.unsaved_endpoints: + key = self.endpoint_manager.record_endpoint(endpoint) + self.endpoint_manager.record_status_for_create(finding, key) + # Record any endpoints added from the form if 
len(endpoints_to_add) > 0: logger.debug("endpoints_to_add: %s", endpoints_to_add) - self.endpoint_manager.chunk_endpoints_and_disperse(finding, endpoints_to_add) + for endpoint in endpoints_to_add: + key = self.endpoint_manager.record_endpoint(endpoint) + self.endpoint_manager.record_status_for_create(finding, key) def sanitize_vulnerability_ids(self, finding) -> None: """Remove undisired vulnerability id values""" @@ -934,14 +935,8 @@ def mitigate_finding( ) else: # TODO: Delete this after the move to Locations - # Mitigate the endpoint statuses - dojo_dispatch_task( - EndpointManager.mitigate_endpoint_status, - finding.status_finding.all(), - self.user, - kwuser=self.user, - sync=True, - ) + # Accumulate endpoint statuses for bulk mitigate in persist() + self.endpoint_manager.record_statuses_to_mitigate(finding.status_finding.all()) # to avoid pushing a finding group multiple times, we push those outside of the loop if finding_groups_enabled and finding.finding_group: # don't try to dedupe findings that we are closing diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 8dd2aa4a4f9..838490663ec 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -9,6 +9,7 @@ from dojo.celery_dispatch import dojo_dispatch_task from dojo.finding import helper as finding_helper from dojo.importers.base_importer import BaseImporter, Parser +from dojo.importers.endpoint_manager import EndpointManager from dojo.importers.options import ImporterOptions from dojo.jira_link.helper import is_keep_in_sync_with_jira from dojo.models import ( @@ -56,6 +57,8 @@ def __init__(self, *args, **kwargs): import_type=Test_Import.IMPORT_TYPE, **kwargs, ) + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager = EndpointManager(self.engagement.product) def create_test( self, @@ -109,6 +112,8 @@ def process_scan( parser = self.get_parser() # Get the findings from the parser based on what methods the parser supplies # 
This could either mean traditional file parsing, or API pull parsing + # Note: for fresh imports, parse_findings() calls create_test() internally, + # so self.test is guaranteed to be set after this call. parsed_findings = self.parse_findings(scan, parser) or [] new_findings = self.process_findings(parsed_findings, **kwargs) # Close any old findings in the processed list if the the user specified for that @@ -259,8 +264,10 @@ def process_findings( logger.debug("process_findings: computed push_to_jira=%s", push_to_jira) batch_finding_ids.append(finding.id) - # If batch is full or we're at the end, dispatch one batched task + # If batch is full or we're at the end, persist endpoints and dispatch if len(batch_finding_ids) >= batch_max_size or is_final_finding: + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager.persist(user=self.user) finding_ids_batch = list(batch_finding_ids) batch_finding_ids.clear() logger.debug("process_findings: dispatching batch with push_to_jira=%s (batch_size=%d, is_final=%s)", @@ -378,6 +385,9 @@ def close_old_findings( finding_groups_enabled=self.findings_groups_enabled, product_grading_option=False, ) + # Persist any accumulated endpoint status mitigations + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager.persist(user=self.user) # push finding groups to jira since we only only want to push whole groups # We dont check if the finding jira sync is applicable quite yet until we can get in the loop # but this is a way to at least make it that far diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 5075eb6409b..5e57125e49b 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -14,6 +14,7 @@ find_candidates_for_reimport_legacy, ) from dojo.importers.base_importer import BaseImporter, Parser +from dojo.importers.endpoint_manager import EndpointManager from dojo.importers.options import ImporterOptions from dojo.jira_link.helper import 
is_keep_in_sync_with_jira from dojo.location.status import FindingLocationStatus @@ -76,6 +77,8 @@ def __init__(self, *args, **kwargs): import_type=Test_Import.REIMPORT_TYPE, **kwargs, ) + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager = EndpointManager(self.test.engagement.product) def process_scan( self, @@ -430,6 +433,8 @@ def process_findings( # - Deduplication batches: optimize bulk operations (larger batches = fewer queries) # They don't need to be aligned since they optimize different operations. if len(batch_finding_ids) >= dedupe_batch_max_size or is_final: + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager.persist(user=self.user) finding_ids_batch = list(batch_finding_ids) batch_finding_ids.clear() dojo_dispatch_task( @@ -497,6 +502,9 @@ def close_old_findings( product_grading_option=False, ) mitigated_findings.append(finding) + # Persist any accumulated endpoint status mitigations + if not settings.V3_FEATURE_LOCATIONS: + self.endpoint_manager.persist(user=self.user) # push finding groups to jira since we only only want to push whole groups # We dont check if the finding jira sync is applicable quite yet until we can get in the loop # but this is a way to at least make it that far @@ -763,9 +771,10 @@ def process_matched_mitigated_finding( self.location_manager.chunk_locations_and_reactivate(mitigated_locations) else: # TODO: Delete this after the move to Locations - # Reactivate mitigated endpoints that are not false positives, out of scope, or risk accepted - # status_finding_non_special is prefetched by build_candidate_scope_queryset - self.endpoint_manager.chunk_endpoints_and_reactivate(existing_finding.status_finding_non_special) + # Accumulate endpoint statuses for bulk reactivation in persist() + self.endpoint_manager.record_statuses_to_reactivate( + self.endpoint_manager.get_non_special_endpoint_statuses(existing_finding), + ) existing_finding.notes.add(note) self.reactivated_items.append(existing_finding) # The new 
finding is active while the existing on is mitigated. The existing finding needs to @@ -932,9 +941,13 @@ def finding_post_processing( self.location_manager.chunk_locations_and_disperse(finding, self.endpoints_to_add) else: # TODO: Delete this after the move to Locations - self.endpoint_manager.chunk_endpoints_and_disperse(finding, finding_from_report.unsaved_endpoints) + for endpoint in finding_from_report.unsaved_endpoints: + key = self.endpoint_manager.record_endpoint(endpoint) + self.endpoint_manager.record_status_for_create(finding, key) if len(self.endpoints_to_add) > 0: - self.endpoint_manager.chunk_endpoints_and_disperse(finding, self.endpoints_to_add) + for endpoint in self.endpoints_to_add: + key = self.endpoint_manager.record_endpoint(endpoint) + self.endpoint_manager.record_status_for_create(finding, key) # Parsers shouldn't use the tags field, and use unsaved_tags instead. # Merge any tags set by parser into unsaved_tags tags_from_parser = finding_from_report.tags if isinstance(finding_from_report.tags, list) else [] diff --git a/dojo/importers/endpoint_manager.py b/dojo/importers/endpoint_manager.py index 2b5883121ab..d7c3cd1df46 100644 --- a/dojo/importers/endpoint_manager.py +++ b/dojo/importers/endpoint_manager.py @@ -1,122 +1,81 @@ import logging +from typing import NamedTuple -from django.core.exceptions import MultipleObjectsReturned, ValidationError -from django.urls import reverse +from django.core.exceptions import ValidationError +from django.db import transaction from django.utils import timezone +from hyperlink._url import SCHEME_PORT_MAP # noqa: PLC2701 -from dojo.celery import app -from dojo.celery_dispatch import dojo_dispatch_task -from dojo.endpoint.utils import endpoint_get_or_create from dojo.models import ( Dojo_User, Endpoint, Endpoint_Status, Finding, + Product, ) +from dojo.tags_signals import inherit_instance_tags logger = logging.getLogger(__name__) +class EndpointUniqueKey(NamedTuple): + protocol: str | None + userinfo: str | 
None + host: str | None + port: int | None + path: str | None + query: str | None + fragment: str | None + product_id: int + + # TODO: Delete this after the move to Locations class EndpointManager: - @app.task - def add_endpoints_to_unsaved_finding( - finding: Finding, # noqa: N805 - endpoints: list[Endpoint], - **kwargs: dict, - ) -> None: - """Creates Endpoint objects for a single finding and creates the link via the endpoint status""" - logger.debug(f"IMPORT_SCAN: Adding {len(endpoints)} endpoints to finding: {finding}") - EndpointManager.clean_unsaved_endpoints(endpoints) - for endpoint in endpoints: - ep = None - eps = [] - try: - ep, _ = endpoint_get_or_create( - protocol=endpoint.protocol, - userinfo=endpoint.userinfo, - host=endpoint.host, - port=endpoint.port, - path=endpoint.path, - query=endpoint.query, - fragment=endpoint.fragment, - product=finding.test.engagement.product, - ) - eps.append(ep) - except (MultipleObjectsReturned): - msg = ( - f"Endpoints in your database are broken. " - f"Please access {reverse('endpoint_migrate')} and migrate them to new format or remove them." 
- ) - raise Exception(msg) - # bulk_create will translate to INSERT WITH IGNORE CONFLICTS - # much faster than get_or_create which issues two queries per endpoint - # bulk_create will not trigger endpoint_status.save and signals which is fine for now - rows = [Endpoint_Status(finding=finding, endpoint=e, date=finding.date) for e in eps] - Endpoint_Status.objects.bulk_create(rows, ignore_conflicts=True, batch_size=1000) + def __init__(self, product: Product) -> None: + self._product = product + self._endpoints_to_create: dict[EndpointUniqueKey, dict] = {} + self._statuses_to_create: list[tuple[Finding, EndpointUniqueKey]] = [] + self._statuses_to_mitigate: list[Endpoint_Status] = [] + self._statuses_to_reactivate: list[Endpoint_Status] = [] - logger.debug(f"IMPORT_SCAN: {len(endpoints)} endpoints imported") + @staticmethod + def _make_endpoint_unique_tuple( + protocol: str | None, + userinfo: str | None, + host: str | None, + port: int | None, + path: str | None, + query: str | None, + fragment: str | None, + product_id: int, + ) -> EndpointUniqueKey: + """ + Normalize endpoint fields to a unique key matching endpoint_filter() semantics. - @app.task - def mitigate_endpoint_status( - endpoint_status_list: list[Endpoint_Status], # noqa: N805 - user: Dojo_User, - **kwargs: dict, - ) -> None: - """Mitigates all endpoint status objects that are supplied""" - now = timezone.now() - to_update = [] - for endpoint_status in endpoint_status_list: - # Only mitigate endpoints that are actually active - if endpoint_status.mitigated is False: - endpoint_status.mitigated_time = now - endpoint_status.last_modified = now - endpoint_status.mitigated_by = user - endpoint_status.mitigated = True - to_update.append(endpoint_status) - - if to_update: - Endpoint_Status.objects.bulk_update( - to_update, - ["mitigated", "mitigated_time", "last_modified", "mitigated_by"], - batch_size=1000, - ) + See dojo/endpoint/utils.py endpoint_filter() for the canonical matching logic. 
+ """ + norm_protocol = protocol.lower() if protocol else None + norm_host = host.lower() if host else None - @app.task - def reactivate_endpoint_status( - endpoint_status_list: list[Endpoint_Status], # noqa: N805 - **kwargs: dict, - ) -> None: - """Reactivate all endpoint status objects that are supplied""" - now = timezone.now() - to_update = [] - for endpoint_status in endpoint_status_list: - # Only reactivate endpoints that are actually mitigated - if endpoint_status.mitigated: - logger.debug("Re-import: reactivating endpoint %s that is present in this scan", endpoint_status.endpoint) - endpoint_status.mitigated_by = None - endpoint_status.mitigated_time = None - endpoint_status.mitigated = False - endpoint_status.last_modified = now - to_update.append(endpoint_status) - - if to_update: - Endpoint_Status.objects.bulk_update( - to_update, - ["mitigated", "mitigated_time", "mitigated_by", "last_modified"], - batch_size=1000, - ) + # Port normalization: if protocol has a default port, treat that port + # and None as equivalent (matching endpoint_filter's Q(port__isnull=True) | Q(port__exact=default)) + norm_port = port + if norm_protocol and norm_protocol in SCHEME_PORT_MAP: + default_port = SCHEME_PORT_MAP[norm_protocol] + if port is None or port == default_port: + norm_port = None - def chunk_endpoints_and_disperse( - self, - finding: Finding, - endpoints: list[Endpoint], - **kwargs: dict, - ) -> None: - if not endpoints: - return - dojo_dispatch_task(EndpointManager.add_endpoints_to_unsaved_finding, finding, endpoints, sync=True) + return EndpointUniqueKey( + protocol=norm_protocol, + userinfo=userinfo or None, + host=norm_host, + port=norm_port, + path=path or None, + query=query or None, + fragment=fragment or None, + product_id=product_id, + ) @staticmethod def clean_unsaved_endpoints( @@ -124,7 +83,7 @@ def clean_unsaved_endpoints( ) -> None: """ Clean endpoints that are supplied. 
For any endpoints that fail this validation - process, raise a message that broken endpoints are being stored + process, raise a message that broken endpoints are being stored. """ for endpoint in endpoints: try: @@ -132,50 +91,206 @@ def clean_unsaved_endpoints( except ValidationError as e: logger.warning("DefectDojo is storing broken endpoint because cleaning wasn't successful: %s", e) - def chunk_endpoints_and_reactivate( - self, - endpoint_status_list: list[Endpoint_Status], - **kwargs: dict, - ) -> None: - dojo_dispatch_task(EndpointManager.reactivate_endpoint_status, endpoint_status_list, sync=True) + def record_endpoint(self, endpoint: Endpoint) -> EndpointUniqueKey: + """Record an endpoint for later batch creation. Returns the unique key.""" + key = self._make_endpoint_unique_tuple( + protocol=endpoint.protocol, + userinfo=endpoint.userinfo, + host=endpoint.host, + port=endpoint.port, + path=endpoint.path, + query=endpoint.query, + fragment=endpoint.fragment, + product_id=self._product.id, + ) + if key not in self._endpoints_to_create: + self._endpoints_to_create[key] = { + "protocol": endpoint.protocol, + "userinfo": endpoint.userinfo, + "host": endpoint.host, + "port": endpoint.port, + "path": endpoint.path, + "query": endpoint.query, + "fragment": endpoint.fragment, + "product": self._product, + } + return key - def chunk_endpoints_and_mitigate( - self, - endpoint_status_list: list[Endpoint_Status], - user: Dojo_User, - **kwargs: dict, - ) -> None: - dojo_dispatch_task(EndpointManager.mitigate_endpoint_status, endpoint_status_list, user, sync=True) + def record_status_for_create(self, finding: Finding, key: EndpointUniqueKey) -> None: + """Record that a finding should be linked to an endpoint (identified by key) via Endpoint_Status.""" + self._statuses_to_create.append((finding, key)) + + @staticmethod + def get_non_special_endpoint_statuses(finding: Finding) -> list[Endpoint_Status]: + """ + Return endpoint statuses that are not false_positive, 
out_of_scope, or risk_accepted. + + Uses finding.status_finding.all() which is served from the prefetch cache when the + finding was loaded through build_candidate_scope_queryset, and falls back to a DB + query otherwise (e.g. for findings created during the same reimport batch). + + This might be ineffecient if lots of internal duplicates are in the report. + But this should be limited as most parsers dedupe during parsing and merge the endpoints. + """ + return [ + eps for eps in finding.status_finding.all() + if not eps.false_positive and not eps.out_of_scope and not eps.risk_accepted + ] def update_endpoint_status( self, existing_finding: Finding, new_finding: Finding, user: Dojo_User, - **kwargs: dict, ) -> None: - """Update the list of endpoints from the new finding with the list that is in the old finding""" - # New endpoints are already added in serializers.py / views.py (see comment "# for existing findings: make sure endpoints are present or created") - # So we only need to mitigate endpoints that are no longer present - # status_finding_non_special is prefetched by build_candidate_scope_queryset with the - # special-status exclusion and endpoint select_related already applied at the DB level - existing_finding_endpoint_status_list = existing_finding.status_finding_non_special + """ + Compare old/new endpoints and accumulate mitigate/reactivate lists. + + The actual bulk_update happens in persist(). 
+ """ + existing_finding_endpoint_status_list = self.get_non_special_endpoint_statuses(existing_finding) new_finding_endpoints_list = new_finding.unsaved_endpoints if new_finding.is_mitigated: # New finding is mitigated, so mitigate all old endpoints - endpoint_status_to_mitigate = existing_finding_endpoint_status_list + self._statuses_to_mitigate.extend(existing_finding_endpoint_status_list) else: # Convert to set for O(1) lookups instead of O(n) linear search new_finding_endpoints_set = set(new_finding_endpoints_list) # Mitigate any endpoints in the old finding not found in the new finding - endpoint_status_to_mitigate = [ + self._statuses_to_mitigate.extend( eps for eps in existing_finding_endpoint_status_list if eps.endpoint not in new_finding_endpoints_set - ] + ) # Re-activate any endpoints in the old finding that are in the new finding - endpoint_status_to_reactivate = [ + self._statuses_to_reactivate.extend( eps for eps in existing_finding_endpoint_status_list if eps.endpoint in new_finding_endpoints_set + ) + + def record_statuses_to_reactivate(self, statuses: list[Endpoint_Status]) -> None: + """Accumulate endpoint statuses for bulk reactivation in persist().""" + self._statuses_to_reactivate.extend(statuses) + + def record_statuses_to_mitigate(self, statuses: list[Endpoint_Status]) -> None: + """Accumulate endpoint statuses for bulk mitigation in persist().""" + self._statuses_to_mitigate.extend(statuses) + + def get_or_create_endpoints(self) -> tuple[dict[EndpointUniqueKey, Endpoint], list[Endpoint]]: + """ + For each queued endpoint record, fetch the existing DB row or bulk_create a new one. 
+ + Returns: + (endpoints_by_key, created) where: + - endpoints_by_key maps each EndpointUniqueKey to its Endpoint object (existing or new) + - created is the list of Endpoint objects that were actually inserted into the DB + + """ + if not self._endpoints_to_create: + return {}, [] + + endpoints_by_key: dict[EndpointUniqueKey, Endpoint] = {} + + with transaction.atomic(): + # Fetch all existing endpoints for this product + for ep in ( + Endpoint.objects.filter(product=self._product) + .only("id", "protocol", "userinfo", "host", "port", "path", "query", "fragment", "product_id") + .order_by("id") + .iterator() + ): + key = self._make_endpoint_unique_tuple( + protocol=ep.protocol, + userinfo=ep.userinfo, + host=ep.host, + port=ep.port, + path=ep.path, + query=ep.query, + fragment=ep.fragment, + product_id=ep.product_id, + ) + # First-by-id wins, matching endpoint_get_or_create behavior + if key not in endpoints_by_key: + endpoints_by_key[key] = ep + + # Determine which endpoints still need creating + to_create = [] + to_create_keys = [] + for key, kwargs in self._endpoints_to_create.items(): + if key not in endpoints_by_key: + to_create.append(Endpoint(**kwargs)) + to_create_keys.append(key) + + created: list[Endpoint] = [] + if to_create: + created = Endpoint.objects.bulk_create(to_create, batch_size=1000) + endpoints_by_key.update(zip(to_create_keys, created, strict=True)) + # bulk_create bypasses post_save signals, so manually trigger tag inheritance + # this is not ideal, but we need to take a separate look at the tag inheritance feature itself later + for ep in created: + inherit_instance_tags(ep) + + self._endpoints_to_create.clear() + return endpoints_by_key, created + + def persist(self, user: Dojo_User | None = None) -> None: + """ + Persist all accumulated endpoint operations to the database. + + Called at batch boundaries during import/reimport. 
+ """ + # Step 1: Ensure all recorded endpoints exist in DB + endpoints_by_key, _ = self.get_or_create_endpoints() + + # Step 2: Bulk-create Endpoint_Status rows + if self._statuses_to_create: + rows = [ + Endpoint_Status( + finding=finding, + endpoint=endpoints_by_key[key], + date=finding.date, + ) + for finding, key in self._statuses_to_create + if key in endpoints_by_key ] - self.chunk_endpoints_and_reactivate(endpoint_status_to_reactivate) - self.chunk_endpoints_and_mitigate(endpoint_status_to_mitigate, user) + if rows: + Endpoint_Status.objects.bulk_create(rows, ignore_conflicts=True, batch_size=1000) + self._statuses_to_create.clear() + + # Step 3: Bulk-update mitigated endpoint statuses + if self._statuses_to_mitigate: + now = timezone.now() + to_update = [] + for endpoint_status in self._statuses_to_mitigate: + if endpoint_status.mitigated is False: + endpoint_status.mitigated_time = now + endpoint_status.last_modified = now + endpoint_status.mitigated_by = user + endpoint_status.mitigated = True + to_update.append(endpoint_status) + if to_update: + Endpoint_Status.objects.bulk_update( + to_update, + ["mitigated", "mitigated_time", "last_modified", "mitigated_by"], + batch_size=1000, + ) + self._statuses_to_mitigate.clear() + + # Step 4: Bulk-update reactivated endpoint statuses + if self._statuses_to_reactivate: + now = timezone.now() + to_update = [] + for endpoint_status in self._statuses_to_reactivate: + if endpoint_status.mitigated: + logger.debug("Re-import: reactivating endpoint %s that is present in this scan", endpoint_status.endpoint) + endpoint_status.mitigated_by = None + endpoint_status.mitigated_time = None + endpoint_status.mitigated = False + endpoint_status.last_modified = now + to_update.append(endpoint_status) + if to_update: + Endpoint_Status.objects.bulk_update( + to_update, + ["mitigated", "mitigated_time", "mitigated_by", "last_modified"], + batch_size=1000, + ) + self._statuses_to_reactivate.clear() diff --git 
a/unittests/scans/stackhawk/stackhawk_two_vul_same_hashcode_fabricated.json b/unittests/scans/stackhawk/stackhawk_two_vul_same_hashcode_fabricated.json new file mode 100644 index 00000000000..ab9ed464215 --- /dev/null +++ b/unittests/scans/stackhawk/stackhawk_two_vul_same_hashcode_fabricated.json @@ -0,0 +1,77 @@ +{ + "service": "StackHawk", + "scanCompleted": { + "scan": { + "id": "f3aa6752-8fef-58fa-c854-1d3g8e972f38", + "hawkscanVersion": "2.1.1", + "env": "Development", + "status": "COMPLETED", + "application": "Secured Application", + "startedTimestamp": "2022-02-16T23:07:19.575Z", + "scanURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38" + }, + "scanDuration": "21", + "spiderDuration": "45", + "completedScanStats": { + "urlsCount": "2", + "duration": "66", + "scanResultsStats": { + "totalCount": "2", + "lowCount": "0", + "mediumCount": "0", + "highCount": "2", + "lowTriagedCount": "0", + "mediumTriagedCount": "0", + "highTriagedCount": "0" + } + }, + "findings": [ + { + "pluginId": "90001", + "pluginName": "Insecure JSF ViewState", + "severity": "High", + "host": "https://localhost:9000", + "paths": [ + { + "path": "/app/login", + "method": "GET", + "status": "NEW", + "pathURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38/finding/90001/path/1/message/1" + } + ], + "pathStats": [ + { + "status": "NEW", + "count": 1 + } + ], + "totalCount": "1", + "category": "Security Misconfiguration", + "findingURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38/finding/90001" + }, + { + "pluginId": "90002", + "pluginName": "Insecure HTTP Header", + "severity": "High", + "host": "https://localhost:9000", + "paths": [ + { + "path": "/app/dashboard", + "method": "GET", + "status": "NEW", + "pathURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38/finding/90002/path/1/message/1" + } + ], + "pathStats": [ + { + "status": "NEW", + "count": 1 + } + ], + "totalCount": "1", + 
"category": "Security Misconfiguration", + "findingURL": "https://app.stackhawk.com/scans/f3aa6752-8fef-58fa-c854-1d3g8e972f38/finding/90002" + } + ] + } +} diff --git a/unittests/test_endpoint_manager.py b/unittests/test_endpoint_manager.py new file mode 100644 index 00000000000..af7fa8ccb47 --- /dev/null +++ b/unittests/test_endpoint_manager.py @@ -0,0 +1,83 @@ +from django.test import TestCase + +from dojo.importers.endpoint_manager import EndpointManager, EndpointUniqueKey + + +class TestMakeEndpointUniqueTuple(TestCase): + + """Tests for EndpointManager._make_endpoint_unique_tuple normalization.""" + + def _make(self, **kwargs): + defaults = { + "protocol": None, + "userinfo": None, + "host": None, + "port": None, + "path": None, + "query": None, + "fragment": None, + "product_id": 1, + } + defaults.update(kwargs) + return EndpointManager._make_endpoint_unique_tuple(**defaults) + + def test_protocol_case_insensitive(self): + a = self._make(protocol="HTTP", host="example.com") + b = self._make(protocol="http", host="example.com") + self.assertEqual(a, b) + + def test_host_case_insensitive(self): + a = self._make(host="Example.COM") + b = self._make(host="example.com") + self.assertEqual(a, b) + + def test_default_port_normalized_to_none_https(self): + a = self._make(protocol="https", host="example.com", port=443) + b = self._make(protocol="https", host="example.com", port=None) + self.assertEqual(a, b) + self.assertIsNone(a.port) + + def test_default_port_normalized_to_none_http(self): + a = self._make(protocol="http", host="example.com", port=80) + b = self._make(protocol="http", host="example.com", port=None) + self.assertEqual(a, b) + + def test_non_default_port_preserved(self): + a = self._make(protocol="https", host="example.com", port=8443) + b = self._make(protocol="https", host="example.com", port=None) + self.assertNotEqual(a, b) + self.assertEqual(a.port, 8443) + + def test_none_fields_handled(self): + key = self._make() + 
self.assertIsNone(key.protocol) + self.assertIsNone(key.host) + self.assertIsNone(key.port) + self.assertIsNone(key.path) + + def test_empty_string_fields_normalized_to_none(self): + key = self._make(userinfo="", path="", query="", fragment="") + self.assertIsNone(key.userinfo) + self.assertIsNone(key.path) + self.assertIsNone(key.query) + self.assertIsNone(key.fragment) + + def test_different_products_different_keys(self): + a = self._make(host="example.com", product_id=1) + b = self._make(host="example.com", product_id=2) + self.assertNotEqual(a, b) + + def test_returns_named_tuple(self): + key = self._make(protocol="https", host="example.com", path="/api") + self.assertIsInstance(key, EndpointUniqueKey) + self.assertEqual(key.protocol, "https") + self.assertEqual(key.host, "example.com") + self.assertEqual(key.path, "/api") + + def test_port_without_known_protocol_preserved(self): + key = self._make(protocol="custom", host="example.com", port=9999) + self.assertEqual(key.port, 9999) + + def test_port_none_without_known_protocol(self): + key = self._make(protocol="custom", host="example.com", port=None) + self.assertIsNone(key.port) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 665522c0de3..a3dfd37074e 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -269,12 +269,12 @@ def test_import_reimport_reimport_performance_pghistory_async(self): configure_pghistory_triggers() self._import_reimport_performance( - expected_num_queries1=295, - expected_num_async_tasks1=6, - expected_num_queries2=226, - expected_num_async_tasks2=17, - expected_num_queries3=108, - expected_num_async_tasks3=16, + expected_num_queries1=139, + expected_num_async_tasks1=1, + expected_num_queries2=115, + expected_num_async_tasks2=1, + expected_num_queries3=29, + expected_num_async_tasks3=1, ) @override_settings(ENABLE_AUDITLOG=True) @@ -291,12 +291,12 @@ def 
test_import_reimport_reimport_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._import_reimport_performance( - expected_num_queries1=302, - expected_num_async_tasks1=6, - expected_num_queries2=233, - expected_num_async_tasks2=17, - expected_num_queries3=115, - expected_num_async_tasks3=16, + expected_num_queries1=146, + expected_num_async_tasks1=1, + expected_num_queries2=122, + expected_num_async_tasks2=1, + expected_num_queries3=36, + expected_num_async_tasks3=1, ) @override_settings(ENABLE_AUDITLOG=True) @@ -314,12 +314,12 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self.system_settings(enable_product_grade=True) self._import_reimport_performance( - expected_num_queries1=309, - expected_num_async_tasks1=8, - expected_num_queries2=240, - expected_num_async_tasks2=19, - expected_num_queries3=119, - expected_num_async_tasks3=18, + expected_num_queries1=153, + expected_num_async_tasks1=3, + expected_num_queries2=129, + expected_num_async_tasks2=3, + expected_num_queries3=40, + expected_num_async_tasks3=3, ) # Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results. 
@@ -437,10 +437,10 @@ def test_deduplication_performance_pghistory_async(self): self.system_settings(enable_deduplication=True) self._deduplication_performance( - expected_num_queries1=264, - expected_num_async_tasks1=7, - expected_num_queries2=175, - expected_num_async_tasks2=7, + expected_num_queries1=74, + expected_num_async_tasks1=1, + expected_num_queries2=69, + expected_num_async_tasks2=1, check_duplicates=False, # Async mode - deduplication happens later ) @@ -458,10 +458,10 @@ def test_deduplication_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=271, - expected_num_async_tasks1=7, - expected_num_queries2=183, - expected_num_async_tasks2=7, + expected_num_queries1=81, + expected_num_async_tasks1=1, + expected_num_queries2=77, + expected_num_async_tasks2=1, ) diff --git a/unittests/test_reimport_prefetch.py b/unittests/test_reimport_prefetch.py new file mode 100644 index 00000000000..d421b0129dc --- /dev/null +++ b/unittests/test_reimport_prefetch.py @@ -0,0 +1,242 @@ +""" +Regression tests for handling duplicate findings within the same reimport report. + +When a scan report contains two findings that produce the same hash_code, the first +creates a new finding via process_finding_that_was_not_matched() and gets added to +the candidate dictionaries. When the second finding in the same batch matches against +this newly-created finding, accessing existing_finding.status_finding_non_special +raises AttributeError because the finding was never loaded through +build_candidate_scope_queryset (which previously set up the Prefetch with to_attr). 
+ +Bugfix: https://github.com/DefectDojo/django-DefectDojo/pull/14569 +Batch endpoint optimization (related): https://github.com/DefectDojo/django-DefectDojo/pull/14489 +""" + +from crum import impersonate +from django.conf import settings +from django.test import override_settings +from django.utils import timezone + +from dojo.importers.default_reimporter import DefaultReImporter +from dojo.location.models import LocationFindingReference +from dojo.location.status import FindingLocationStatus +from dojo.models import ( + Development_Environment, + Dojo_User, + Endpoint, + Endpoint_Status, + Engagement, + Finding, + Product, + Product_Type, + Test, + Test_Type, + User, + UserContactInfo, +) + +from .dojo_test_case import DojoTestCase, get_unit_tests_scans_path, skip_unless_v2 + +SCAN_TYPE = "StackHawk HawkScan" + +# Reimport: two findings with different pluginIds (so the parser keeps both as separate +# findings) but identical hash_code fields (component_name + component_version). +# Each finding has one unique endpoint/location path (/app/login and /app/dashboard). +# This triggers the same-batch matching scenario where the second finding matches +# against the first one that was just created in the same reimport batch. +REIMPORT_SCAN = get_unit_tests_scans_path("stackhawk") / "stackhawk_two_vul_same_hashcode_fabricated.json" + + +def _hashcode_fields_without_vuln_id(): + """ + Return a copy of HASHCODE_FIELDS_PER_SCANNER with StackHawk configured to + hash only on component_name + component_version (excluding vuln_id_from_tool). + + By default StackHawk hashes on [vuln_id_from_tool, component_name, component_version]. + Since vuln_id_from_tool maps to pluginId and the parser deduplicates by pluginId, + it is normally impossible for two separate findings to share a hash_code. 
+ + By removing vuln_id_from_tool from the hash fields, two findings with different + pluginIds but the same application/env metadata will produce the same hash_code, + allowing us to exercise the same-batch matching code path. + """ + fields = dict(settings.HASHCODE_FIELDS_PER_SCANNER) + fields[SCAN_TYPE] = ["component_name", "component_version"] + return fields + + +class ReimportDuplicateFindingsTestBase(DojoTestCase): + + """Shared setup for duplicate-findings-in-same-report tests.""" + + def setUp(self): + super().setUp() + testuser, _ = User.objects.get_or_create(username="admin") + UserContactInfo.objects.get_or_create(user=testuser, defaults={"block_execution": True}) + + self.system_settings(enable_deduplication=True) + self.system_settings(enable_product_grade=False) + + product_type, _ = Product_Type.objects.get_or_create(name="test") + product, _ = Product.objects.get_or_create( + name="ReimportPrefetchTest", + description="Test", + prod_type=product_type, + ) + engagement, _ = Engagement.objects.get_or_create( + name="Test Engagement", + product=product, + target_start=timezone.now(), + target_end=timezone.now(), + ) + self.lead, _ = User.objects.get_or_create(username="admin") + environment, _ = Development_Environment.objects.get_or_create(name="Development") + test_type, _ = Test_Type.objects.get_or_create(name=SCAN_TYPE) + self.test = Test.objects.create( + engagement=engagement, + test_type=test_type, + scan_type=SCAN_TYPE, + target_start=timezone.now(), + target_end=timezone.now(), + environment=environment, + ) + self.product = product + + def _reimport_with_overridden_hashcode(self): + """Reimport the two-finding scan file with overridden hash settings.""" + hashcode_override = _hashcode_fields_without_vuln_id() + with ( + impersonate(Dojo_User.objects.get(username="admin")), + override_settings(HASHCODE_FIELDS_PER_SCANNER=hashcode_override), + REIMPORT_SCAN.open(encoding="utf-8") as scan, + ): + reimporter = DefaultReImporter( + 
test=self.test, + user=self.lead, + lead=self.lead, + scan_date=None, + minimum_severity="Info", + active=True, + verified=True, + sync=True, + scan_type=SCAN_TYPE, + ) + return reimporter.process_scan(scan) + + +@skip_unless_v2 +class TestReimportDuplicateFindingsEndpointHandling(ReimportDuplicateFindingsTestBase): + + """Regression test: reimport must handle endpoints correctly for duplicate findings in the same report.""" + + def test_reimport_duplicate_findings_in_same_report_endpoints(self): + """ + Reimporting a report with two findings that share the same hash_code must + not raise AttributeError on status_finding_non_special, and must correctly + create endpoints for both findings. + + The scan file has two findings with different pluginIds, each with one + unique endpoint (/app/login and /app/dashboard). With the overridden hash + settings, both produce the same hash_code. The first finding is created as + new and added to candidates. The second finding matches against it (a + batch-created finding), and its endpoint is added via finding_post_processing. + """ + # This previously raised: + # AttributeError: 'Finding' object has no attribute 'status_finding_non_special' + _test, _, len_new, len_closed, _, _, _ = self._reimport_with_overridden_hashcode() + + # The first finding is new (empty test, no candidates). The second finding + # matches the first (same hash_code) so it is not counted as new. + self.assertEqual(len_new, 1, "Reimport should create one new finding (second matches first)") + self.assertEqual(len_closed, 0, "No findings should be closed") + + # Only one finding should exist — the second was matched to the first + findings = Finding.objects.filter(test=self.test) + self.assertEqual(findings.count(), 1, "Only one finding should exist after reimport") + + finding = findings.first() + + # The first finding (new, pluginId=90001) creates endpoint /app/login. + # The second finding (pluginId=90002) matches the first via hash_code. 
+ # finding_post_processing runs for BOTH iterations, adding each finding's + # endpoints to the (single) matched finding. So the finding ends up with + # 2 endpoints: /app/login (from the first) and /app/dashboard (from the second). + endpoints = Endpoint.objects.filter(product=self.product) + self.assertEqual(endpoints.count(), 2, "Two endpoints should be created (one per finding in report)") + + endpoint_statuses = Endpoint_Status.objects.filter(finding=finding) + self.assertEqual(endpoint_statuses.count(), 2, "Finding should have two endpoint statuses") + + # With batched endpoint creation, endpoint statuses for the batch-created + # finding are not yet persisted when update_endpoint_status() runs. So the + # comparison finds no existing statuses and nothing gets mitigated — both + # endpoint statuses remain active. This is actually the correct outcome: + # both endpoints came from the same report and should both be active. + self.assertEqual( + endpoint_statuses.filter(mitigated=False).count(), + 2, + "Both endpoint statuses should be active (batch-created finding has no persisted " + "statuses when update_endpoint_status runs, so nothing gets mitigated)", + ) + + +@override_settings(V3_FEATURE_LOCATIONS=True) +class TestReimportDuplicateFindingsLocationHandling(ReimportDuplicateFindingsTestBase): + + """Test that reimport handles locations correctly for duplicate findings in the same report.""" + + def test_reimport_duplicate_findings_in_same_report_locations(self): + """ + Reimporting a report with two findings that share the same hash_code must + correctly create locations for both findings. + + The locations code does not use the to_attr prefetch that caused the + AttributeError in the endpoint code path, so it does not crash. However, + the same logical issue applies: update_location_status() compares the + batch-created finding's locations against the second finding's unsaved + locations, mitigating locations that are "no longer present". 
+ + The scan file has two findings with different pluginIds, each with one + unique location (https://localhost:9000/app/login and .../app/dashboard). + With the overridden hash settings, both produce the same hash_code. The + first finding is created as new and added to candidates. The second finding + matches against it (a batch-created finding). + """ + _test, _, len_new, len_closed, _, _, _ = self._reimport_with_overridden_hashcode() + + # The first finding is new (empty test, no candidates). The second finding + # matches the first (same hash_code) so it is not counted as new. + self.assertEqual(len_new, 1, "Reimport should create one new finding (second matches first)") + self.assertEqual(len_closed, 0, "No findings should be closed") + + # Only one finding should exist — the second was matched to the first + findings = Finding.objects.filter(test=self.test) + self.assertEqual(findings.count(), 1, "Only one finding should exist after reimport") + + finding = findings.first() + + # The first finding (new, pluginId=90001) creates location for /app/login. + # The second finding (pluginId=90002) matches the first via hash_code. + # finding_post_processing runs for BOTH iterations, adding each finding's + # locations to the (single) matched finding. So the finding ends up with + # 2 location references pointing to 2 distinct Location objects. + location_refs = LocationFindingReference.objects.filter(finding=finding) + self.assertEqual(location_refs.count(), 2, "Finding should have two location references") + + # Same behavior as endpoints: update_location_status() compares the first + # finding's existing location refs against the second finding's unsaved_locations. + # The second finding only has /app/dashboard, so /app/login (which belongs to + # the first finding) is considered "no longer present" and gets mitigated. + # This is arguably wrong for batch-created findings — both locations came from + # the same report — but it is the current behavior. 
+ self.assertEqual( + location_refs.filter(status=FindingLocationStatus.Active).count(), + 1, + "One location ref should be active (/app/dashboard from the matched finding)", + ) + self.assertEqual( + location_refs.filter(status=FindingLocationStatus.Mitigated).count(), + 1, + "One location ref should be mitigated (/app/login — mitigated by update_location_status " + "because it is not in the second finding's location list)", + )