diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py
index e42eae0f4a2..052b006356c 100644
--- a/dojo/importers/base_importer.py
+++ b/dojo/importers/base_importer.py
@@ -1,7 +1,6 @@
 import base64
 import logging
 import time
-from collections.abc import Iterable
 
 from django.conf import settings
 from django.core.exceptions import ValidationError
@@ -337,59 +336,28 @@ def update_test_tags(self):
         if self.tags is not None and len(self.tags) > 0:
             self.test.tags.set(self.tags)
 
-    def apply_import_tags(
-        self,
-        new_findings: Iterable[Finding] | None = None,
-        closed_findings: Iterable[Finding] | None = None,
-        reactivated_findings: Iterable[Finding] | None = None,
-        untouched_findings: Iterable[Finding] | None = None,
-    ) -> None:
-        """Apply tags to findings and endpoints from an import operation."""
-        # Normalize None values to empty lists and convert sets/other iterables to lists
-        if untouched_findings is None:
-            untouched_findings = []
-        elif not isinstance(untouched_findings, list):
-            untouched_findings = list(untouched_findings)
-
-        if reactivated_findings is None:
-            reactivated_findings = []
-        elif not isinstance(reactivated_findings, list):
-            reactivated_findings = list(reactivated_findings)
-
-        if closed_findings is None:
-            closed_findings = []
-        elif not isinstance(closed_findings, list):
-            closed_findings = list(closed_findings)
-
-        if new_findings is None:
-            new_findings = []
-        elif not isinstance(new_findings, list):
-            new_findings = list(new_findings)
-
-        # Collect all affected findings
-        findings_to_tag = new_findings + closed_findings + reactivated_findings + untouched_findings
+    def apply_import_tags_for_batch(self, findings: list[Finding]) -> None:
+        """
+        Apply import-time tags to a batch of already-saved findings and their endpoints.
 
-        if not findings_to_tag:
+        Called per batch inside process_findings(), before post_process_findings_batch is
+        dispatched, so that rules/deduplication tasks see the import tags on the findings.
+        """
+        if not findings or not self.tags:
             return
-
-        # Add any tags to the findings imported if necessary
-        if self.apply_tags_to_findings and self.tags:
-            findings_qs = Finding.objects.filter(id__in=[f.id for f in findings_to_tag])
+        if self.apply_tags_to_findings:
             try:
                 bulk_add_tags_to_instances(
                     tag_or_tags=self.tags,
-                    instances=findings_qs,
+                    instances=findings,
                     tag_field_name="tags",
                 )
             except IntegrityError:
-                # Fallback to safe per-instance tagging if concurrent deletes occur
-                for finding in findings_to_tag:
+                for finding in findings:
                     for tag in self.tags:
                         self.add_tags_safe(finding, tag)
-
-        # Add any tags to any locations/endpoints of the findings imported if necessary
-        if self.apply_tags_to_endpoints and self.tags:
-            locations_qs = self.location_handler.get_locations_for_tagging(findings_to_tag)
+        if self.apply_tags_to_endpoints:
+            locations_qs = self.location_handler.get_locations_for_tagging(findings)
             try:
                 bulk_add_tags_to_instances(
                     tag_or_tags=self.tags,
@@ -397,7 +365,7 @@ def apply_import_tags(
                     tag_field_name="tags",
                 )
             except IntegrityError:
-                for finding in findings_to_tag:
+                for finding in findings:
                     for location in self.location_handler.get_location_tag_fallback(finding):
                         for tag in self.tags:
                             self.add_tags_safe(location, tag)
diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py
index 25d41c2b5cb..428b8ad01a7 100644
--- a/dojo/importers/default_importer.py
+++ b/dojo/importers/default_importer.py
@@ -132,11 +132,6 @@ def process_scan(
             new_findings=new_findings,
             closed_findings=closed_findings,
         )
-        # Apply tags to findings and endpoints/locations
-        self.apply_import_tags(
-            new_findings=new_findings,
-            closed_findings=closed_findings,
-        )
         # Send out some notifications to the user
         logger.debug("IMPORT_SCAN: Generating notifications")
         dojo_dispatch_task(
@@ -169,6 +164,7 @@ def process_findings(
     ) -> list[Finding]:
         # Batched post-processing (no chord): dispatch a task per 1000 findings or on final finding
         batch_finding_ids: list[int] = []
+        batch_findings: list[Finding] = []
         batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000)
 
         """
@@ -259,6 +255,7 @@ def process_findings(
             push_to_jira = self.push_to_jira and ((not self.findings_groups_enabled or not self.group_by) or not finding_will_be_grouped)
             logger.debug("process_findings: computed push_to_jira=%s", push_to_jira)
             batch_finding_ids.append(finding.id)
+            batch_findings.append(finding)
 
             # If batch is full or we're at the end, persist locations/endpoints and dispatch
             if len(batch_finding_ids) >= batch_max_size or is_final_finding:
@@ -267,6 +264,9 @@ def process_findings(
                 # so rules/deduplication tasks see the tags already on the findings.
                 bulk_apply_parser_tags(findings_with_parser_tags)
                 findings_with_parser_tags.clear()
+                # Apply import-time tags before post-processing so rules/deduplication see them.
+                self.apply_import_tags_for_batch(batch_findings)
+                batch_findings.clear()
                 finding_ids_batch = list(batch_finding_ids)
                 batch_finding_ids.clear()
                 logger.debug("process_findings: dispatching batch with push_to_jira=%s (batch_size=%d, is_final=%s)",
diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py
index 06f06ca368f..3df3c0bd4cc 100644
--- a/dojo/importers/default_reimporter.py
+++ b/dojo/importers/default_reimporter.py
@@ -137,13 +137,6 @@ def process_scan(
             reactivated_findings=reactivated_findings,
             untouched_findings=untouched_findings,
         )
-        # Apply tags to findings and endpoints
-        self.apply_import_tags(
-            new_findings=new_findings,
-            closed_findings=closed_findings,
-            reactivated_findings=reactivated_findings,
-            untouched_findings=untouched_findings,
-        )
         # Send out som notifications to the user
         logger.debug("REIMPORT_SCAN: Generating notifications")
         updated_count = (
@@ -173,7 +166,7 @@ def process_scan(
 
     def get_reimport_match_candidates_for_batch(
         self,
-        batch_findings: list[Finding],
+        unsaved_findings_batch: list[Finding],
     ) -> tuple[dict, dict, dict]:
         """
         Fetch candidate matches for a batch of *unsaved* findings during reimport.
@@ -195,23 +188,23 @@ def get_reimport_match_candidates_for_batch(
         if self.deduplication_algorithm == "hash_code":
             candidates_by_hash = find_candidates_for_deduplication_hash(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
         elif self.deduplication_algorithm == "unique_id_from_tool":
             candidates_by_uid = find_candidates_for_deduplication_unique_id(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
        elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code":
             candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
         elif self.deduplication_algorithm == "legacy":
-            candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings)
+            candidates_by_key = find_candidates_for_reimport_legacy(self.test, unsaved_findings_batch)
 
         return candidates_by_hash, candidates_by_uid, candidates_by_key
 
@@ -308,6 +301,7 @@ def process_findings(
                 cleaned_findings.append(sanitized)
 
         batch_finding_ids: list[int] = []
+        batch_findings: list[Finding] = []
         findings_with_parser_tags: list[tuple] = []
         # Batch size for deduplication/post-processing (only new findings)
         dedupe_batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000)
@@ -318,13 +312,13 @@ def process_findings(
         # This avoids the 1+N query problem by fetching all candidates for a batch at once
         for batch_start in range(0, len(cleaned_findings), match_batch_max_size):
             batch_end = min(batch_start + match_batch_max_size, len(cleaned_findings))
-            batch_findings = cleaned_findings[batch_start:batch_end]
+            unsaved_findings_batch = cleaned_findings[batch_start:batch_end]
             is_final_batch = batch_end == len(cleaned_findings)
 
             logger.debug(f"Processing reimport batch {batch_start}-{batch_end} of {len(cleaned_findings)} findings")
 
             # Prepare findings in batch: set test, service, calculate hash codes
-            for unsaved_finding in batch_findings:
+            for unsaved_finding in unsaved_findings_batch:
                 # Some parsers provide "mitigated" field but do not set timezone (because they are probably not available in the report)
                 # Finding.mitigated is DateTimeField and it requires timezone
                 if unsaved_finding.mitigated and not unsaved_finding.mitigated.tzinfo:
@@ -342,12 +336,12 @@ def process_findings(
 
             # Fetch all candidates for this batch at once (batch candidate finding)
             candidates_by_hash, candidates_by_uid, candidates_by_key = self.get_reimport_match_candidates_for_batch(
-                batch_findings,
+                unsaved_findings_batch,
             )
 
             # Process each finding in the batch using pre-fetched candidates
-            for idx, unsaved_finding in enumerate(batch_findings):
-                is_final = is_final_batch and idx == len(batch_findings) - 1
+            for idx, unsaved_finding in enumerate(unsaved_findings_batch):
+                is_final = is_final_batch and idx == len(unsaved_findings_batch) - 1
 
                 # Match any findings to this new one coming in using pre-fetched candidates
                 matched_findings = self.match_finding_to_candidate_reimport(
@@ -403,6 +397,7 @@ def process_findings(
                 # all data is already saved on the finding, we only need to trigger post processing in batches
                 push_to_jira = self.push_to_jira and ((not self.findings_groups_enabled or not self.group_by) or not finding_will_be_grouped)
                 batch_finding_ids.append(finding.id)
+                batch_findings.append(finding)
 
                 # Post-processing batches (deduplication, rules, etc.) are separate from matching batches.
                 # These batches only contain "new" findings that were saved (not matched to existing findings).
@@ -425,6 +420,9 @@ def process_findings(
                     # so rules/deduplication tasks see the tags already on the findings.
                     bulk_apply_parser_tags(findings_with_parser_tags)
                     findings_with_parser_tags.clear()
+                    # Apply import-time tags before post-processing so rules/deduplication see them.
+                    self.apply_import_tags_for_batch(batch_findings)
+                    batch_findings.clear()
                     finding_ids_batch = list(batch_finding_ids)
                     batch_finding_ids.clear()
                     dojo_dispatch_task(
diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py
index c72a9b3c890..297f8733143 100644
--- a/unittests/test_importers_performance.py
+++ b/unittests/test_importers_performance.py
@@ -343,9 +343,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
         configure_pghistory_triggers()
 
         self._import_reimport_performance(
-            expected_num_queries1=172,
+            expected_num_queries1=171,
             expected_num_async_tasks1=2,
-            expected_num_queries2=125,
+            expected_num_queries2=124,
             expected_num_async_tasks2=1,
             expected_num_queries3=29,
             expected_num_async_tasks3=1,
@@ -367,9 +367,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
         testuser.usercontactinfo.save()
 
         self._import_reimport_performance(
-            expected_num_queries1=188,
+            expected_num_queries1=187,
             expected_num_async_tasks1=2,
-            expected_num_queries2=133,
+            expected_num_queries2=132,
             expected_num_async_tasks2=1,
             expected_num_queries3=37,
             expected_num_async_tasks3=1,
@@ -392,9 +392,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
         self.system_settings(enable_product_grade=True)
 
         self._import_reimport_performance(
-            expected_num_queries1=198,
+            expected_num_queries1=197,
             expected_num_async_tasks1=4,
-            expected_num_queries2=143,
+            expected_num_queries2=142,
             expected_num_async_tasks2=3,
             expected_num_queries3=44,
             expected_num_async_tasks3=3,
@@ -633,9 +633,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
         configure_pghistory_triggers()
 
         self._import_reimport_performance(
-            expected_num_queries1=179,
+            expected_num_queries1=178,
             expected_num_async_tasks1=2,
-            expected_num_queries2=134,
+            expected_num_queries2=133,
             expected_num_async_tasks2=1,
             expected_num_queries3=37,
             expected_num_async_tasks3=1,
@@ -657,9 +657,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
         testuser.usercontactinfo.save()
 
         self._import_reimport_performance(
-            expected_num_queries1=197,
+            expected_num_queries1=196,
             expected_num_async_tasks1=2,
-            expected_num_queries2=144,
+            expected_num_queries2=143,
             expected_num_async_tasks2=1,
             expected_num_queries3=47,
             expected_num_async_tasks3=1,
@@ -682,9 +682,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
         self.system_settings(enable_product_grade=True)
 
         self._import_reimport_performance(
-            expected_num_queries1=210,
+            expected_num_queries1=209,
             expected_num_async_tasks1=4,
-            expected_num_queries2=157,
+            expected_num_queries2=156,
             expected_num_async_tasks2=3,
             expected_num_queries3=54,
             expected_num_async_tasks3=3,
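
Reviewer note: the sketch below is a minimal, self-contained illustration of the batch-then-flush pattern this change introduces, not DefectDojo's actual code. Finding, apply_import_tags_for_batch, dispatch_post_processing, and BATCH_MAX_SIZE here are hypothetical stand-ins; the point is only the ordering, matching the comment in the diff that import tags must land on each batch before post-processing is dispatched so rules/deduplication tasks already see them.

# sketch_batched_import_tags.py -- illustrative stand-ins only, not DefectDojo's API
from dataclasses import dataclass, field

BATCH_MAX_SIZE = 1000  # stands in for settings.IMPORT_REIMPORT_DEDUPE_BATCH_SIZE


@dataclass
class Finding:
    id: int
    tags: set = field(default_factory=set)


def apply_import_tags_for_batch(batch, tags):
    # stand-in for the bulk tagging helper in base_importer.py
    for finding in batch:
        finding.tags.update(tags)


def dispatch_post_processing(finding_ids):
    # stand-in for dispatching the post-processing task for one batch
    print(f"post-processing dispatched for {len(finding_ids)} findings")


def process_findings(findings, tags):
    batch = []
    for index, finding in enumerate(findings):
        batch.append(finding)  # the real code appends only already-saved findings
        is_final = index == len(findings) - 1
        if len(batch) >= BATCH_MAX_SIZE or is_final:
            apply_import_tags_for_batch(batch, tags)  # tags land first...
            dispatch_post_processing([f.id for f in batch])  # ...then dispatch
            batch.clear()  # start the next batch empty


process_findings([Finding(i) for i in range(2500)], {"import-tag"})

Flushing per batch (here 1000 findings, mirroring IMPORT_REIMPORT_DEDUPE_BATCH_SIZE) keeps memory bounded on large reports while preserving the tags-before-dispatch ordering on every batch, including the final partial one.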
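A second behavior worth calling out is the bulk-tagging fallback that apply_import_tags_for_batch() keeps from the removed apply_import_tags(): attempt one set-based operation for the whole batch, and only on IntegrityError retry per instance. Below is a hedged sketch of just that control flow; tag_instances_with_fallback, bulk_add_tags, and add_tag_safe are illustrative stand-ins for bulk_add_tags_to_instances() and add_tags_safe() in the diff.

from django.db import IntegrityError


def tag_instances_with_fallback(instances, tags, bulk_add_tags, add_tag_safe):
    """Illustrative stand-in for the try/except in apply_import_tags_for_batch()."""
    try:
        # fast path: one bulk operation covering the whole batch
        bulk_add_tags(tag_or_tags=tags, instances=instances, tag_field_name="tags")
    except IntegrityError:
        # e.g. a concurrent delete voided the bulk insert; retry row by row
        # so one vanished instance does not fail the entire batch
        for instance in instances:
            for tag in tags:
                add_tag_safe(instance, tag)

The per-instance retry trades speed for resilience: a concurrent delete that invalidates the bulk insert costs one slower fallback pass instead of aborting the import.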