58 changes: 13 additions & 45 deletions dojo/importers/base_importer.py
@@ -1,7 +1,6 @@
 import base64
 import logging
 import time
-from collections.abc import Iterable
 
 from django.conf import settings
 from django.core.exceptions import ValidationError
@@ -337,67 +336,36 @@ def update_test_tags(self):
         if self.tags is not None and len(self.tags) > 0:
             self.test.tags.set(self.tags)
 
-    def apply_import_tags(
-        self,
-        new_findings: Iterable[Finding] | None = None,
-        closed_findings: Iterable[Finding] | None = None,
-        reactivated_findings: Iterable[Finding] | None = None,
-        untouched_findings: Iterable[Finding] | None = None,
-    ) -> None:
-        """Apply tags to findings and endpoints from an import operation."""
-        # Normalize None values to empty lists and convert sets/other iterables to lists
-        if untouched_findings is None:
-            untouched_findings = []
-        elif not isinstance(untouched_findings, list):
-            untouched_findings = list(untouched_findings)
-
-        if reactivated_findings is None:
-            reactivated_findings = []
-        elif not isinstance(reactivated_findings, list):
-            reactivated_findings = list(reactivated_findings)
-
-        if closed_findings is None:
-            closed_findings = []
-        elif not isinstance(closed_findings, list):
-            closed_findings = list(closed_findings)
-
-        if new_findings is None:
-            new_findings = []
-        elif not isinstance(new_findings, list):
-            new_findings = list(new_findings)
-
-        # Collect all affected findings
-        findings_to_tag = new_findings + closed_findings + reactivated_findings + untouched_findings
-
-        if not findings_to_tag:
+    def apply_import_tags_for_batch(self, findings: list[Finding]) -> None:
+        """
+        Apply import-time tags to a batch of already-saved findings and their endpoints.
+
+        Called per batch inside process_findings(), before post_process_findings_batch is
+        dispatched, so that rules/deduplication tasks see the import tags on the findings.
+        """
+        if not findings or not self.tags:
             return
 
-        # Add any tags to the findings imported if necessary
-        if self.apply_tags_to_findings and self.tags:
-            findings_qs = Finding.objects.filter(id__in=[f.id for f in findings_to_tag])
+        if self.apply_tags_to_findings:
             try:
                 bulk_add_tags_to_instances(
                     tag_or_tags=self.tags,
-                    instances=findings_qs,
+                    instances=findings,
                     tag_field_name="tags",
                 )
             except IntegrityError:
                 # Fallback to safe per-instance tagging if concurrent deletes occur
-                for finding in findings_to_tag:
+                for finding in findings:
                     for tag in self.tags:
                         self.add_tags_safe(finding, tag)
 
-        # Add any tags to any locations/endpoints of the findings imported if necessary
-        if self.apply_tags_to_endpoints and self.tags:
-            locations_qs = self.location_handler.get_locations_for_tagging(findings_to_tag)
+        if self.apply_tags_to_endpoints:
+            locations_qs = self.location_handler.get_locations_for_tagging(findings)
             try:
                 bulk_add_tags_to_instances(
                     tag_or_tags=self.tags,
                     instances=locations_qs,
                     tag_field_name="tags",
                 )
             except IntegrityError:
-                for finding in findings_to_tag:
+                for finding in findings:
                     for location in self.location_handler.get_location_tag_fallback(finding):
                         for tag in self.tags:
                             self.add_tags_safe(location, tag)
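The new method keeps the existing bulk-first, per-instance-fallback tagging strategy, but operates on the in-memory batch instead of re-querying by id. A minimal sketch of that pattern, with `bulk_add` and `tag_one` as hypothetical stand-ins for `bulk_add_tags_to_instances` and `add_tags_safe`:

```python
from django.db import IntegrityError

def tag_batch(tags, instances, bulk_add, tag_one):
    """Bulk-tag a batch of instances, falling back to one-at-a-time on conflict."""
    if not instances or not tags:
        return  # mirrors the early return in apply_import_tags_for_batch
    try:
        bulk_add(tags, instances)  # fast path: one bulk write for the whole batch
    except IntegrityError:
        # a concurrent delete can break the bulk insert; retry per instance
        for instance in instances:
            for tag in tags:
                tag_one(instance, tag)
```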
10 changes: 5 additions & 5 deletions dojo/importers/default_importer.py
@@ -132,11 +132,6 @@ def process_scan(
             new_findings=new_findings,
             closed_findings=closed_findings,
         )
-        # Apply tags to findings and endpoints/locations
-        self.apply_import_tags(
-            new_findings=new_findings,
-            closed_findings=closed_findings,
-        )
         # Send out some notifications to the user
         logger.debug("IMPORT_SCAN: Generating notifications")
         dojo_dispatch_task(
@@ -169,6 +164,7 @@ def process_findings(
     ) -> list[Finding]:
         # Batched post-processing (no chord): dispatch a task per 1000 findings or on final finding
         batch_finding_ids: list[int] = []
+        batch_findings: list[Finding] = []
         batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000)
 
         """
@@ -259,6 +255,7 @@ def process_findings(
             push_to_jira = self.push_to_jira and ((not self.findings_groups_enabled or not self.group_by) or not finding_will_be_grouped)
             logger.debug("process_findings: computed push_to_jira=%s", push_to_jira)
             batch_finding_ids.append(finding.id)
+            batch_findings.append(finding)
 
             # If batch is full or we're at the end, persist locations/endpoints and dispatch
             if len(batch_finding_ids) >= batch_max_size or is_final_finding:
@@ -267,6 +264,9 @@
                 # so rules/deduplication tasks see the tags already on the findings.
                 bulk_apply_parser_tags(findings_with_parser_tags)
                 findings_with_parser_tags.clear()
+                # Apply import-time tags before post-processing so rules/deduplication see them.
+                self.apply_import_tags_for_batch(batch_findings)
+                batch_findings.clear()
                 finding_ids_batch = list(batch_finding_ids)
                 batch_finding_ids.clear()
                 logger.debug("process_findings: dispatching batch with push_to_jira=%s (batch_size=%d, is_final=%s)",
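For orientation, the flush logic in `process_findings()` follows a standard accumulate-and-flush shape. A rough sketch, where everything except `apply_import_tags_for_batch` and the 1000-finding batch size is illustrative:

```python
BATCH_MAX_SIZE = 1000  # stand-in for settings.IMPORT_REIMPORT_DEDUPE_BATCH_SIZE

def flush_in_batches(importer, findings, dispatch_post_processing):
    """Accumulate findings and flush a batch when it is full or at the final finding."""
    batch_findings = []
    for i, finding in enumerate(findings):
        batch_findings.append(finding)
        is_final_finding = i == len(findings) - 1
        if len(batch_findings) >= BATCH_MAX_SIZE or is_final_finding:
            # tag before dispatch so the async dedupe/rules task sees the tags
            importer.apply_import_tags_for_batch(batch_findings)
            dispatch_post_processing([f.id for f in batch_findings])  # hypothetical dispatcher
            batch_findings.clear()
```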
32 changes: 15 additions & 17 deletions dojo/importers/default_reimporter.py
@@ -137,13 +137,6 @@ def process_scan(
             reactivated_findings=reactivated_findings,
             untouched_findings=untouched_findings,
         )
-        # Apply tags to findings and endpoints
-        self.apply_import_tags(
-            new_findings=new_findings,
-            closed_findings=closed_findings,
-            reactivated_findings=reactivated_findings,
-            untouched_findings=untouched_findings,
-        )
         # Send out some notifications to the user
         logger.debug("REIMPORT_SCAN: Generating notifications")
         updated_count = (
@@ -173,7 +166,7 @@

     def get_reimport_match_candidates_for_batch(
         self,
-        batch_findings: list[Finding],
+        unsaved_findings_batch: list[Finding],
     ) -> tuple[dict, dict, dict]:
         """
         Fetch candidate matches for a batch of *unsaved* findings during reimport.
@@ -195,23 +188,23 @@
         if self.deduplication_algorithm == "hash_code":
             candidates_by_hash = find_candidates_for_deduplication_hash(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
         elif self.deduplication_algorithm == "unique_id_from_tool":
             candidates_by_uid = find_candidates_for_deduplication_unique_id(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
         elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code":
             candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
         elif self.deduplication_algorithm == "legacy":
-            candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings)
+            candidates_by_key = find_candidates_for_reimport_legacy(self.test, unsaved_findings_batch)
 
         return candidates_by_hash, candidates_by_uid, candidates_by_key

@@ -308,6 +301,7 @@ def process_findings(
             cleaned_findings.append(sanitized)
 
         batch_finding_ids: list[int] = []
+        batch_findings: list[Finding] = []
         findings_with_parser_tags: list[tuple] = []
         # Batch size for deduplication/post-processing (only new findings)
         dedupe_batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000)
@@ -318,13 +312,13 @@
         # This avoids the 1+N query problem by fetching all candidates for a batch at once
         for batch_start in range(0, len(cleaned_findings), match_batch_max_size):
             batch_end = min(batch_start + match_batch_max_size, len(cleaned_findings))
-            batch_findings = cleaned_findings[batch_start:batch_end]
+            unsaved_findings_batch = cleaned_findings[batch_start:batch_end]
             is_final_batch = batch_end == len(cleaned_findings)
 
             logger.debug(f"Processing reimport batch {batch_start}-{batch_end} of {len(cleaned_findings)} findings")
 
             # Prepare findings in batch: set test, service, calculate hash codes
-            for unsaved_finding in batch_findings:
+            for unsaved_finding in unsaved_findings_batch:
                 # Some parsers provide "mitigated" field but do not set timezone (because they are probably not available in the report)
                 # Finding.mitigated is DateTimeField and it requires timezone
                 if unsaved_finding.mitigated and not unsaved_finding.mitigated.tzinfo:
@@ -342,12 +336,12 @@

             # Fetch all candidates for this batch at once (batch candidate finding)
             candidates_by_hash, candidates_by_uid, candidates_by_key = self.get_reimport_match_candidates_for_batch(
-                batch_findings,
+                unsaved_findings_batch,
             )
 
             # Process each finding in the batch using pre-fetched candidates
-            for idx, unsaved_finding in enumerate(batch_findings):
-                is_final = is_final_batch and idx == len(batch_findings) - 1
+            for idx, unsaved_finding in enumerate(unsaved_findings_batch):
+                is_final = is_final_batch and idx == len(unsaved_findings_batch) - 1
 
                 # Match any findings to this new one coming in using pre-fetched candidates
                 matched_findings = self.match_finding_to_candidate_reimport(
@@ -403,6 +397,7 @@
                 # all data is already saved on the finding, we only need to trigger post processing in batches
                 push_to_jira = self.push_to_jira and ((not self.findings_groups_enabled or not self.group_by) or not finding_will_be_grouped)
                 batch_finding_ids.append(finding.id)
+                batch_findings.append(finding)
 
                 # Post-processing batches (deduplication, rules, etc.) are separate from matching batches.
                 # These batches only contain "new" findings that were saved (not matched to existing findings).
@@ -425,6 +420,9 @@
                     # so rules/deduplication tasks see the tags already on the findings.
                     bulk_apply_parser_tags(findings_with_parser_tags)
                     findings_with_parser_tags.clear()
+                    # Apply import-time tags before post-processing so rules/deduplication see them.
+                    self.apply_import_tags_for_batch(batch_findings)
+                    batch_findings.clear()
                     finding_ids_batch = list(batch_finding_ids)
                     batch_finding_ids.clear()
                     dojo_dispatch_task(
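The rename is worth keeping straight: `unsaved_findings_batch` is the slice of parser output being matched, while `batch_findings` now holds saved new findings awaiting post-processing, mirroring the importer. The batched candidate lookup that avoids the 1+N query problem could look roughly like this for the `hash_code` algorithm (the helper name and exact filter are assumptions, not the project's actual implementation):

```python
from collections import defaultdict

from dojo.models import Finding  # assumed import path for the Finding model

def prefetch_candidates_by_hash(test, unsaved_findings_batch):
    """Fetch match candidates with one query per batch instead of one per finding."""
    hashes = {f.hash_code for f in unsaved_findings_batch if f.hash_code}
    candidates = defaultdict(list)
    # single filtered queryset for the whole batch; illustrative ORM call
    for existing in Finding.objects.filter(test=test, hash_code__in=hashes):
        candidates[existing.hash_code].append(existing)
    return candidates  # keyed by hash_code, ready for per-finding matching
```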
24 changes: 12 additions & 12 deletions unittests/test_importers_performance.py
@@ -343,9 +343,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
         configure_pghistory_triggers()
 
         self._import_reimport_performance(
-            expected_num_queries1=172,
+            expected_num_queries1=171,
             expected_num_async_tasks1=2,
-            expected_num_queries2=125,
+            expected_num_queries2=124,
             expected_num_async_tasks2=1,
             expected_num_queries3=29,
             expected_num_async_tasks3=1,
@@ -367,9 +367,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
         testuser.usercontactinfo.save()
 
         self._import_reimport_performance(
-            expected_num_queries1=188,
+            expected_num_queries1=187,
             expected_num_async_tasks1=2,
-            expected_num_queries2=133,
+            expected_num_queries2=132,
             expected_num_async_tasks2=1,
             expected_num_queries3=37,
             expected_num_async_tasks3=1,
@@ -392,9 +392,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
         self.system_settings(enable_product_grade=True)
 
         self._import_reimport_performance(
-            expected_num_queries1=198,
+            expected_num_queries1=197,
             expected_num_async_tasks1=4,
-            expected_num_queries2=143,
+            expected_num_queries2=142,
             expected_num_async_tasks2=3,
             expected_num_queries3=44,
             expected_num_async_tasks3=3,
@@ -633,9 +633,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
         configure_pghistory_triggers()
 
         self._import_reimport_performance(
-            expected_num_queries1=179,
+            expected_num_queries1=178,
             expected_num_async_tasks1=2,
-            expected_num_queries2=134,
+            expected_num_queries2=133,
             expected_num_async_tasks2=1,
             expected_num_queries3=37,
             expected_num_async_tasks3=1,
@@ -657,9 +657,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
         testuser.usercontactinfo.save()
 
         self._import_reimport_performance(
-            expected_num_queries1=197,
+            expected_num_queries1=196,
             expected_num_async_tasks1=2,
-            expected_num_queries2=144,
+            expected_num_queries2=143,
             expected_num_async_tasks2=1,
             expected_num_queries3=47,
             expected_num_async_tasks3=1,
@@ -682,9 +682,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
         self.system_settings(enable_product_grade=True)
 
         self._import_reimport_performance(
-            expected_num_queries1=210,
+            expected_num_queries1=209,
             expected_num_async_tasks1=4,
-            expected_num_queries2=157,
+            expected_num_queries2=156,
             expected_num_async_tasks2=3,
             expected_num_queries3=54,
             expected_num_async_tasks3=3,
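Every expected count drops by exactly one query, consistent with dropping the old `Finding.objects.filter(id__in=...)` lookup in `apply_import_tags` now that tagging reuses the findings already in memory. Such budgets are typically pinned with Django's `assertNumQueries`; a minimal sketch (the test body and helper are illustrative, not the project's actual test):

```python
from django.test import TestCase

class ImportPerformanceSketch(TestCase):
    def test_import_query_budget(self):
        # fails when the import path issues more or fewer queries than budgeted,
        # catching regressions like an extra per-import tagging query
        with self.assertNumQueries(171):  # illustrative budget
            run_import()  # hypothetical helper performing the scan import
```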