Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 92 additions & 10 deletions server.core/Remediate/PdfRemediationProcessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -577,14 +577,18 @@ public async Task<PdfRemediationResult> ProcessAsync(
// Fallback safety-net: ensure any remaining tagged Figures get *some* alt text.
// This keeps remediation robust even when we can't reliably match content-stream occurrences to tag-tree elements.
var fallbackImageAltSet = 0;
var contentlessFiguresDemoted = 0;
var contentlessFiguresRemoved = 0;
foreach (var figure in PdfStructTreeIndex.ListStructElementsByRole(pdf, PdfName.Figure))
{
if (!StructElemHasAssociatedContent(figure))
{
figure.Remove(PdfName.Alt);
figure.Put(PdfName.S, RoleSpan);
contentlessFiguresDemoted++;
if (!TryRemoveStructElemFromParent(figure))
{
figure.Put(PdfName.S, RoleSpan);
}

contentlessFiguresRemoved++;
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
continue;
}

Expand All @@ -598,14 +602,18 @@ public async Task<PdfRemediationResult> ProcessAsync(
if (_options.GenerateLinkAltText)
{
var fallbackLinkAltSet = 0;
var contentlessLinksDemoted = 0;
var contentlessLinksRemoved = 0;
foreach (var link in PdfStructTreeIndex.ListStructElementsByRole(pdf, PdfName.Link))
{
if (!StructElemHasAssociatedContent(link))
{
link.Remove(PdfName.Alt);
link.Put(PdfName.S, RoleSpan);
contentlessLinksDemoted++;
if (!TryRemoveStructElemFromParent(link))
{
link.Put(PdfName.S, RoleSpan);
}

contentlessLinksRemoved++;
continue;
}

Expand All @@ -617,21 +625,21 @@ public async Task<PdfRemediationResult> ProcessAsync(
}

_logger.LogInformation(
"PDF remediation link alt summary: {fileId} linkOccurrences={linkOccurrences} linkAltSet={linkAltSet} fallbackLinkAltSet={fallbackLinkAltSet} contentlessLinksDemoted={contentlessLinksDemoted}",
"PDF remediation link alt summary: {fileId} linkOccurrences={linkOccurrences} linkAltSet={linkAltSet} fallbackLinkAltSet={fallbackLinkAltSet} contentlessLinksRemoved={contentlessLinksRemoved}",
fileId,
linkOccurrences,
linkAltSet,
fallbackLinkAltSet,
contentlessLinksDemoted);
contentlessLinksRemoved);
}

_logger.LogInformation(
"PDF remediation image alt summary: {fileId} imageOccurrences={imageOccurrences} imageAltSet={imageAltSet} fallbackImageAltSet={fallbackImageAltSet} contentlessFiguresDemoted={contentlessFiguresDemoted}",
"PDF remediation image alt summary: {fileId} imageOccurrences={imageOccurrences} imageAltSet={imageAltSet} fallbackImageAltSet={fallbackImageAltSet} contentlessFiguresRemoved={contentlessFiguresRemoved}",
fileId,
imageOccurrences,
imageAltSet,
fallbackImageAltSet,
contentlessFiguresDemoted);
contentlessFiguresRemoved);

if (vectorFigureCandidates > 0)
{
Expand Down Expand Up @@ -1030,6 +1038,80 @@ private static bool LooksLikeJpeg2000(byte[] bytes) =>
return null;
}

/// <summary>
/// Unlinks <paramref name="structElem"/> from the /K (kids) entry of the dictionary
/// referenced by its /P (parent) entry.
/// </summary>
/// <param name="structElem">Structure element dictionary to detach from its parent.</param>
/// <returns>
/// <c>true</c> when at least one matching entry was removed from the parent's /K;
/// <c>false</c> when the element has no /P dictionary, the parent has no /K entry,
/// or no kid matches the element.
/// </returns>
private static bool TryRemoveStructElemFromParent(PdfDictionary structElem)
{
    var parent = structElem.GetAsDictionary(PdfName.P);
    if (parent is null)
    {
        return false;
    }

    // May be null; IsSameStructElem then falls back to instance identity only.
    var targetRef = structElem.GetIndirectReference();

    var kids = parent.Get(PdfName.K);
    if (kids is null)
    {
        return false;
    }

    kids = DereferenceStructTreeNode(kids);

    if (kids is PdfArray array)
    {
        var removedAny = false;

        // Walk backwards so that removing an entry never shifts an index we have yet to visit.
        for (var i = array.Size() - 1; i >= 0; i--)
        {
            if (IsSameStructElem(array.Get(i), structElem, targetRef))
            {
                array.Remove(i);
                removedAny = true;
            }
        }

        return removedAny;
    }

    // /K holds a single (non-array) value: drop the whole key when it is the target.
    if (IsSameStructElem(kids, structElem, targetRef))
    {
        parent.Remove(PdfName.K);
        return true;
    }

    return false;
}

/// <summary>
/// Decides whether <paramref name="candidate"/> denotes the same structure element as
/// <paramref name="target"/>.
/// </summary>
/// <param name="candidate">A kid entry: either an indirect reference or a direct object.</param>
/// <param name="target">The structure element dictionary being searched for.</param>
/// <param name="targetRef">Indirect reference of <paramref name="target"/>, or <c>null</c> when it has none.</param>
/// <returns>
/// <c>true</c> when the candidate matches by object/generation number or is the very same
/// dictionary instance as <paramref name="target"/>; otherwise <c>false</c>.
/// </returns>
private static bool IsSameStructElem(PdfObject candidate, PdfDictionary target, PdfIndirectReference? targetRef)
{
    // Two references match when both object number and generation number agree.
    bool PointsAtTarget(PdfIndirectReference? reference) =>
        reference is not null
        && targetRef is not null
        && reference.GetObjNumber() == targetRef.GetObjNumber()
        && reference.GetGenNumber() == targetRef.GetGenNumber();

    // A raw reference can be compared by number alone; no need to resolve it.
    if (candidate is PdfIndirectReference rawReference && targetRef is not null)
    {
        return PointsAtTarget(rawReference);
    }

    if (DereferenceStructTreeNode(candidate) is not PdfDictionary resolved)
    {
        return false;
    }

    // Instance identity first; otherwise compare the resolved dictionary's own reference,
    // which is only meaningful when the target has a reference to compare against.
    return ReferenceEquals(resolved, target)
        || (targetRef is not null && PointsAtTarget(resolved.GetIndirectReference()));
}

private static bool StructElemHasAssociatedContent(PdfDictionary structElem)
{
var kids = structElem.Get(PdfName.K);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ namespace server.tests.Integration.Remediate;

public sealed class PdfRemediationProcessorAltTextAssociationTests
{
private static readonly PdfName RoleSpan = new("Span");

[Fact]
public async Task ProcessAsync_ContentlessFigureWithoutAlt_DemotesRole()
{
Expand Down Expand Up @@ -45,7 +43,7 @@ await sut.ProcessAsync(
using var outputPdf = new PdfDocument(new PdfReader(outputPdfPath));
outputPdf.IsTagged().Should().BeTrue();
ListStructElementsByRole(outputPdf, PdfName.Figure).Should().BeEmpty("contentless /Figure nodes should be demoted");
ListStructElementsByRole(outputPdf, RoleSpan).Count.Should().BeGreaterThan(0, "demoted nodes should use a neutral /Span role");
ListStructElements(outputPdf).Should().NotBeEmpty("document structure should remain present");
}
finally
{
Expand Down Expand Up @@ -93,7 +91,7 @@ await sut.ProcessAsync(
outputPdf.IsTagged().Should().BeTrue();

ListStructElementsByRole(outputPdf, PdfName.Figure).Should().BeEmpty("contentless /Figure nodes should be demoted");
ListStructElementsByRole(outputPdf, RoleSpan).Count.Should().BeGreaterThan(0, "demoted nodes should use a neutral /Span role");
ListStructElements(outputPdf).Should().NotBeEmpty("document structure should remain present");

ListStructElements(outputPdf)
.Any(e => string.Equals(GetAlt(e), "alt text for image", StringComparison.OrdinalIgnoreCase))
Expand Down Expand Up @@ -251,4 +249,3 @@ public Task<string> GenerateTitleAsync(PdfTitleRequest request, CancellationToke
}
}
}