Skip to content

Commit

Permalink
Merge pull request #652 from mvantzet/PreventCorruptionWhenRemovingText-538
Browse files Browse the repository at this point in the history

Prevent PDF corruption when removing text (#538)
  • Loading branch information
EliotJones authored Jul 23, 2023
2 parents 94cc9be + 928c2ef commit 6c0f8b7
Show file tree
Hide file tree
Showing 8 changed files with 68 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1279,6 +1279,7 @@ public class TestTokenWriter : ITokenWriter
public int Tokens { get; private set; }
public int Objects { get; private set; }
public bool WroteCrossReferenceTable { get; private set; }
public bool WritingPageContents { get; set; }

public void WriteToken(IToken token, Stream outputStream)
{
Expand Down
7 changes: 5 additions & 2 deletions src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
namespace UglyToad.PdfPig.Writer
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Tokens;

internal interface IPdfStreamWriter : IDisposable
Expand All @@ -20,6 +18,11 @@ internal interface IPdfStreamWriter : IDisposable
/// </summary>
Stream Stream { get; }

/// <summary>
/// Gets or sets a hint telling the stream writer that it is currently being used
/// to write page contents, as opposed to other objects of the document.
/// </summary>
bool WritingPageContents { get; set; }

/// <summary>
/// Writes a single token to the stream.
/// </summary>
Expand Down
5 changes: 5 additions & 0 deletions src/UglyToad.PdfPig/Writer/ITokenWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,10 @@ void WriteCrossReferenceTable(
IReadOnlyDictionary<IndirectReference, long> objectOffsets,
IndirectReference catalogToken, Stream outputStream,
IndirectReference? documentInformationReference);

/// <summary>
/// Gets or sets a hint telling the token writer that page contents are currently being written.
/// Implementations may use it to treat page content streams differently from other streams.
/// </summary>
bool WritingPageContents { get; set; }
}
}
42 changes: 38 additions & 4 deletions src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,28 @@ namespace UglyToad.PdfPig.Writer
/// </summary>
internal class NoTextTokenWriter : TokenWriter
{
/// <summary>
/// Page number handed to the page content parser when a stream is parsed for text removal.
/// Set this value prior to processing each page so that log messages refer to the right page number.
/// </summary>
internal int Page { get; set; }

/// <summary>
/// Write stream without <see cref="ShowText"/> or <see cref="ShowTextsWithPositioning"/> operations
/// </summary>
/// <param name="streamToken"></param>
/// <param name="outputStream"></param>
protected override void WriteStream(StreamToken streamToken, Stream outputStream)
{
if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken))
StreamToken outputStreamToken;
if (!WritingPageContents && !IsFormStream(streamToken))
{
outputStreamToken = streamToken;
}
else if (!TryGetStreamWithoutText(streamToken, out outputStreamToken))
{
outputStreamToken = streamToken;
}

WriteDictionary(outputStreamToken.StreamDictionary, outputStream);
WriteLineBreak(outputStream);
outputStream.Write(StreamStart, 0, StreamStart.Length);
Expand All @@ -38,6 +49,12 @@ protected override void WriteStream(StreamToken streamToken, Stream outputStream
outputStream.Write(StreamEnd, 0, StreamEnd.Length);
}

/// <summary>
/// Determines whether the stream dictionary declares a <c>Subtype</c> entry equal to <c>Form</c>.
/// </summary>
/// <param name="streamToken">The stream whose dictionary is inspected.</param>
/// <returns>
/// <c>true</c> when the <c>Subtype</c> entry exists, is a name token and equals <see cref="NameToken.Form"/>;
/// otherwise <c>false</c>.
/// </returns>
private bool IsFormStream(StreamToken streamToken)
{
    // The Subtype value is only known to be an IToken. A hard cast would throw
    // InvalidCastException for any non-name token, so test the type instead and
    // treat such streams as "not a form".
    return streamToken.StreamDictionary.Data.TryGetValue(NameToken.Subtype.Data, out var value)
        && value is NameToken subtype
        && subtype == NameToken.Form;
}

/// <summary>
/// Try get a stream without <see cref="ShowText"/> or <see cref="ShowTextsWithPositioning"/> operations.
/// </summary>
Expand All @@ -63,7 +80,7 @@ private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken ou
IReadOnlyList<IGraphicsStateOperation> operations;
try
{
operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog());
operations = pageContentParser.Parse(Page, new ByteArrayInputBytes(bytes), new NoOpLog());
}
catch (Exception)
{
Expand All @@ -76,7 +93,9 @@ private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken ou
var haveText = false;
foreach (var op in operations)
{
if (op.Operator == ShowText.Symbol || op.Operator == ShowTextsWithPositioning.Symbol)
if (op.Operator == ShowText.Symbol
|| op.Operator == ShowTextsWithPositioning.Symbol
|| op.Operator == MoveToNextLineShowText.Symbol)
{
haveText = true;
continue;
Expand All @@ -89,7 +108,22 @@ private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken ou
return false;
}
outputStreamT.Seek(0, SeekOrigin.Begin);
outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray());

var compressedBytes = DataCompresser.CompressBytes(outputStreamT.ToArray());
var outputStreamDictionary = new Dictionary<NameToken, IToken>()
{
{ NameToken.Length, new NumericToken(compressedBytes.Length) },
{ NameToken.Filter, NameToken.FlateDecode }
};
foreach (var kv in streamToken.StreamDictionary.Data)
{
var key = NameToken.Create(kv.Key);
if (!outputStreamDictionary.ContainsKey(key))
{
outputStreamDictionary[key] = kv.Value;
}
};
outputStreamToken = new StreamToken(new DictionaryToken(outputStreamDictionary), compressedBytes);
return true;
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ public PdfPageBuilder AddPage(PdfDocument document, int pageNumber, Func<PdfActi
// dedup if on to avoid issues
var prev = context.AttemptDeduplication;
context.AttemptDeduplication = false;
context.WritingPageContents = true;
if (contentsToken is ArrayToken array)
{
foreach (var item in array.Data)
Expand All @@ -378,6 +379,7 @@ public PdfPageBuilder AddPage(PdfDocument document, int pageNumber, Func<PdfActi
WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken));
}
context.AttemptDeduplication = prev;
context.WritingPageContents = false;
}

// manually copy page dict / resources as we need to modify some
Expand Down Expand Up @@ -406,7 +408,6 @@ public PdfPageBuilder AddPage(PdfDocument document, int pageNumber, Func<PdfActi
}
}


foreach (var kvp in pageInfo.Page.Data)
{
if (kvp.Key == NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type)
Expand Down
6 changes: 6 additions & 0 deletions src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ internal class PdfStreamWriter : IPdfStreamWriter

public bool AttemptDeduplication { get; set; } = true;

/// <summary>
/// Gets or sets the page-contents hint. The value is not stored here;
/// both accessors forward to the underlying token writer.
/// </summary>
public bool WritingPageContents
{
    get
    {
        return TokenWriter.WritingPageContents;
    }
    set
    {
        TokenWriter.WritingPageContents = value;
    }
}

internal PdfStreamWriter(
Stream baseStream,
bool disposeStream = true,
Expand Down
7 changes: 5 additions & 2 deletions src/UglyToad.PdfPig/Writer/PdfTextRemover.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,19 +83,22 @@ public static void RemoveText(Stream stream, Stream output, IReadOnlyList<int> p
/// </summary>
public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList<int> pagesBundle = null)
{
using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter()))
var tokenWriter = new NoTextTokenWriter();
using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: tokenWriter))
{
if (pagesBundle == null)
{
for (var i = 1; i <= file.NumberOfPages; i++)
{
tokenWriter.Page = i;
document.AddPage(file, i);
}
}
}
else
{
foreach (var i in pagesBundle)
{
tokenWriter.Page = i;
document.AddPage(file, i);
}
}
Expand Down
6 changes: 6 additions & 0 deletions src/UglyToad.PdfPig/Writer/TokenWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,12 @@ public void WriteCrossReferenceTable(IReadOnlyDictionary<IndirectReference, long
outputStream.Write(Eof, 0, Eof.Length);
}

/// <summary>
/// Gets or sets a value indicating that page contents are currently being written.
/// Can be used by a derived class to decide how a stream should be written.
/// </summary>
public bool WritingPageContents { get; set; }

/// <inheritdoc cref="ITokenWriter.WriteObject" />
public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream)
{
Expand Down

0 comments on commit 6c0f8b7

Please sign in to comment.