From 928c2ef2fc73911667ab7e812d68aeebe8b16c2f Mon Sep 17 00:00:00 2001 From: mvantzet Date: Wed, 21 Jun 2023 14:54:11 +0200 Subject: [PATCH] Prevent reading (and modifying!) non-content streams, reducing chances of PDF corruption. Added skipping operation MoveToNextLineShowText as well. Also duplicate the original stream's dictionary which solves disappearing elements (due to missing SubType / BBox for example). --- .../Writer/PdfDocumentBuilderTests.cs | 1 + .../Writer/IPdfStreamWriter.cs | 7 +++- src/UglyToad.PdfPig/Writer/ITokenWriter.cs | 5 +++ .../Writer/NoTextTokenWriter.cs | 42 +++++++++++++++++-- .../Writer/PdfDocumentBuilder.cs | 3 +- src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs | 6 +++ src/UglyToad.PdfPig/Writer/PdfTextRemover.cs | 7 +++- src/UglyToad.PdfPig/Writer/TokenWriter.cs | 6 +++ 8 files changed, 68 insertions(+), 9 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs index fc58ae2db..ea31263b9 100644 --- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs +++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs @@ -1279,6 +1279,7 @@ public class TestTokenWriter : ITokenWriter public int Tokens { get; private set; } public int Objects { get; private set; } public bool WroteCrossReferenceTable { get; private set; } + public bool WritingPageContents { get; set; } public void WriteToken(IToken token, Stream outputStream) { diff --git a/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs index 035260bc3..cac1b1dec 100644 --- a/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs +++ b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs @@ -1,9 +1,7 @@ namespace UglyToad.PdfPig.Writer { using System; - using System.Collections.Generic; using System.IO; - using System.Text; using Tokens; internal interface IPdfStreamWriter : IDisposable @@ -20,6 +18,11 @@ internal interface IPdfStreamWriter : IDisposable /// Stream Stream { get; } + /// + /// Hints that the stream writer is used for writing page contents. + /// + bool WritingPageContents { get; set; } + /// /// Writes a single token to the stream. /// diff --git a/src/UglyToad.PdfPig/Writer/ITokenWriter.cs b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs index a8efc980b..4b9f449fc 100644 --- a/src/UglyToad.PdfPig/Writer/ITokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/ITokenWriter.cs @@ -37,5 +37,10 @@ void WriteCrossReferenceTable( IReadOnlyDictionary objectOffsets, IndirectReference catalogToken, Stream outputStream, IndirectReference? documentInformationReference); + + /// + /// Hints to the token writer that we are currently writing page contents. + /// + bool WritingPageContents { get; set; } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs index fe77829a4..418bacd18 100644 --- a/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/NoTextTokenWriter.cs @@ -18,6 +18,11 @@ namespace UglyToad.PdfPig.Writer /// internal class NoTextTokenWriter : TokenWriter { + /// + /// Set this value prior to processing page to get the right page number in log messages + /// + internal int Page { get; set; } + /// /// Write stream without or operations /// @@ -25,10 +30,16 @@ internal class NoTextTokenWriter : TokenWriter /// protected override void WriteStream(StreamToken streamToken, Stream outputStream) { - if (!TryGetStreamWithoutText(streamToken, out var outputStreamToken)) + StreamToken outputStreamToken; + if (!WritingPageContents && !IsFormStream(streamToken)) + { + outputStreamToken = streamToken; + } + else if (!TryGetStreamWithoutText(streamToken, out outputStreamToken)) { outputStreamToken = streamToken; } + WriteDictionary(outputStreamToken.StreamDictionary, outputStream); WriteLineBreak(outputStream); outputStream.Write(StreamStart, 0, StreamStart.Length); @@ -38,6 +49,12 @@ protected override void WriteStream(StreamToken streamToken, Stream outputStream outputStream.Write(StreamEnd, 0, StreamEnd.Length); } + private bool IsFormStream(StreamToken streamToken) + { + return streamToken.StreamDictionary.Data.TryGetValue(NameToken.Subtype.Data, out var value) + && (NameToken)value == NameToken.Form; + } + /// /// Try get a stream without or operations. /// @@ -63,7 +80,7 @@ private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken ou IReadOnlyList operations; try { - operations = pageContentParser.Parse(1, new ByteArrayInputBytes(bytes), new NoOpLog()); + operations = pageContentParser.Parse(Page, new ByteArrayInputBytes(bytes), new NoOpLog()); } catch (Exception) { @@ -76,7 +93,9 @@ private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken ou var haveText = false; foreach (var op in operations) { - if (op.Operator == ShowText.Symbol || op.Operator == ShowTextsWithPositioning.Symbol) + if (op.Operator == ShowText.Symbol + || op.Operator == ShowTextsWithPositioning.Symbol + || op.Operator == MoveToNextLineShowText.Symbol) { haveText = true; continue; @@ -89,7 +108,22 @@ private bool TryGetStreamWithoutText(StreamToken streamToken, out StreamToken ou return false; } outputStreamT.Seek(0, SeekOrigin.Begin); - outputStreamToken = DataCompresser.CompressToStream(outputStreamT.ToArray()); + + var compressedBytes = DataCompresser.CompressBytes(outputStreamT.ToArray()); + var outputStreamDictionary = new Dictionary() + { + { NameToken.Length, new NumericToken(compressedBytes.Length) }, + { NameToken.Filter, NameToken.FlateDecode } + }; + foreach (var kv in streamToken.StreamDictionary.Data) + { + var key = NameToken.Create(kv.Key); + if (!outputStreamDictionary.ContainsKey(key)) + { + outputStreamDictionary[key] = kv.Value; + } + }; + outputStreamToken = new StreamToken(new DictionaryToken(outputStreamDictionary), compressedBytes); return true; } } diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index de114a9c5..9f7102608 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -360,6 +360,7 @@ public PdfPageBuilder AddPage(PdfDocument document, int pageNumber, Func TokenWriter.WritingPageContents; + set => TokenWriter.WritingPageContents = value; + } + internal PdfStreamWriter( Stream baseStream, bool disposeStream = true, diff --git a/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs index 046b4c7af..ed407fe1f 100644 --- a/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs +++ b/src/UglyToad.PdfPig/Writer/PdfTextRemover.cs @@ -83,19 +83,22 @@ public static void RemoveText(Stream stream, Stream output, IReadOnlyList p /// public static void RemoveText(PdfDocument file, Stream output, IReadOnlyList pagesBundle = null) { - using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: new NoTextTokenWriter())) + var tokenWriter = new NoTextTokenWriter(); + using (var document = new PdfDocumentBuilder(output, false, PdfWriterType.Default, file.Version, tokenWriter: tokenWriter)) { if (pagesBundle == null) { for (var i = 1; i <= file.NumberOfPages; i++) { + tokenWriter.Page = i; document.AddPage(file, i); } - } + } else { foreach (var i in pagesBundle) { + tokenWriter.Page = i; document.AddPage(file, i); } } diff --git a/src/UglyToad.PdfPig/Writer/TokenWriter.cs b/src/UglyToad.PdfPig/Writer/TokenWriter.cs index 2d0429b4d..058a38494 100644 --- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs +++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs @@ -276,6 +276,12 @@ public void WriteCrossReferenceTable(IReadOnlyDictionary + /// Indicates that we are writing page contents. + /// Can be used by a derived class. + /// + public bool WritingPageContents { get; set; } + /// public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream) {