Handle invalid xml characters in text exporters and fix #655

UglyToad · Aug 5, 2023 · 45e2171 · 45e2171
1 parent 9aaf20c
commit 45e2171
Show file tree

Hide file tree

Showing 11 changed files with 1,068 additions and 117 deletions.
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
@@ -16,11 +16,11 @@
     /// Alto 4.1 (XML) text exporter.
     /// <para>See https://github.com/altoxml/schema </para>
     /// </summary>
-    public class AltoXmlTextExporter : ITextExporter
+    public sealed class AltoXmlTextExporter : ITextExporter
     {
         private readonly IPageSegmenter pageSegmenter;
         private readonly IWordExtractor wordExtractor;
-
+        private readonly Func<string, string> invalidCharacterHandler;
         private readonly double scale;
         private readonly string indentChar;
 
@@ -33,20 +33,60 @@ public class AltoXmlTextExporter : ITextExporter
         private int stringCount;
         private int glyphCount;
 
+        /// <inheritdoc/>
+        public InvalidCharStrategy InvalidCharStrategy { get; }
+
         /// <summary>
         /// Alto 4.1 (XML).
         /// <para>See https://github.com/altoxml/schema </para>
         /// </summary>
         /// <param name="wordExtractor">Extractor used to identify words in the document.</param>
         /// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
         /// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
-        /// <param name="indent">Character to use for indentation, defaults to tab.</param>
-        public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1, string indent = "\t")
+        /// <param name="indentChar">Character to use for indentation; a null value is treated as an empty string.</param>
+        /// <param name="invalidCharacterHandler">Custom function applied to the text of each word and glyph to handle invalid characters before export.</param>
+        public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+                                   double scale, string indentChar,
+                                   Func<string, string> invalidCharacterHandler)
+            : this(wordExtractor, pageSegmenter, scale, indentChar,
+                  InvalidCharStrategy.Custom, invalidCharacterHandler)
+        { }
+
+        /// <summary>
+        /// Alto 4.1 (XML).
+        /// <para>See https://github.com/altoxml/schema </para>
+        /// </summary>
+        /// <param name="wordExtractor">Extractor used to identify words in the document.</param>
+        /// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
+        /// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
+        /// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
+        /// <param name="invalidCharacterStrategy">How to handle invalid characters.</param>
+        public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+                                   double scale = 1, string indentChar = "\t",
+                                   InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
+             : this(wordExtractor, pageSegmenter, scale, indentChar,
+                  invalidCharacterStrategy, null)
+        { }
+
+        private AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+                           double scale, string indentChar,
+                           InvalidCharStrategy invalidCharacterStrategy,
+                           Func<string, string> invalidCharacterHandler)
         {
-            this.wordExtractor = wordExtractor ?? throw new ArgumentNullException(nameof(wordExtractor));
-            this.pageSegmenter = pageSegmenter ?? throw new ArgumentNullException(nameof(pageSegmenter));
+            this.wordExtractor = wordExtractor;
+            this.pageSegmenter = pageSegmenter;
             this.scale = scale;
-            indentChar = indent ?? string.Empty;
+            this.indentChar = indentChar ?? string.Empty;
+            InvalidCharStrategy = invalidCharacterStrategy;
+
+            if (invalidCharacterHandler is null)
+            {
+                this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
+            }
+            else
+            {
+                this.invalidCharacterHandler = invalidCharacterHandler;
+            }
         }
 
         /// <summary>
@@ -57,10 +97,7 @@ public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegm
         public string Get(PdfDocument document, bool includePaths = false)
         {
             var altoDocument = CreateAltoDocument("unknown");
-            var altoPages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray();
-
-            altoDocument.Layout.Pages = altoPages;
-
+            altoDocument.Layout.Pages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray();
             return Serialize(altoDocument);
         }
 
@@ -128,8 +165,8 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
                 {
                     Height = (float)Math.Round(page.Height * scale),    // TBD
                     Width = (float)Math.Round(page.Width * scale),      // TBD
-                    VerticalPosition = 0f,                                          // TBD
-                    HorizontalPosition = 0f,                                          // TBD
+                    VerticalPosition = 0f,                              // TBD
+                    HorizontalPosition = 0f,                            // TBD
                     ComposedBlocks = null,                              // TBD
                     GraphicalElements = null,                           // TBD
                     Illustrations = null,                               // TBD
@@ -141,9 +178,7 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
             };
 
             var words = page.GetWords(wordExtractor);
-            var blocks = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
-
-            altoPage.PrintSpace.TextBlock = blocks;
+            altoPage.PrintSpace.TextBlock = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
 
             altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray();
 
@@ -222,7 +257,6 @@ private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, dou
         {
             textLineCount++;
             var strings = textLine.Words
-                .Where(x => x.Text.All(XmlConvert.IsXmlChar))
                 .Select(w => ToAltoString(w, height)).ToArray();
 
             return new AltoDocument.AltoTextBlockTextLine
@@ -252,7 +286,7 @@ private AltoDocument.AltoString ToAltoString(Word word, double height)
                 Width = (float)Math.Round(word.BoundingBox.Width * scale),
                 Glyph = glyphs,
                 Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0
-                Content = word.Text,
+                Content = invalidCharacterHandler(word.Text),
                 Language = null,
                 StyleRefs = null,
                 SubsContent = null,
@@ -272,7 +306,7 @@ private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, double height)
                 Height = (float)Math.Round(letter.GlyphRectangle.Height * scale),
                 Width = (float)Math.Round(letter.GlyphRectangle.Width * scale),
                 Gc = 1.0f,
-                Content = letter.Value,
+                Content = invalidCharacterHandler(letter.Value),
                 Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000") + "_G" + glyphCount.ToString("#00")
             };
         }
@@ -314,8 +348,8 @@ private AltoDocument.AltoDescription GetAltoDescription(string fileName)
                 Processings = new[] { processing },
                 SourceImageInformation = new AltoDocument.AltoSourceImageInformation
                 {
-                    DocumentIdentifiers = new [] { documentIdentifier },
-                    FileIdentifiers = new [] { fileIdentifier },
+                    DocumentIdentifiers = new[] { documentIdentifier },
+                    FileIdentifiers = new[] { fileIdentifier },
                     FileName = fileName
                 }
             };
@@ -329,6 +363,7 @@ private string Serialize(AltoDocument altoDocument)
                 Encoding = System.Text.Encoding.UTF8,
                 Indent = true,
                 IndentChars = indentChar,
+                CheckCharacters = InvalidCharStrategy != InvalidCharStrategy.DoNotCheck,
             };
 
             using (var memoryStream = new System.IO.MemoryStream())
@@ -346,7 +381,12 @@ public static AltoDocument Deserialize(string xmlPath)
         {
             var serializer = new XmlSerializer(typeof(AltoDocument));
 
-            using (var reader = XmlReader.Create(xmlPath))
+            var settings = new XmlReaderSettings()
+            {
+                CheckCharacters = false
+            };
+
+            using (var reader = XmlReader.Create(xmlPath, settings))
             {
                 return (AltoDocument)serializer.Deserialize(reader);
             }

diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs
@@ -13,14 +13,14 @@
     /// hOCR v1.2 (HTML) text exporter.
     /// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
     /// </summary>
-    public class HOcrTextExporter : ITextExporter
+    public sealed class HOcrTextExporter : ITextExporter
     {
         private const string XmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
         private const string Hocrjs = "<script src='https://unpkg.com/hocrjs'></script>\n";
 
         private readonly IPageSegmenter pageSegmenter;
         private readonly IWordExtractor wordExtractor;
-
+        private readonly Func<string, string> invalidCharacterHandler;
         private readonly double scale;
         private readonly string indentChar;
 
@@ -32,16 +32,60 @@ public class HOcrTextExporter : ITextExporter
         private int paraCount;
         private int imageCount;
 
+        /// <inheritdoc/>
+        public InvalidCharStrategy InvalidCharStrategy { get; }
+
+        /// <summary>
+        /// hOCR v1.2 (HTML)
+        /// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
+        /// </summary>
+        /// <param name="wordExtractor">Extractor used to identify words in the document.</param>
+        /// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
+        /// <param name="scale">Scale multiplier to apply to output document.</param>
+        /// <param name="indentChar">Character to use for indentation; a null value is treated as an empty string.</param>
+        /// <param name="invalidCharacterHandler">Custom function applied to the text of each word to handle invalid characters before export.</param>
+        public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+                                   double scale, string indentChar,
+                                   Func<string, string> invalidCharacterHandler)
+            : this(wordExtractor, pageSegmenter, scale, indentChar,
+                  InvalidCharStrategy.Custom, invalidCharacterHandler)
+        { }
+
         /// <summary>
         /// hOCR v1.2 (HTML)
         /// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
         /// </summary>
-        public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
+        /// <param name="wordExtractor">Extractor used to identify words in the document.</param>
+        /// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
+        /// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
+        /// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
+        /// <param name="invalidCharacterStrategy">How to handle invalid characters.</param>
+        public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+                                   double scale = 1, string indentChar = "\t",
+                                   InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
+             : this(wordExtractor, pageSegmenter, scale, indentChar,
+                  invalidCharacterStrategy, null)
+        { }
+
+        private HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+                           double scale, string indentChar,
+                           InvalidCharStrategy invalidCharacterStrategy,
+                           Func<string, string> invalidCharacterHandler)
         {
             this.wordExtractor = wordExtractor;
             this.pageSegmenter = pageSegmenter;
             this.scale = scale;
-            indentChar = indent;
+            this.indentChar = indentChar ?? string.Empty;
+            InvalidCharStrategy = invalidCharacterStrategy;
+
+            if (invalidCharacterHandler is null)
+            {
+                this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
+            }
+            else
+            {
+                this.invalidCharacterHandler = invalidCharacterHandler;
+            }
         }
 
         /// <summary>
@@ -325,7 +369,7 @@ private string GetCode(Word word, double pageHeight, int level)
             }
             hocr += "'";
 
-            hocr += ">" + word.Text + "</span> ";
+            hocr += ">" + invalidCharacterHandler(word.Text) + "</span> ";
             return hocr;
         }
 

diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs
@@ -0,0 +1,28 @@
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
+{
+    /// <summary>
+    /// Strategy used by the text exporters to handle invalid characters.
+    /// </summary>
+    public enum InvalidCharStrategy : byte
+    {
+        /// <summary>
+        /// Custom strategy: invalid characters are handled by a caller-supplied function.
+        /// </summary>
+        Custom = 0,
+
+        /// <summary>
+        /// Do not check for invalid characters.
+        /// </summary>
+        DoNotCheck = 1,
+
+        /// <summary>
+        /// Remove invalid characters.
+        /// </summary>
+        Remove = 2,
+
+        /// <summary>
+        /// Convert invalid characters to their hexadecimal representation.
+        /// </summary>
+        ConvertToHexadecimal = 3
+    }
+}