From 45e21717c83a2f3c0797759df2728732cb229b5e Mon Sep 17 00:00:00 2001 From: BobLd <38405645+BobLd@users.noreply.github.com> Date: Wed, 28 Jun 2023 19:20:51 +0100 Subject: [PATCH] Handle invalid xml characters in text exporters and fix #655 --- .../Export/AltoXmlTextExporter.cs | 84 +++++-- .../Export/HOcrTextExporter.cs | 54 ++++- .../Export/InvalidCharStrategy.cs | 28 +++ .../Export/PageXmlTextExporter.cs | 206 ++++++++++++------ .../Export/SvgTextExporter.cs | 71 ++++-- .../Export/TextExporterHelper.cs | 71 ++++++ .../Integration/AltoXmlTextExporterTests.cs | 189 ++++++++++++++++ .../Integration/Documents/hex_0x0006.pdf | Bin 0 -> 10002 bytes .../Integration/HOcrTextExporterTests.cs | 144 ++++++++++++ .../Integration/PageXmlTextExporterTests.cs | 185 +++++++++++++++- .../Integration/SvgTextExporterTests.cs | 153 +++++++++++++ 11 files changed, 1068 insertions(+), 117 deletions(-) create mode 100644 src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs create mode 100644 src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/TextExporterHelper.cs create mode 100644 src/UglyToad.PdfPig.Tests/Integration/AltoXmlTextExporterTests.cs create mode 100644 src/UglyToad.PdfPig.Tests/Integration/Documents/hex_0x0006.pdf create mode 100644 src/UglyToad.PdfPig.Tests/Integration/HOcrTextExporterTests.cs create mode 100644 src/UglyToad.PdfPig.Tests/Integration/SvgTextExporterTests.cs diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs index acbe58276..c259f8338 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs @@ -16,11 +16,11 @@ /// Alto 4.1 (XML) text exporter. /// See https://github.com/altoxml/schema /// - public class AltoXmlTextExporter : ITextExporter + public sealed class AltoXmlTextExporter : ITextExporter { private readonly IPageSegmenter pageSegmenter; private readonly IWordExtractor wordExtractor; - + private readonly Func invalidCharacterHandler; private readonly double scale; private readonly string indentChar; @@ -33,6 +33,9 @@ public class AltoXmlTextExporter : ITextExporter private int stringCount; private int glyphCount; + /// + public InvalidCharStrategy InvalidCharStrategy { get; } + /// /// Alto 4.1 (XML). /// See https://github.com/altoxml/schema @@ -40,13 +43,50 @@ public class AltoXmlTextExporter : ITextExporter /// Extractor used to identify words in the document. /// Segmenter used to split page into blocks. /// Scale multiplier to apply to output document, defaults to 1. - /// Character to use for indentation, defaults to tab. - public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1, string indent = "\t") + /// Character to use for indentation, defaults to tab. + /// How to handle invalid characters. + public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + double scale, string indentChar, + Func invalidCharacterHandler) + : this(wordExtractor, pageSegmenter, scale, indentChar, + InvalidCharStrategy.Custom, invalidCharacterHandler) + { } + + /// + /// Alto 4.1 (XML). + /// See https://github.com/altoxml/schema + /// + /// Extractor used to identify words in the document. + /// Segmenter used to split page into blocks. + /// Scale multiplier to apply to output document, defaults to 1. + /// Character to use for indentation, defaults to tab. + /// How to handle invalid characters. + public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + double scale = 1, string indentChar = "\t", + InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck) + : this(wordExtractor, pageSegmenter, scale, indentChar, + invalidCharacterStrategy, null) + { } + + private AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + double scale, string indentChar, + InvalidCharStrategy invalidCharacterStrategy, + Func invalidCharacterHandler) { - this.wordExtractor = wordExtractor ?? throw new ArgumentNullException(nameof(wordExtractor)); - this.pageSegmenter = pageSegmenter ?? throw new ArgumentNullException(nameof(pageSegmenter)); + this.wordExtractor = wordExtractor; + this.pageSegmenter = pageSegmenter; this.scale = scale; - indentChar = indent ?? string.Empty; + this.indentChar = indentChar ?? string.Empty; + InvalidCharStrategy = invalidCharacterStrategy; + + if (invalidCharacterHandler is null) + { + this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy); + } + else + { + this.invalidCharacterHandler = invalidCharacterHandler; + } } /// @@ -57,10 +97,7 @@ public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegm public string Get(PdfDocument document, bool includePaths = false) { var altoDocument = CreateAltoDocument("unknown"); - var altoPages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray(); - - altoDocument.Layout.Pages = altoPages; - + altoDocument.Layout.Pages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray(); return Serialize(altoDocument); } @@ -128,8 +165,8 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths) { Height = (float)Math.Round(page.Height * scale), // TBD Width = (float)Math.Round(page.Width * scale), // TBD - VerticalPosition = 0f, // TBD - HorizontalPosition = 0f, // TBD + VerticalPosition = 0f, // TBD + HorizontalPosition = 0f, // TBD ComposedBlocks = null, // TBD GraphicalElements = null, // TBD Illustrations = null, // TBD @@ -141,9 +178,7 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths) }; var words = page.GetWords(wordExtractor); - var blocks = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray(); - - altoPage.PrintSpace.TextBlock = blocks; + altoPage.PrintSpace.TextBlock = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray(); altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray(); @@ -222,7 +257,6 @@ private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, dou { textLineCount++; var strings = textLine.Words - .Where(x => x.Text.All(XmlConvert.IsXmlChar)) .Select(w => ToAltoString(w, height)).ToArray(); return new AltoDocument.AltoTextBlockTextLine @@ -252,7 +286,7 @@ private AltoDocument.AltoString ToAltoString(Word word, double height) Width = (float)Math.Round(word.BoundingBox.Width * scale), Glyph = glyphs, Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0 - Content = word.Text, + Content = invalidCharacterHandler(word.Text), Language = null, StyleRefs = null, SubsContent = null, @@ -272,7 +306,7 @@ private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, double height) Height = (float)Math.Round(letter.GlyphRectangle.Height * scale), Width = (float)Math.Round(letter.GlyphRectangle.Width * scale), Gc = 1.0f, - Content = letter.Value, + Content = invalidCharacterHandler(letter.Value), Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000") + "_G" + glyphCount.ToString("#00") }; } @@ -314,8 +348,8 @@ private AltoDocument.AltoDescription GetAltoDescription(string fileName) Processings = new[] { processing }, SourceImageInformation = new AltoDocument.AltoSourceImageInformation { - DocumentIdentifiers = new [] { documentIdentifier }, - FileIdentifiers = new [] { fileIdentifier }, + DocumentIdentifiers = new[] { documentIdentifier }, + FileIdentifiers = new[] { fileIdentifier }, FileName = fileName } }; @@ -329,6 +363,7 @@ private string Serialize(AltoDocument altoDocument) Encoding = System.Text.Encoding.UTF8, Indent = true, IndentChars = indentChar, + CheckCharacters = InvalidCharStrategy != InvalidCharStrategy.DoNotCheck, }; using (var memoryStream = new System.IO.MemoryStream()) @@ -346,7 +381,12 @@ public static AltoDocument Deserialize(string xmlPath) { var serializer = new XmlSerializer(typeof(AltoDocument)); - using (var reader = XmlReader.Create(xmlPath)) + var settings = new XmlReaderSettings() + { + CheckCharacters = false + }; + + using (var reader = XmlReader.Create(xmlPath, settings)) { return (AltoDocument)serializer.Deserialize(reader); } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs index 40cc53dc0..89be28588 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs @@ -13,14 +13,14 @@ /// hOCR v1.2 (HTML) text exporter. /// See http://kba.cloud/hocr-spec/1.2/ /// - public class HOcrTextExporter : ITextExporter + public sealed class HOcrTextExporter : ITextExporter { private const string XmlHeader = "\n\n"; private const string Hocrjs = "\n"; private readonly IPageSegmenter pageSegmenter; private readonly IWordExtractor wordExtractor; - + private readonly Func invalidCharacterHandler; private readonly double scale; private readonly string indentChar; @@ -32,16 +32,60 @@ public class HOcrTextExporter : ITextExporter private int paraCount; private int imageCount; + /// + public InvalidCharStrategy InvalidCharStrategy { get; } + + /// + /// hOCR v1.2 (HTML) + /// See http://kba.cloud/hocr-spec/1.2/ + /// + /// Extractor used to identify words in the document. + /// Segmenter used to split page into blocks. + /// Scale multiplier to apply to output document, defaults to 1. + /// Character to use for indentation, defaults to tab. + /// How to handle invalid characters. + public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + double scale, string indentChar, + Func invalidCharacterHandler) + : this(wordExtractor, pageSegmenter, scale, indentChar, + InvalidCharStrategy.Custom, invalidCharacterHandler) + { } + /// /// hOCR v1.2 (HTML) /// See http://kba.cloud/hocr-spec/1.2/ /// - public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t") + /// Extractor used to identify words in the document. + /// Segmenter used to split page into blocks. + /// Scale multiplier to apply to output document, defaults to 1. + /// Character to use for indentation, defaults to tab. + /// How to handle invalid characters. + public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + double scale = 1, string indentChar = "\t", + InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck) + : this(wordExtractor, pageSegmenter, scale, indentChar, + invalidCharacterStrategy, null) + { } + + private HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + double scale, string indentChar, + InvalidCharStrategy invalidCharacterStrategy, + Func invalidCharacterHandler) { this.wordExtractor = wordExtractor; this.pageSegmenter = pageSegmenter; this.scale = scale; - indentChar = indent; + this.indentChar = indentChar ?? string.Empty; + InvalidCharStrategy = invalidCharacterStrategy; + + if (invalidCharacterHandler is null) + { + this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy); + } + else + { + this.invalidCharacterHandler = invalidCharacterHandler; + } } /// @@ -325,7 +369,7 @@ private string GetCode(Word word, double pageHeight, int level) } hocr += "'"; - hocr += ">" + word.Text + " "; + hocr += ">" + invalidCharacterHandler(word.Text) + " "; return hocr; } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs new file mode 100644 index 000000000..a7db4c7de --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs @@ -0,0 +1,28 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export +{ + /// + /// How to handle invalid characters. + /// + public enum InvalidCharStrategy : byte + { + /// + /// Custom strategy. + /// + Custom = 0, + + /// + /// Do not check invalid character. + /// + DoNotCheck = 1, + + /// + /// Remove invalid character. + /// + Remove = 2, + + /// + /// Convert invalid character to hexadecimal representation. + /// + ConvertToHexadecimal = 3 + } +} diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs index 64a18810f..47337bba3 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs @@ -3,38 +3,51 @@ using Content; using Core; using DocumentLayoutAnalysis; + using Graphics; using Graphics.Colors; using PAGE; + using PageSegmenter; + using ReadingOrderDetector; using System; using System.Collections.Generic; using System.Linq; using System.Xml; using System.Xml.Serialization; - using PageSegmenter; - using ReadingOrderDetector; - using Graphics; using Util; /// /// PAGE-XML 2019-07-15 (XML) text exporter. /// See https://github.com/PRImA-Research-Lab/PAGE-XML /// - public class PageXmlTextExporter : ITextExporter + public sealed class PageXmlTextExporter : ITextExporter { private readonly IPageSegmenter pageSegmenter; private readonly IWordExtractor wordExtractor; private readonly IReadingOrderDetector readingOrderDetector; - + private readonly Func invalidCharacterHandler; private readonly double scale; private readonly string indentChar; - private int lineCount; - private int wordCount; - private int glyphCount; - private int regionCount; - private int groupOrderCount; + /// + public InvalidCharStrategy InvalidCharStrategy { get; } - private List orderedRegions; + /// + /// PAGE-XML 2019-07-15 (XML) text exporter. + /// See https://github.com/PRImA-Research-Lab/PAGE-XML + /// + /// + /// + /// + /// + /// Indent character. + /// How to handle invalid characters. + public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + IReadingOrderDetector readingOrderDetector, + double scale, string indentChar, + Func invalidCharacterHandler) + : this(wordExtractor, pageSegmenter, readingOrderDetector, scale, indentChar, + InvalidCharStrategy.Custom, invalidCharacterHandler) + { } /// /// PAGE-XML 2019-07-15 (XML) text exporter. @@ -42,20 +55,44 @@ public class PageXmlTextExporter : ITextExporter /// /// /// - /// + /// /// /// Indent character. - public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, IReadingOrderDetector readingOrderDetector = null, double scale = 1.0, string indent = "\t") + /// How to handle invalid characters. + public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + IReadingOrderDetector readingOrderDetector = null, + double scale = 1.0, string indent = "\t", + InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck) + : this(wordExtractor, pageSegmenter, readingOrderDetector, scale, indent, + invalidCharacterStrategy, null) + { } + + private PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, + IReadingOrderDetector readingOrderDetector, + double scale, string indentChar, + InvalidCharStrategy invalidCharacterStrategy, + Func invalidCharacterHandler) { this.wordExtractor = wordExtractor; this.pageSegmenter = pageSegmenter; this.readingOrderDetector = readingOrderDetector; this.scale = scale; - indentChar = indent; + this.indentChar = indentChar ?? string.Empty; + InvalidCharStrategy = invalidCharacterStrategy; + + if (invalidCharacterHandler is null) + { + this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy); + } + else + { + this.invalidCharacterHandler = invalidCharacterHandler; + } } /// /// Get the PAGE-XML (XML) string of the pages layout. + /// Not implemented, use instead. /// /// /// Draw PdfPaths present in the page. @@ -80,26 +117,23 @@ public string Get(Page page) /// Draw PdfPaths present in the page. public string Get(Page page, bool includePaths) { - lineCount = 0; - wordCount = 0; - glyphCount = 0; - regionCount = 0; - groupOrderCount = 0; - orderedRegions = new List(); + PageXmlData data = new PageXmlData(); + + DateTime utcNow = DateTime.UtcNow; PageXmlDocument pageXmlDocument = new PageXmlDocument() { Metadata = new PageXmlDocument.PageXmlMetadata() { - Created = DateTime.UtcNow, - LastChange = DateTime.UtcNow, + Created = utcNow, + LastChange = utcNow, Creator = "PdfPig", Comments = pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name, }, PcGtsId = "pc-" + page.GetHashCode() }; - pageXmlDocument.Page = ToPageXmlPage(page, includePaths); + pageXmlDocument.Page = ToPageXmlPage(page, data, includePaths); return Serialize(pageXmlDocument); } @@ -151,17 +185,17 @@ private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double /// private string ToRgbEncoded(IColor color) { - var rgb = color.ToRGBValues(); - int red = (int)Math.Round(255f * (float)rgb.r); - int green = 256 * (int)Math.Round(255f * (float)rgb.g); - int blue = 65536 * (int)Math.Round(255f * (float)rgb.b); + var (r, g, b) = color.ToRGBValues(); + int red = Convert.ToByte(255.0 * r); + int green = 256 * Convert.ToByte(255.0 * g); + int blue = 65536 * Convert.ToByte(255.0 * b); int sum = red + green + blue; // as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum); return sum.ToString(); } - private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths) + private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, PageXmlData data, bool includePaths) { var pageXmlPage = new PageXmlDocument.PageXmlPage { @@ -182,16 +216,17 @@ private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths) blocks = readingOrderDetector.Get(blocks).ToList(); } - regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height))); + regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, data, page.Width, page.Height))); - if (orderedRegions.Count > 0) + if (data.OrderedRegions.Count > 0) { + data.GroupOrdersCount++; pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder() { Item = new PageXmlDocument.PageXmlOrderedGroup() { - Items = orderedRegions.ToArray(), - Id = "g" + groupOrderCount++ + Items = data.OrderedRegions.ToArray(), + Id = "g" + data.GroupOrdersCount } }; } @@ -200,14 +235,14 @@ private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths) var images = page.GetImages().ToList(); if (images.Count > 0) { - regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Width, page.Height))); + regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, data, page.Width, page.Height))); } if (includePaths) { foreach (var path in page.ExperimentalAccess.Paths) { - var graphicalElement = ToPageXmlLineDrawingRegion(path, page.Width, page.Height); + var graphicalElement = ToPageXmlLineDrawingRegion(path, data, page.Width, page.Height); if (graphicalElement != null) { @@ -220,40 +255,40 @@ private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths) return pageXmlPage; } - private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfPath pdfPath, double pageWidth, double pageHeight) + private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfPath pdfPath, PageXmlData data, double pageWidth, double pageHeight) { var bbox = pdfPath.GetBoundingRectangle(); if (bbox.HasValue) { - regionCount++; + data.RegionsCount++; return new PageXmlDocument.PageXmlLineDrawingRegion() { Coords = ToCoords(bbox.Value, pageWidth, pageHeight), - Id = "r" + regionCount + Id = "r" + data.RegionsCount }; } return null; } - private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, double pageWidth, double pageHeight) + private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, PageXmlData data, double pageWidth, double pageHeight) { - regionCount++; + data.RegionsCount++; var bbox = pdfImage.Bounds; return new PageXmlDocument.PageXmlImageRegion() { Coords = ToCoords(bbox, pageWidth, pageHeight), - Id = "r" + regionCount + Id = "r" + data.RegionsCount }; } - private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double pageWidth, double pageHeight) + private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, PageXmlData data, double pageWidth, double pageHeight) { - regionCount++; - string regionId = "r" + regionCount; + data.RegionsCount++; + string regionId = "r" + data.RegionsCount; if (readingOrderDetector != null && textBlock.ReadingOrder > -1) { - orderedRegions.Add(new PageXmlDocument.PageXmlRegionRefIndexed() + data.OrderedRegions.Add(new PageXmlDocument.PageXmlRegionRefIndexed() { RegionRef = regionId, Index = textBlock.ReadingOrder @@ -264,40 +299,58 @@ private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBloc { Coords = ToCoords(textBlock.BoundingBox, pageWidth, pageHeight), Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph, - TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, pageWidth, pageHeight)).ToArray(), - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } }, + TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, data, pageWidth, pageHeight)).ToArray(), + TextEquivs = new[] + { + new PageXmlDocument.PageXmlTextEquiv() + { + Unicode = invalidCharacterHandler(textBlock.Text) + } + }, Id = regionId }; } - private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double pageWidth, double pageHeight) + private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, PageXmlData data, double pageWidth, double pageHeight) { - lineCount++; + data.LinesCount++; return new PageXmlDocument.PageXmlTextLine() { Coords = ToCoords(textLine.BoundingBox, pageWidth, pageHeight), Production = PageXmlDocument.PageXmlProductionSimpleType.Printed, - Words = textLine.Words.Select(w => ToPageXmlWord(w, pageWidth, pageHeight)).ToArray(), - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } }, - Id = "l" + lineCount + Words = textLine.Words.Select(w => ToPageXmlWord(w, data, pageWidth, pageHeight)).ToArray(), + TextEquivs = new[] + { + new PageXmlDocument.PageXmlTextEquiv() + { + Unicode = invalidCharacterHandler(textLine.Text) + } + }, + Id = "l" + data.LinesCount }; } - private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double pageWidth, double pageHeight) + private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, PageXmlData data, double pageWidth, double pageHeight) { - wordCount++; + data.WordsCount++; return new PageXmlDocument.PageXmlWord() { Coords = ToCoords(word.BoundingBox, pageWidth, pageHeight), - Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, pageWidth, pageHeight)).ToArray(), - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = word.Text } }, - Id = "w" + wordCount + Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, data, pageWidth, pageHeight)).ToArray(), + TextEquivs = new[] + { + new PageXmlDocument.PageXmlTextEquiv() + { + Unicode = invalidCharacterHandler(word.Text) + } + }, + Id = "w" + data.WordsCount }; } - private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWidth, double pageHeight) + private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, PageXmlData data, double pageWidth, double pageHeight) { - glyphCount++; + data.GlyphsCount++; return new PageXmlDocument.PageXmlGlyph() { Coords = ToCoords(letter.GlyphRectangle, pageWidth, pageHeight), @@ -309,8 +362,14 @@ private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWi FontFamily = letter.FontName, TextColourRgb = ToRgbEncoded(letter.Color), }, - TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = letter.Value } }, - Id = "c" + glyphCount + TextEquivs = new[] + { + new PageXmlDocument.PageXmlTextEquiv() + { + Unicode = invalidCharacterHandler(letter.Value) + } + }, + Id = "c" + data.GlyphsCount }; } @@ -322,6 +381,7 @@ private string Serialize(PageXmlDocument pageXmlDocument) Encoding = System.Text.Encoding.UTF8, Indent = true, IndentChars = indentChar, + CheckCharacters = InvalidCharStrategy != InvalidCharStrategy.DoNotCheck, }; using (var memoryStream = new System.IO.MemoryStream()) @@ -339,10 +399,34 @@ public static PageXmlDocument Deserialize(string xmlPath) { XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument)); - using (var reader = XmlReader.Create(xmlPath)) + var settings = new XmlReaderSettings() + { + CheckCharacters = false + }; + + using (var reader = XmlReader.Create(xmlPath, settings)) { return (PageXmlDocument)serializer.Deserialize(reader); } } + + /// + /// Class to keep track of a page data. + /// + private sealed class PageXmlData + { + public PageXmlData() + { + OrderedRegions = new List(); + } + + public int LinesCount { get; set; } + public int WordsCount { get; set; } + public int GlyphsCount { get; set; } + public int RegionsCount { get; set; } + public int GroupOrdersCount { get; set; } + + public List OrderedRegions { get; } + } } } diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs index 46151923e..4137ac975 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs @@ -13,38 +13,72 @@ /// /// Exports a page as an SVG. /// - public class SvgTextExporter : ITextExporter + public sealed class SvgTextExporter : ITextExporter { - private const int Rounding = 4; + private readonly Func invalidCharacterHandler; private static readonly Dictionary Fonts = new Dictionary() { { "ArialMT", "Arial Rounded MT Bold" } }; + /// + /// Used to round numbers. + /// + public int Rounding { get; } = 4; + + /// + /// + /// Not in use. + /// + public InvalidCharStrategy InvalidCharStrategy { get; } + + /// + /// Svg text exporter. + /// + /// How to handle invalid characters. + public SvgTextExporter(Func invalidCharacterHandler) + : this(InvalidCharStrategy.Custom, invalidCharacterHandler) + { } + + /// + /// Svg text exporter. + /// + /// How to handle invalid characters. + public SvgTextExporter(InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck) + : this(invalidCharacterStrategy, null) + { } + + private SvgTextExporter(InvalidCharStrategy invalidCharacterStrategy, Func invalidCharacterHandler) + { + InvalidCharStrategy = invalidCharacterStrategy; + + if (invalidCharacterHandler is null) + { + this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy); + } + else + { + this.invalidCharacterHandler = invalidCharacterHandler; + } + } /// /// Get the page contents as an SVG. /// public string Get(Page page) { - var builder = new StringBuilder($""); + var builder = new StringBuilder($"\n\n"); - var paths = page.ExperimentalAccess.Paths; - foreach (var path in paths) + foreach (var path in page.ExperimentalAccess.Paths) { - if (path.IsClipping) + if (!path.IsClipping) { - //var svg = PathToSvg(path, page.Height); - //svg = svg.Replace("stroke='black'", "stroke='yellow'"); - //builder.Append(svg); - } - else - { - builder.Append(PathToSvg(path, page.Height)); + builder.AppendLine(PathToSvg(path, page.Height)); } } var doc = new XmlDocument(); + foreach (var letter in page.Letters) { builder.Append(LetterToSvg(letter, page.Height, doc)); @@ -54,7 +88,7 @@ public string Get(Page page) return builder.ToString(); } - private static string LetterToSvg(Letter l, double height, XmlDocument doc) + private string LetterToSvg(Letter l, double height, XmlDocument doc) { string fontFamily = GetFontFamily(l.FontName, out string style, out string weight); string rotation = ""; @@ -131,10 +165,10 @@ private static string GetFontFamily(string fontName, out string style, out strin return fontName; } - private static string XmlEscape(Letter letter, XmlDocument doc) + private string XmlEscape(Letter letter, XmlDocument doc) { XmlNode node = doc.CreateElement("root"); - node.InnerText = letter.Value; + node.InnerText = invalidCharacterHandler(letter.Value); return node.InnerXml; } @@ -146,7 +180,7 @@ private static string ColorToSvg(IColor color) } var (r, g, b) = color.ToRGBValues(); - return $"rgb({Math.Ceiling(r * 255)},{Math.Ceiling(g * 255)},{Math.Ceiling(b * 255)})"; + return $"rgb({Convert.ToByte(r * 255)},{Convert.ToByte(g * 255)},{Convert.ToByte(b * 255)})"; } private static string PathToSvg(PdfPath p, double height) @@ -214,12 +248,11 @@ private static string PathToSvg(PdfPath p, double height) } string fillColor = " fill='none'"; - string fillRule = ""; + const string fillRule = ""; // For further dev if (p.IsFilled) { fillColor = $" fill='{ColorToSvg(p.FillColor)}'"; - //if (p.FillingRule == FillingRule.EvenOdd) fillRule = " fill-rule='evenodd'"; } var path = $""; diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/TextExporterHelper.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/TextExporterHelper.cs new file mode 100644 index 000000000..2ce0f9147 --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/TextExporterHelper.cs @@ -0,0 +1,71 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export +{ + using System; + using System.Text; + using System.Xml; + + internal static class TextExporterHelper + { + public static Func GetXmlInvalidCharHandler(InvalidCharStrategy invalidCharacterStrategy) + { + switch (invalidCharacterStrategy) + { + case InvalidCharStrategy.DoNotCheck: + return new Func(s => s); + + case InvalidCharStrategy.Remove: + return new Func(s => + { + // https://stackoverflow.com/a/17735649 + if (string.IsNullOrEmpty(s)) + { + return s; + } + + int length = s.Length; + StringBuilder stringBuilder = new StringBuilder(length); + for (int i = 0; i < length; ++i) + { + if (XmlConvert.IsXmlChar(s[i])) + { + stringBuilder.Append(s[i]); + } + } + + return stringBuilder.ToString(); + }); + + case InvalidCharStrategy.ConvertToHexadecimal: + return new Func(s => + { + // Adapted from https://stackoverflow.com/a/17735649 + if (string.IsNullOrEmpty(s)) + { + return s; + } + + int length = s.Length; + StringBuilder stringBuilder = new StringBuilder(length); + for (int i = 0; i < length; ++i) + { + if (XmlConvert.IsXmlChar(s[i])) + { + stringBuilder.Append(s[i]); + } + else + { + byte[] bytes = Encoding.UTF8.GetBytes(s[i].ToString()); + string hexString = BitConverter.ToString(bytes); + stringBuilder.Append("0x").Append(hexString); + } + } + + return stringBuilder.ToString(); + }); + + default: + throw new NotImplementedException("TODO"); + } + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/Integration/AltoXmlTextExporterTests.cs b/src/UglyToad.PdfPig.Tests/Integration/AltoXmlTextExporterTests.cs new file mode 100644 index 000000000..caa92a1dc --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/AltoXmlTextExporterTests.cs @@ -0,0 +1,189 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using System; + using System.IO; + using System.Linq; + using System.Text; + using System.Xml; + using UglyToad.PdfPig.DocumentLayoutAnalysis.Export; + using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; + using UglyToad.PdfPig.Util; + using Xunit; + + public class AltoXmlTextExporterTests + { + [Fact] + public void Issue655NoCheckStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new AltoXmlTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance); + + Assert.Equal(InvalidCharStrategy.DoNotCheck, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.nocheck.altoxml.xml", xml); + + var pageXml = AltoXmlTextExporter.Deserialize("issue655.nocheck.altoxml.xml"); + + var textRegions = pageXml.Layout.Pages[0].PrintSpace.TextBlock; + Assert.Single(textRegions); + + var textLines = textRegions.Single().TextLines; + Assert.Single(textLines); + + var strings = textLines.Single().Strings; + Assert.Equal(2, strings.Length); + + Assert.Equal("TM", strings[0].Content); + Assert.Equal("1\u00062345\u0006678\u0006ABC", strings[1].Content); // no check strategy, contains invalid xml chars + } + } + + [Fact] + public void Issue655RemoveStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new AltoXmlTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, + invalidCharacterStrategy: InvalidCharStrategy.Remove); + + Assert.Equal(InvalidCharStrategy.Remove, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.remove.altoxml.xml", xml); + + var pageXml = AltoXmlTextExporter.Deserialize("issue655.remove.altoxml.xml"); + + var textRegions = pageXml.Layout.Pages[0].PrintSpace.TextBlock; + Assert.Single(textRegions); + + var textLines = textRegions.Single().TextLines; + Assert.Single(textLines); + + var strings = textLines.Single().Strings; + Assert.Equal(2, strings.Length); + + Assert.Equal("TM", strings[0].Content); + Assert.Equal("12345678ABC", strings[1].Content); + } + } + + [Fact] + public void Issue655ConvertToHexadecimalStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new AltoXmlTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, + invalidCharacterStrategy: InvalidCharStrategy.ConvertToHexadecimal); + + Assert.Equal(InvalidCharStrategy.ConvertToHexadecimal, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.hex.altoxml.xml", xml); + + var pageXml = AltoXmlTextExporter.Deserialize("issue655.hex.altoxml.xml"); + + var textRegions = pageXml.Layout.Pages[0].PrintSpace.TextBlock; + Assert.Single(textRegions); + + var textLines = textRegions.Single().TextLines; + Assert.Single(textLines); + + var strings = textLines.Single().Strings; + Assert.Equal(2, strings.Length); + + Assert.Equal("TM", strings[0].Content); + Assert.Equal("10x0623450x066780x06ABC", strings[1].Content); + } + } + + [Fact] + public void Issue655CustomStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new AltoXmlTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, 1.0, "\t", + new Func(s => + { + // Adapted from https://stackoverflow.com/a/17735649 + if (string.IsNullOrEmpty(s)) + { + return s; + } + + int length = s.Length; + StringBuilder stringBuilder = new StringBuilder(length); + for (int i = 0; i < length; ++i) + { + if (XmlConvert.IsXmlChar(s[i])) + { + stringBuilder.Append(s[i]); + } + else + { + stringBuilder.Append("!?"); + } + } + + return stringBuilder.ToString(); + })); + + Assert.Equal(InvalidCharStrategy.Custom, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.custom.altoxml.xml", xml); + + var pageXml = AltoXmlTextExporter.Deserialize("issue655.custom.altoxml.xml"); + + var textRegions = pageXml.Layout.Pages[0].PrintSpace.TextBlock; + Assert.Single(textRegions); + + var textLines = textRegions.Single().TextLines; + Assert.Single(textLines); + + var strings = textLines.Single().Strings; + Assert.Equal(2, strings.Length); + + Assert.Equal("TM", strings[0].Content); + Assert.Equal("1!?2345!?678!?ABC", strings[1].Content); + } + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/hex_0x0006.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/hex_0x0006.pdf new file mode 100644 index 0000000000000000000000000000000000000000..158b7a4155fb3e89c6d4ad114e7a4c80680ef459 GIT binary patch literal 10002 zcmch6cT`l_(yt&{GKffQnjop^L{5@(kR(}YpvlnCG&zGPaR3oPqGZV#L6Qgpiew~b z5Rsgb=m*YtXYQSO@4i3Y`g)yn);@dh`kkuUwX1elvEP$Z073Z#h}ahv`-X@_AaDQ# zfHJou5*G*SxS3xj^wc!jVi>z^w61{8do~d%S^`)l5=Zz^&yBIM=Z;=+>@`ax zgVU>iI$NLuc6tj{yMiOX*_!IZ_mNSKjTR6@kMPcz6gCU~Q<0nroedEk_6xNP$*smh zk1ap9tRIZ!L9LbW&I1mImuju!eWzI?^^jG!ctsrfvs&7sZewJ|~$F~dm9!sk& zMb|}eAf@|sz8Qfo0 zb94A!4XAmP z85&21gUZca#rtPJLPF;70jriwK}=6FOtdOi**mQ6_h=^5Zj_tlVsev3 zaaFL>gJieK*F%T4C!J|E-imnAJ$N)5jE@jw2+S_WCwg7o-Nfgss zwXN()nf`>V&G5VMdhne&s%-{eB1ybSv%3sICC-#gLHTpIV}#5>rwDxUYYp?X`wS$B z3FML3s<$!Mc4fAtwq!Hf)VGa!40vetr!JH?K7QtTX#NZ?X>k^PEW9VE+I4cn0PEvh zx^gUr>s52&bso&#B3_}J?BIF`*_8Ko z4}PBn&fWc)wjYP(yYnA15~e40WzIkR_&|w$$OzO{Rxr5R42ZJiIeq(nnf@##&_l;# zjdb_LUiKIXubb<|^81uISbT%Bwl!q|JqEuAKM>J?oJ=98U&b-q0T~84BDHCnvxyy$ z{;ImE0PZk>F8OZxHr!9XGOu5z!*kj0)Qxhn-YWW9sPh?cSh$u;qLf=3p&z|dl=85| zhw2&isIS(^v`0*Z5eS%#4d)&F-Oyu=&O3@v0V1M{lF`eG-uqnQxslx0vjV%Omu;|r%WE|) z1Frc9+8!b%ng(UXq2zmXBt8StKIV5w=kdXu&Fth$-P*g7M>h!hMxHWgX=cT2Zwc4$PRCR#l6fp!0FH&bt^Gog5*5sgE}e7}}5P$92=hxW1-#%?%G zW&Xo6YYg5C{-qojUY0NQ$y#Zv&Sk`}13n7dG5R(z_k3Ls;!h{_9@J49Nxc_Z9;>}> zqv!>!(hld*^_B+b59%y?3W;X)1)KN}_|T-$u8n-zP8H1CjttbaTq6fCPMfDu4}Dy+ zYxaM`lmDB0pwg47FMvvQE#tmcI(m0*p)BJgypELZ$8#^qw_*#($+drTcU?ubFHFR0`+4UQP+yGu`O(U= zI=K?+RU^z)`n2LhLgeB=)Gg1U@l1;SGv%AsbQudc9*%N!4JpGW#Z~garGlV{zKw;K zV3cNRLFZZ@=O#HB5QpaUVi>vpX)1K7bEV!Z&5W3NjX;2O3y)3tf%%ORU?Qz+_<3O% z)(WeZDg$Gm_n?b-oV$9Oj0WAE9A#eiFZE=ZCh5{_9qi_^suJ3j&K|g@_k$YZ^d63G zd$;XH_tofk7x7Q_7@s`AwNJ868s@iSUMQUAdJ!X}K$P;%#%kE-WKDO&ezvNdGNkFJ7b{do09ZFBw zJlzY0Jd*4Mw&&!p8$@j2eSo+*^YFOQsAr9`;++>m{p6z{BD;?Qt)3n<>T)He04B_d zZ|O^(C8Y_4*6o$KF&+VSfT`Furu+pDwz*SNCe5_o)`#jvN5juiUl;%p z?QVx$Z8>bwxop^)H{5JQg9vUB68D_O;(6H1JPMCzJyFJRe^u_-@%$*^LYYZ9pBT!0 zcr+_>S0Cje% z;T;rumM`8o_7L<9BFGWmarX-b3-+~j6vfj^LP9Cs$PD_C)0*6VJNk>C_NrF3p$#{J$d zGZQPs*0Gxtm_L5m0iwri+dT}zd*=I__($Fk)J_!F*HKHWB#vT-f{8*~lZ6+ICzB;J zYK#vOZPOB)Zsgoj(=TOZZYax}X?<#)pQo8GW%sS|6z08{mFGrrcR;WbT>eGqBW&8% zu$dJ*3%j(6blMd4ta_nD+qR(@YlR+kfFCFH#QyvssfDGrn2(;ihsSfSEo%9R!8*FD}cIkT#tSah~i z%1cLL#3(tnyhKm@4^9U9?N4S z+pHbUEQB=D=d?H}%6xI~_L>FM?Il??bkg^SyOK{o0AjSnz^XG18Q(Jt>r(V&&K6ij z%Udid=p(Mrg#32M;#CjY{B){G0j4?m7?v@;s7rnEk*)rH3#>3{Df1m!)B>5Nj)uQX zfnPUwF<)~xXXEK-iC(JWh1GYs3Oh2mMN5ak9S^|@?$d*Wqw=HDDTZUFCU@yO2JaF@ zIq2gB0QKGH9Xws#C!_KLNw4w>Tw(~q>^xRqXQtHpxYl?&9`>hx=Z#GjFsX|=+W;ND z>B{1iRAr%utKK(`p83u@2O1{?r&F99NA`N1NtWyHVJat|sJ? z-Lg)5*?V|IDu;F{;aVeJN~=@(>Zx>#p^w_*r#xNsn$Q-$vj}zPBslB!T3V(WT=Cv( z&bTF+oKLf(v_$lJ-G;MrlGn2U{E9Z?4~fZY$K@W#*DF$Ne7YUTBIfi$rn54Hlsj8g z#!3HL-SrW(q%4=WD{WDdxyCCi3{GgL9fzF}pB$(3hjb$xfqXM*y(|{L#hKyVKn}$b zGH+{qA_Zru-jGP<^3_g>3q3|OSA8+i3C&&KZgiJpz@l)8cNH=kvb?YCQt*+J3aFj( zv8)Yy0(T>SLQ_EX1#V;F&7U@}zHl5m@-Ll$3kIhnNAesktvK`-H)U(`wqgG0G zzb%xN;voiPXylrR8~TLZ<8aiF0SL8j<|{=wR*i^-CdpFkvbjkmGHkq%nZ2ugME5P~ z>q4Ruf}|*C`}@sWDc%+8Rkm?oD4G)(N(C=qs0+8NiyI=~rHQCfD3^iU8rw{7~2U z5kri7aT!neogV3I)vtq-!y)MYvb%iqz$NYx*UqCuo(inbRhjE_f z)_pgyJuvNM98_R0c;6+$pYlzBW9Jz(YljD$=yey%HTMfMpBuYTXq`o@;!>9~2l^8$ z(=%PM3(+;sc_gYz!1KLMmSLwza*?iL*k!;2KMsMR937`6H5BSvKEA>k1s*!<=#w3mJP!dWsoiRFayX%n! zT@Qy(w3fIXfn5`~)EH`ZQMam#f8G-1&aAWx^@rtm>oECL>fVEIzAx&+f6V@IF*Ru` zn@*rvLn`Qj*;Gs18y#J$zN!}3ntg7M38FsCN2-O<&&3DRjD<3}_u!NYE#~XzL}8pN z#(Ylh_ak*-j#tw7)=VYoH=l;DHn;Ad-CCB4ehp>Ec{fb^<-VZEdiK=rF;%IHlm=Cc z85R6fTh($g2eCqXNwY%r3)HiFrde{M-Z??diPLM`49VB4biNM67sP5pnJ#5j+wM-?D;mm(A`~5^s@l*R;ewp>+_XR_7sw;B6 zS^4ByHwx1z=bjnVpHl2vvAe_zOr~uRo40C&H!&uB1mWS9LhJ-g(fuVWR17j*J1T_? zJEb?34v##TN-S@FNWk4n%XVB}Br94P@?r@KO~wMeYaL4-f5tdJTN@%R z2wr=b$-d>n{SAgLsh&GOLp~RxA!3(!o1#P(optOG!xzzsk<2y4;nPS{ekiZ0N!w1% zE5F&nNMQJi?cyz+1m)VEc~d%HgvHyuQR2dZ`IT}nk(B*B z>%O|BD*}bi30kA$Al&K7=GqW~+od_OvVN_q#jeL5Pnok`=w^i5t$dOWDH_VH;KE0m zNqMMEMa44TryMo3P^6@}YqgUyWhCZoGaN!xEpPTz2UPrSto$}hC#R1H(Zc}sDFdUo zyRO#Q`u${mJfywzt_N|WMR7f*HLEmg(?H7FSAh|e4yB3@$9GsGsJEsSnz^SJW-_t@ zU(gK=gtw<;Z>q7T$B)MGoKOLYlClJe=%XEtBtJ)K>rSa*?ah7=6@<3KFz&0Cx(%R2 z1GbggiLr2n#`#CPC!l1S3$BRp@J~u%S@JWIp$|mZ$Kj6ywvU^ZYj|NsbN4V~ngcipoPyj7r*+RVM4KPwU6&uA+zlul)ba_s$v`IOr!lv!b(26n zzd;P{sVLBvt8JublV97H({^B0RjQBsM)bt9gy6REW0l9A$$papU9QUS(!L}s<)yrt z%74-%Ek0L;|4ymAv&h9P50Tl_d;IuyE6DX@DqR^ACyGUt|FK*#ix0Ug&h$)RHkOE1mfcqMMnAc1D zS2yTa-48vz9TgWOhVr`K+?Flou_IlXwwt?8qV-v0m~}uq(Xv1d^pej#_`WLqjaP=) z_~X}i5{68-`iI#VGOA4JY5ibdC=g)ZlAlh*q*6I-@A@$>ViY{k?`YgO*!hE^)0BLVN2yrg{5U_blC1T*7rW z1W(j2Hl0z#3_C3ew2EOdTd_LDCyR{dqMr(#?R*u?_Ma3O-xn8L${LKA{Q6oxP(m~* zMKzWy3|+%8l>8dG-gs4(JLJD{-HU9kQzmP0Q}Y|BZ}n ztm_>2;kb|*Hn6YGIQ0pG`jPW`+HEG{o z24RW7ApXoNir@8gMp)P*F#vO=nI- zMp?R9AkmW2GBSpS00mnoq&5;^i9`b~lfM9{fcO>dpR;}ne?<$v;`1+j|E`w*-vW6e zod2cQe-r3(;8GySe+YEhD=K@LXTHokU*3{12m}g(h=5>%x)32z7)(?^#7G?cACrDb zbjAABkUGlJ*2?P-dKer85d^`6bYW0Y0f?x8AP)p43W59%@|T%^1Np;1f3YD5f(Tt+ zx=@%XOz;v01QGZX=C5=A6X>TjD6}pLig#APcBD z?9$2+!u%GnpW^;I?Js%%Sh~wWWhYk*!pQ>p3)tM;QUoCYwE{teg)V{lFEPyp5HOH{ zAVPqjUjT-LnG5~`{tN9d;Qu8>T3Y-|i_UInho7#)(gKWhKwc@y_0rIx|Di@p3sEZ+ z+7W?~L^wM;*jga2T*uW;6$k%0^h=5>^<4H^Tx|>#TJops(m}fX-f*y${9Vd_HGU3V z!Tcil4;+jw#sT>|v9tr`7s{0f5_Ld0SxW#tL6%4>gqs5fD2YM3V*UgEC(RZ7?>N7} z{|~DE$?RX;%r8gn_+R~k_@!(Af8EUgKJ)La{N8iv?k~OErMdiy8j^oToy!m=0ra?x zDSr=ZP>7JwpKj{lakaw18EGFj)bcptG(O0;%MT?m#pCgH4 z8{3){OuU7>Y>drN1+Nt*G-EnqJn;@?@IJgb-X9=QCN(B_SobpWt+Wb67;7zEudV<$ z=;mmyPyuSuUpc)c{73>2R$O$_HAhk^ZOit>s@SL(*-dYl3IAa<+JH#i&(DGp6cF%H zjfT?48BQIuxaxd2O7wdINAWTLP%akAqOLW`&-rv*+%|h1j~P(Edx{s2bR+!-l|lbn zYW$bVg#Me#?sYr00m#p=f3z{h%i>fF?FYwle_fDeE}SHLTV5Nxii_~wp1q|ss3cK7 z@nYPNqFfRJ7WblxL0xjr8V5D(~0s6u5O-)j2};)>VB{**LTDFO3N9;CM0d&lW6aR8y z5~gG7J~Qp*cYH2U`#@GHygOWFm|>9e!gN54>_fsVc%jw3@Sk)h_z&e-Wk-ZH68!V2 zj&QXHSRou-FW;{si8Jb|KnuX2-H>2?Tg%JpEffY31S=tJt!*#>C{%zSEQ@kLp>-|~ zR!FcM(%seqsfb2+fn{tluJ@2=S(KwQ%IVTU0ib`Z3z0EcMMhZ>_IuIS^pfVP_ABwQE?sS2xAA zYyT*;z~A;3T!jC>7|MIyZjCkwc@FjW-z44zMbEmF`pN4qNTo2pl6j>qP^eTybHOy} z!i1yeTo8-og5NId6nj1!qU`=@h?h@41M&bnsIEK5r>!{XWh}Oa6vAm4FIc|TlyTQW zaK_OC`M~lCOf&NnOOijsLX%YDLgHZA$fUg88~wdup|b2m#q73pM%NINO;4ezoiZeXkJ5+guC- z_id>K1~{KXRDy3qC0V?#7bc0kz%G>?7JsyKAnBzf_%c?TDkP|nMmvJP7*b=9rF*jx z-k<+EOi?e@op*tB4Oo+HlESOGY~f^ecgR0ySdQ+~E02$HuAUel()D!sa)1H280Mr$u?DZWx%NCIzYeM$76@>J>;Jm%oy% z7C>WFV5|~Py`hxnevQt$+i>va73P${$i6-LP*rEDM5{XA4Iiw|UN@W%{ zE%4a znX)41ox30Zl*rj$i7R}rIMnR+rV=t@UIeFeJNViCKrzBrtWC2{bdJa~#>Y9%*pzus za7utf;m>3MhXH;VSp7Y)4zo@!uYsHY|qtIKi#Puj)we4{KD?Uv!(ax@YP zr4PG#dOOC8z>fNYdLpWWi>0Z0@gc|Wi;qSgp5q~UKHS<Zwy&gu?oazJF6G`T5352ri$siPB^k=(p34uA`Ysc6?B7IjIzPMW>0w&Hf(JbSRQ#ArtU z1~t`(AN9$=1K-9>So>EUhrKZ)#-k|>LA7?9$=9VQALBPGj*-m?=l&0zdh6Cbl6dS& zWe%?2gG&9t_q!3+VqD?b7W^&J=_HH(NHg+HPi7r4GtVo?xkkn1Rf#&Cw%y|WQKo{v zNRkfS&@a8ZVtB+D{NA_3S)K}b*!;Im3UyhG3;u&Xw znL~N$d>l*`_}K%eNBpJB7d7Dg6%h%#5L~kD!gOXR)uh;o79rO2lH#{VpQr<*7IV^P zNDI{roOK|!7A;h(;pCd>rTDG}!MvYueIjl^N8I904dY$G??Pi4b!`{#4iIS<$fR0-u`-TS*Qg^T zEkP#Q>4R%=jj`~WMOVp9v8Cr_h_(6}`}%~o1oN&9!G%Z$IBKeEZ$mx!+(Z;lbuYxr zSi*VWW%a4HY^f9Z^x$q}%4U9*^CZz|>lfsccWIE#_j&25$G=JD_zLHs&PaA$eM3t; z$6m7zuN>)TTWel0G2X`&{a#a1*l;AvwslRPgK0sw ziEO1YwI)3wJFoE})lu(F`I@Q^4*K!l*73`Tc)#x!(r2#9yjeNhi&DN?)c!XO3c7a^ z=8Qm^gDqHX=*aVb@|D8SKZD=z0aY7m1=g|kMqb8mVHjB77z%&^pr&AL6bb_n5xP1c zXkSKuE7ax7747AVoH3^ZSzCh%ItV*%y@D4+wt>UO#pzQT1G_~yt@hY}>6wV&5Q59} zA42LM9+wCz4F5NWJ1RmNEJYnSf4kkU6<43erS*X6LFc=V@dn9bK8gWRp}3(e@@}Y_PX@F z=usAPf586TDw?Er%yx?@ZKEdObj61;b)m1V$otx>Ay&ZO)qD9*5{>ahBdv%;1mRF3 Kc6ND9h5rG2*Nyi8 literal 0 HcmV?d00001 diff --git a/src/UglyToad.PdfPig.Tests/Integration/HOcrTextExporterTests.cs b/src/UglyToad.PdfPig.Tests/Integration/HOcrTextExporterTests.cs new file mode 100644 index 000000000..e0646218d --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/HOcrTextExporterTests.cs @@ -0,0 +1,144 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using System; + using System.IO; + using System.Text; + using System.Xml; + using UglyToad.PdfPig.DocumentLayoutAnalysis.Export; + using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; + using UglyToad.PdfPig.Util; + using Xunit; + + public class HOcrTextExporterTests + { + [Fact] + public void Issue655NoCheckStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new HOcrTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance); + + Assert.Equal(InvalidCharStrategy.DoNotCheck, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page, useHocrjs: true); + + // Save text to an xml file + File.WriteAllText("issue655.nocheck.hocr.html", xml); + + string rawText = File.ReadAllText("issue655.nocheck.hocr.html"); + Assert.Contains("1\u00062345\u0006678\u0006ABC", rawText); + } + } + + [Fact] + public void Issue655RemoveStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new HOcrTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, + invalidCharacterStrategy: InvalidCharStrategy.Remove); + + Assert.Equal(InvalidCharStrategy.Remove, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page, useHocrjs: true); + + // Save text to an xml file + File.WriteAllText("issue655.remove.hocr.html", xml); + + string rawText = File.ReadAllText("issue655.remove.hocr.html"); + Assert.Contains("12345678ABC", rawText); + } + } + + [Fact] + public void Issue655ConvertToHexadecimalStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new HOcrTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, + invalidCharacterStrategy: InvalidCharStrategy.ConvertToHexadecimal); + + Assert.Equal(InvalidCharStrategy.ConvertToHexadecimal, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page, useHocrjs: true); + + // Save text to an xml file + File.WriteAllText("issue655.hex.hocr.html", xml); + + string rawText = File.ReadAllText("issue655.hex.hocr.html"); + Assert.Contains("10x0623450x066780x06ABC", rawText); + } + } + + [Fact] + public void Issue655CustomStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new HOcrTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, 1.0, "\t", + new Func(s => + { + // Adapted from https://stackoverflow.com/a/17735649 + if (string.IsNullOrEmpty(s)) + { + return s; + } + + int length = s.Length; + StringBuilder stringBuilder = new StringBuilder(length); + for (int i = 0; i < length; ++i) + { + if (XmlConvert.IsXmlChar(s[i])) + { + stringBuilder.Append(s[i]); + } + else + { + stringBuilder.Append("!?"); + } + } + + return stringBuilder.ToString(); + })); + + Assert.Equal(InvalidCharStrategy.Custom, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page, useHocrjs: true); + + // Save text to an xml file + File.WriteAllText("issue655.custom.hocr.html", xml); + + string rawText = File.ReadAllText("issue655.custom.hocr.html"); + Assert.Contains("1!?2345!?678!?ABC", rawText); + } + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs b/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs index 1860f24e9..984ae1fb4 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/PageXmlTextExporterTests.cs @@ -1,15 +1,17 @@ namespace UglyToad.PdfPig.Tests.Integration { - using System; - using System.Collections.Generic; - using System.IO; - using System.Text; - using System.Text.RegularExpressions; using DocumentLayoutAnalysis.Export; using DocumentLayoutAnalysis.PageSegmenter; using DocumentLayoutAnalysis.ReadingOrderDetector; using PdfPig.Core; using PdfPig.Util; + using System; + using System.IO; + using System.Linq; + using System.Text; + using System.Text.RegularExpressions; + using System.Xml; + using UglyToad.PdfPig.DocumentLayoutAnalysis.Export.PAGE; using Xunit; public class PageXmlTextExporterTests @@ -26,14 +28,11 @@ private static string GetXml(PageXmlTextExporter pageXmlTextExporter = null) RecursiveXYCut.Instance, UnsupervisedReadingOrderDetector.Instance); - string xml; using (var document = PdfDocument.Open(GetFilename())) { var page = document.GetPage(1); - xml = pageXmlTextExporter.Get(page); + return pageXmlTextExporter.Get(page); } - - return xml; } [Fact] @@ -88,7 +87,173 @@ public void NoPointsAreOnThePageBoundary() Assert.Equal("1,199", PageXmlTextExporter.PointToString(topLeftPagePoint, pageWidth, pageHeight)); Assert.Equal("1,1", PageXmlTextExporter.PointToString(bottomLeftPagePoint, pageWidth, pageHeight)); Assert.Equal("99,1", PageXmlTextExporter.PointToString(bottomRightPagePoint, pageWidth, pageHeight)); - Assert.Equal($"60,140", PageXmlTextExporter.PointToString(normalPoint, pageWidth, pageHeight)); + Assert.Equal("60,140", PageXmlTextExporter.PointToString(normalPoint, pageWidth, pageHeight)); + } + + [Fact] + public void Issue655NoCheckStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new PageXmlTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, + UnsupervisedReadingOrderDetector.Instance); + + Assert.Equal(InvalidCharStrategy.DoNotCheck, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.nocheck.pagexml.xml", xml); + + var pageXml = PageXmlTextExporter.Deserialize("issue655.nocheck.pagexml.xml"); + + var textRegions = pageXml.Page.Items.OfType().ToArray(); + Assert.Single(textRegions); + + var textEquivs = textRegions.Single().TextEquivs; + Assert.Single(textEquivs); + + string unicode = textEquivs.Single().Unicode; + Assert.Equal("TM 1\u00062345\u0006678\u0006ABC", unicode); // no check strategy, contains invalid xml chars + } + } + + [Fact] + public void Issue655RemoveStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new PageXmlTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, + UnsupervisedReadingOrderDetector.Instance, + invalidCharacterStrategy: InvalidCharStrategy.Remove); + + Assert.Equal(InvalidCharStrategy.Remove, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.remove.pagexml.xml", xml); + + var pageXml = PageXmlTextExporter.Deserialize("issue655.remove.pagexml.xml"); + + var textRegions = pageXml.Page.Items.OfType().ToArray(); + Assert.Single(textRegions); + + var textEquivs = textRegions.Single().TextEquivs; + Assert.Single(textEquivs); + + string unicode = textEquivs.Single().Unicode; + Assert.Equal("TM 12345678ABC", unicode); + } + } + + [Fact] + public void Issue655ConvertToHexadecimalStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new PageXmlTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, + UnsupervisedReadingOrderDetector.Instance, + invalidCharacterStrategy: InvalidCharStrategy.ConvertToHexadecimal); + + Assert.Equal(InvalidCharStrategy.ConvertToHexadecimal, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.hex.pagexml.xml", xml); + + var pageXml = PageXmlTextExporter.Deserialize("issue655.hex.pagexml.xml"); + + var textRegions = pageXml.Page.Items.OfType().ToArray(); + Assert.Single(textRegions); + + var textEquivs = textRegions.Single().TextEquivs; + Assert.Single(textEquivs); + + string unicode = textEquivs.Single().Unicode; + Assert.Equal("TM 10x0623450x066780x06ABC", unicode); + } + } + + [Fact] + public void Issue655CustomStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new PageXmlTextExporter( + DefaultWordExtractor.Instance, + RecursiveXYCut.Instance, + UnsupervisedReadingOrderDetector.Instance, 1.0, "\t", + new Func(s => + { + // Adapted from https://stackoverflow.com/a/17735649 + if (string.IsNullOrEmpty(s)) + { + return s; + } + + int length = s.Length; + StringBuilder stringBuilder = new StringBuilder(length); + for (int i = 0; i < length; ++i) + { + if (XmlConvert.IsXmlChar(s[i])) + { + stringBuilder.Append(s[i]); + } + else + { + stringBuilder.Append("!?"); + } + } + + return stringBuilder.ToString(); + })); + + Assert.Equal(InvalidCharStrategy.Custom, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.custom.pagexml.xml", xml); + + var pageXml = PageXmlTextExporter.Deserialize("issue655.custom.pagexml.xml"); + + var textRegions = pageXml.Page.Items.OfType().ToArray(); + Assert.Single(textRegions); + + var textEquivs = textRegions.Single().TextEquivs; + Assert.Single(textEquivs); + + string unicode = textEquivs.Single().Unicode; + Assert.Equal("TM 1!?2345!?678!?ABC", unicode); + } } } } diff --git a/src/UglyToad.PdfPig.Tests/Integration/SvgTextExporterTests.cs b/src/UglyToad.PdfPig.Tests/Integration/SvgTextExporterTests.cs new file mode 100644 index 000000000..5ec50a21e --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/SvgTextExporterTests.cs @@ -0,0 +1,153 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using System; + using System.IO; + using System.Text; + using System.Xml; + using UglyToad.PdfPig.DocumentLayoutAnalysis.Export; + using Xunit; + + public class SvgTextExporterTests + { + [Fact] + public void Doc68_1990_01_A() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf"); + + var pageXmlTextExporter = new SvgTextExporter(); + + Assert.Equal(InvalidCharStrategy.DoNotCheck, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(7); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("68-1990-01_A.7.svg", xml); + } + } + + [Fact] + public void Issue655NoCheckStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new SvgTextExporter(); + + Assert.Equal(InvalidCharStrategy.DoNotCheck, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.nocheck.svg", xml); + + string rawText = File.ReadAllText("issue655.nocheck.svg"); + Assert.Contains("><", rawText); + } + } + + [Fact] + public void Issue655RemoveStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new SvgTextExporter(InvalidCharStrategy.Remove); + + Assert.Equal(InvalidCharStrategy.Remove, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.remove.svg", xml); + + string rawText = File.ReadAllText("issue655.remove.svg"); + Assert.DoesNotContain(">0x06<", rawText); + } + } + + [Fact] + public void Issue655ConvertToHexadecimalStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new SvgTextExporter(InvalidCharStrategy.ConvertToHexadecimal); + + Assert.Equal(InvalidCharStrategy.ConvertToHexadecimal, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.hex.svg", xml); + + string rawText = File.ReadAllText("issue655.hex.svg"); + Assert.Contains(">0x06<", rawText); + } + } + + [Fact] + public void Issue655CustomStrategy() + { + var hex_0x0006 = IntegrationHelpers.GetDocumentPath("hex_0x0006.pdf"); + + var pageXmlTextExporter = new SvgTextExporter( + new Func(s => + { + // Adapted from https://stackoverflow.com/a/17735649 + if (string.IsNullOrEmpty(s)) + { + return s; + } + + int length = s.Length; + StringBuilder stringBuilder = new StringBuilder(length); + for (int i = 0; i < length; ++i) + { + if (XmlConvert.IsXmlChar(s[i])) + { + stringBuilder.Append(s[i]); + } + else + { + stringBuilder.Append("!?"); + } + } + + return stringBuilder.ToString(); + })); + + Assert.Equal(InvalidCharStrategy.Custom, pageXmlTextExporter.InvalidCharStrategy); + + using (var document = PdfDocument.Open(hex_0x0006)) + { + var page = document.GetPage(1); + + // Convert page to text + string xml = pageXmlTextExporter.Get(page); + + // Save text to an xml file + File.WriteAllText("issue655.custom.svg", xml); + + string rawText = File.ReadAllText("issue655.custom.svg"); + Assert.Contains(">!?<", rawText); + } + } + } +}