From 45e21717c83a2f3c0797759df2728732cb229b5e Mon Sep 17 00:00:00 2001
From: BobLd <38405645+BobLd@users.noreply.github.com>
Date: Wed, 28 Jun 2023 19:20:51 +0100
Subject: [PATCH] Handle invalid xml characters in text exporters and fix #655
---
.../Export/AltoXmlTextExporter.cs | 84 +++++--
.../Export/HOcrTextExporter.cs | 54 ++++-
.../Export/InvalidCharStrategy.cs | 28 +++
.../Export/PageXmlTextExporter.cs | 206 ++++++++++++------
.../Export/SvgTextExporter.cs | 71 ++++--
.../Export/TextExporterHelper.cs | 71 ++++++
.../Integration/AltoXmlTextExporterTests.cs | 189 ++++++++++++++++
.../Integration/Documents/hex_0x0006.pdf | Bin 0 -> 10002 bytes
.../Integration/HOcrTextExporterTests.cs | 144 ++++++++++++
.../Integration/PageXmlTextExporterTests.cs | 185 +++++++++++++++-
.../Integration/SvgTextExporterTests.cs | 153 +++++++++++++
11 files changed, 1068 insertions(+), 117 deletions(-)
create mode 100644 src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs
create mode 100644 src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/TextExporterHelper.cs
create mode 100644 src/UglyToad.PdfPig.Tests/Integration/AltoXmlTextExporterTests.cs
create mode 100644 src/UglyToad.PdfPig.Tests/Integration/Documents/hex_0x0006.pdf
create mode 100644 src/UglyToad.PdfPig.Tests/Integration/HOcrTextExporterTests.cs
create mode 100644 src/UglyToad.PdfPig.Tests/Integration/SvgTextExporterTests.cs
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
index acbe58276..c259f8338 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
@@ -16,11 +16,11 @@
/// Alto 4.1 (XML) text exporter.
/// See https://github.com/altoxml/schema
///
- public class AltoXmlTextExporter : ITextExporter
+ public sealed class AltoXmlTextExporter : ITextExporter
{
private readonly IPageSegmenter pageSegmenter;
private readonly IWordExtractor wordExtractor;
-
+ private readonly Func invalidCharacterHandler;
private readonly double scale;
private readonly string indentChar;
@@ -33,6 +33,9 @@ public class AltoXmlTextExporter : ITextExporter
private int stringCount;
private int glyphCount;
+ ///
+ public InvalidCharStrategy InvalidCharStrategy { get; }
+
///
/// Alto 4.1 (XML).
/// See https://github.com/altoxml/schema
@@ -40,13 +43,50 @@ public class AltoXmlTextExporter : ITextExporter
/// Extractor used to identify words in the document.
/// Segmenter used to split page into blocks.
/// Scale multiplier to apply to output document, defaults to 1.
- /// Character to use for indentation, defaults to tab.
- public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1, string indent = "\t")
+ /// Character to use for indentation, defaults to tab.
+ /// How to handle invalid characters.
+ public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ double scale, string indentChar,
+ Func invalidCharacterHandler)
+ : this(wordExtractor, pageSegmenter, scale, indentChar,
+ InvalidCharStrategy.Custom, invalidCharacterHandler)
+ { }
+
+ ///
+ /// Alto 4.1 (XML).
+ /// See https://github.com/altoxml/schema
+ ///
+ /// Extractor used to identify words in the document.
+ /// Segmenter used to split page into blocks.
+ /// Scale multiplier to apply to output document, defaults to 1.
+ /// Character to use for indentation, defaults to tab.
+ /// How to handle invalid characters.
+ public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ double scale = 1, string indentChar = "\t",
+ InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
+ : this(wordExtractor, pageSegmenter, scale, indentChar,
+ invalidCharacterStrategy, null)
+ { }
+
+ private AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ double scale, string indentChar,
+ InvalidCharStrategy invalidCharacterStrategy,
+ Func invalidCharacterHandler)
{
- this.wordExtractor = wordExtractor ?? throw new ArgumentNullException(nameof(wordExtractor));
- this.pageSegmenter = pageSegmenter ?? throw new ArgumentNullException(nameof(pageSegmenter));
+ this.wordExtractor = wordExtractor;
+ this.pageSegmenter = pageSegmenter;
this.scale = scale;
- indentChar = indent ?? string.Empty;
+ this.indentChar = indentChar ?? string.Empty;
+ InvalidCharStrategy = invalidCharacterStrategy;
+
+ if (invalidCharacterHandler is null)
+ {
+ this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
+ }
+ else
+ {
+ this.invalidCharacterHandler = invalidCharacterHandler;
+ }
}
///
@@ -57,10 +97,7 @@ public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegm
public string Get(PdfDocument document, bool includePaths = false)
{
var altoDocument = CreateAltoDocument("unknown");
- var altoPages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray();
-
- altoDocument.Layout.Pages = altoPages;
-
+ altoDocument.Layout.Pages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray();
return Serialize(altoDocument);
}
@@ -128,8 +165,8 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
{
Height = (float)Math.Round(page.Height * scale), // TBD
Width = (float)Math.Round(page.Width * scale), // TBD
- VerticalPosition = 0f, // TBD
- HorizontalPosition = 0f, // TBD
+ VerticalPosition = 0f, // TBD
+ HorizontalPosition = 0f, // TBD
ComposedBlocks = null, // TBD
GraphicalElements = null, // TBD
Illustrations = null, // TBD
@@ -141,9 +178,7 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
};
var words = page.GetWords(wordExtractor);
- var blocks = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
-
- altoPage.PrintSpace.TextBlock = blocks;
+ altoPage.PrintSpace.TextBlock = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray();
altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray();
@@ -222,7 +257,6 @@ private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, dou
{
textLineCount++;
var strings = textLine.Words
- .Where(x => x.Text.All(XmlConvert.IsXmlChar))
.Select(w => ToAltoString(w, height)).ToArray();
return new AltoDocument.AltoTextBlockTextLine
@@ -252,7 +286,7 @@ private AltoDocument.AltoString ToAltoString(Word word, double height)
Width = (float)Math.Round(word.BoundingBox.Width * scale),
Glyph = glyphs,
Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0
- Content = word.Text,
+ Content = invalidCharacterHandler(word.Text),
Language = null,
StyleRefs = null,
SubsContent = null,
@@ -272,7 +306,7 @@ private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, double height)
Height = (float)Math.Round(letter.GlyphRectangle.Height * scale),
Width = (float)Math.Round(letter.GlyphRectangle.Width * scale),
Gc = 1.0f,
- Content = letter.Value,
+ Content = invalidCharacterHandler(letter.Value),
Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000") + "_G" + glyphCount.ToString("#00")
};
}
@@ -314,8 +348,8 @@ private AltoDocument.AltoDescription GetAltoDescription(string fileName)
Processings = new[] { processing },
SourceImageInformation = new AltoDocument.AltoSourceImageInformation
{
- DocumentIdentifiers = new [] { documentIdentifier },
- FileIdentifiers = new [] { fileIdentifier },
+ DocumentIdentifiers = new[] { documentIdentifier },
+ FileIdentifiers = new[] { fileIdentifier },
FileName = fileName
}
};
@@ -329,6 +363,7 @@ private string Serialize(AltoDocument altoDocument)
Encoding = System.Text.Encoding.UTF8,
Indent = true,
IndentChars = indentChar,
+ CheckCharacters = InvalidCharStrategy != InvalidCharStrategy.DoNotCheck,
};
using (var memoryStream = new System.IO.MemoryStream())
@@ -346,7 +381,12 @@ public static AltoDocument Deserialize(string xmlPath)
{
var serializer = new XmlSerializer(typeof(AltoDocument));
- using (var reader = XmlReader.Create(xmlPath))
+ var settings = new XmlReaderSettings()
+ {
+ CheckCharacters = false
+ };
+
+ using (var reader = XmlReader.Create(xmlPath, settings))
{
return (AltoDocument)serializer.Deserialize(reader);
}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs
index 40cc53dc0..89be28588 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/HOcrTextExporter.cs
@@ -13,14 +13,14 @@
/// hOCR v1.2 (HTML) text exporter.
/// See http://kba.cloud/hocr-spec/1.2/
///
- public class HOcrTextExporter : ITextExporter
+ public sealed class HOcrTextExporter : ITextExporter
{
private const string XmlHeader = "\n\n";
private const string Hocrjs = "\n";
private readonly IPageSegmenter pageSegmenter;
private readonly IWordExtractor wordExtractor;
-
+ private readonly Func invalidCharacterHandler;
private readonly double scale;
private readonly string indentChar;
@@ -32,16 +32,60 @@ public class HOcrTextExporter : ITextExporter
private int paraCount;
private int imageCount;
+ ///
+ public InvalidCharStrategy InvalidCharStrategy { get; }
+
+ ///
+ /// hOCR v1.2 (HTML)
+ /// See http://kba.cloud/hocr-spec/1.2/
+ ///
+ /// Extractor used to identify words in the document.
+ /// Segmenter used to split page into blocks.
+ /// Scale multiplier to apply to output document, defaults to 1.
+ /// Character to use for indentation, defaults to tab.
+ /// How to handle invalid characters.
+ public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ double scale, string indentChar,
+ Func invalidCharacterHandler)
+ : this(wordExtractor, pageSegmenter, scale, indentChar,
+ InvalidCharStrategy.Custom, invalidCharacterHandler)
+ { }
+
///
/// hOCR v1.2 (HTML)
/// See http://kba.cloud/hocr-spec/1.2/
///
- public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
+ /// Extractor used to identify words in the document.
+ /// Segmenter used to split page into blocks.
+ /// Scale multiplier to apply to output document, defaults to 1.
+ /// Character to use for indentation, defaults to tab.
+ /// How to handle invalid characters.
+ public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ double scale = 1, string indentChar = "\t",
+ InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
+ : this(wordExtractor, pageSegmenter, scale, indentChar,
+ invalidCharacterStrategy, null)
+ { }
+
+ private HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ double scale, string indentChar,
+ InvalidCharStrategy invalidCharacterStrategy,
+ Func invalidCharacterHandler)
{
this.wordExtractor = wordExtractor;
this.pageSegmenter = pageSegmenter;
this.scale = scale;
- indentChar = indent;
+ this.indentChar = indentChar ?? string.Empty;
+ InvalidCharStrategy = invalidCharacterStrategy;
+
+ if (invalidCharacterHandler is null)
+ {
+ this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
+ }
+ else
+ {
+ this.invalidCharacterHandler = invalidCharacterHandler;
+ }
}
///
@@ -325,7 +369,7 @@ private string GetCode(Word word, double pageHeight, int level)
}
hocr += "'";
- hocr += ">" + word.Text + " ";
+ hocr += ">" + invalidCharacterHandler(word.Text) + " ";
return hocr;
}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs
new file mode 100644
index 000000000..a7db4c7de
--- /dev/null
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/InvalidCharStrategy.cs
@@ -0,0 +1,28 @@
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
+{
+ ///
+ /// How to handle invalid characters.
+ ///
+ public enum InvalidCharStrategy : byte
+ {
+ ///
+ /// Custom strategy.
+ ///
+ Custom = 0,
+
+ ///
+ /// Do not check for invalid characters.
+ ///
+ DoNotCheck = 1,
+
+ ///
+ /// Remove invalid characters.
+ ///
+ Remove = 2,
+
+ ///
+ /// Convert invalid characters to their hexadecimal representation.
+ ///
+ ConvertToHexadecimal = 3
+ }
+}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
index 64a18810f..47337bba3 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/PageXmlTextExporter.cs
@@ -3,38 +3,51 @@
using Content;
using Core;
using DocumentLayoutAnalysis;
+ using Graphics;
using Graphics.Colors;
using PAGE;
+ using PageSegmenter;
+ using ReadingOrderDetector;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using System.Xml.Serialization;
- using PageSegmenter;
- using ReadingOrderDetector;
- using Graphics;
using Util;
///
/// PAGE-XML 2019-07-15 (XML) text exporter.
/// See https://github.com/PRImA-Research-Lab/PAGE-XML
///
- public class PageXmlTextExporter : ITextExporter
+ public sealed class PageXmlTextExporter : ITextExporter
{
private readonly IPageSegmenter pageSegmenter;
private readonly IWordExtractor wordExtractor;
private readonly IReadingOrderDetector readingOrderDetector;
-
+ private readonly Func invalidCharacterHandler;
private readonly double scale;
private readonly string indentChar;
- private int lineCount;
- private int wordCount;
- private int glyphCount;
- private int regionCount;
- private int groupOrderCount;
+ ///
+ public InvalidCharStrategy InvalidCharStrategy { get; }
- private List orderedRegions;
+ ///
+ /// PAGE-XML 2019-07-15 (XML) text exporter.
+ /// See https://github.com/PRImA-Research-Lab/PAGE-XML
+ ///
+ ///
+ ///
+ ///
+ ///
+ /// Indent character.
+ /// How to handle invalid characters.
+ public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ IReadingOrderDetector readingOrderDetector,
+ double scale, string indentChar,
+ Func invalidCharacterHandler)
+ : this(wordExtractor, pageSegmenter, readingOrderDetector, scale, indentChar,
+ InvalidCharStrategy.Custom, invalidCharacterHandler)
+ { }
///
/// PAGE-XML 2019-07-15 (XML) text exporter.
@@ -42,20 +55,44 @@ public class PageXmlTextExporter : ITextExporter
///
///
///
- ///
+ ///
///
/// Indent character.
- public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, IReadingOrderDetector readingOrderDetector = null, double scale = 1.0, string indent = "\t")
+ /// How to handle invalid characters.
+ public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ IReadingOrderDetector readingOrderDetector = null,
+ double scale = 1.0, string indent = "\t",
+ InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
+ : this(wordExtractor, pageSegmenter, readingOrderDetector, scale, indent,
+ invalidCharacterStrategy, null)
+ { }
+
+ private PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
+ IReadingOrderDetector readingOrderDetector,
+ double scale, string indentChar,
+ InvalidCharStrategy invalidCharacterStrategy,
+ Func invalidCharacterHandler)
{
this.wordExtractor = wordExtractor;
this.pageSegmenter = pageSegmenter;
this.readingOrderDetector = readingOrderDetector;
this.scale = scale;
- indentChar = indent;
+ this.indentChar = indentChar ?? string.Empty;
+ InvalidCharStrategy = invalidCharacterStrategy;
+
+ if (invalidCharacterHandler is null)
+ {
+ this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
+ }
+ else
+ {
+ this.invalidCharacterHandler = invalidCharacterHandler;
+ }
}
///
/// Get the PAGE-XML (XML) string of the pages layout.
+ /// Not implemented, use Get(Page) instead.
///
///
/// Draw PdfPaths present in the page.
@@ -80,26 +117,23 @@ public string Get(Page page)
/// Draw PdfPaths present in the page.
public string Get(Page page, bool includePaths)
{
- lineCount = 0;
- wordCount = 0;
- glyphCount = 0;
- regionCount = 0;
- groupOrderCount = 0;
- orderedRegions = new List();
+ PageXmlData data = new PageXmlData();
+
+ DateTime utcNow = DateTime.UtcNow;
PageXmlDocument pageXmlDocument = new PageXmlDocument()
{
Metadata = new PageXmlDocument.PageXmlMetadata()
{
- Created = DateTime.UtcNow,
- LastChange = DateTime.UtcNow,
+ Created = utcNow,
+ LastChange = utcNow,
Creator = "PdfPig",
Comments = pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name,
},
PcGtsId = "pc-" + page.GetHashCode()
};
- pageXmlDocument.Page = ToPageXmlPage(page, includePaths);
+ pageXmlDocument.Page = ToPageXmlPage(page, data, includePaths);
return Serialize(pageXmlDocument);
}
@@ -151,17 +185,17 @@ private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double
///
private string ToRgbEncoded(IColor color)
{
- var rgb = color.ToRGBValues();
- int red = (int)Math.Round(255f * (float)rgb.r);
- int green = 256 * (int)Math.Round(255f * (float)rgb.g);
- int blue = 65536 * (int)Math.Round(255f * (float)rgb.b);
+ var (r, g, b) = color.ToRGBValues();
+ int red = Convert.ToByte(255.0 * r);
+ int green = 256 * Convert.ToByte(255.0 * g);
+ int blue = 65536 * Convert.ToByte(255.0 * b);
int sum = red + green + blue;
// as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum);
return sum.ToString();
}
- private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
+ private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, PageXmlData data, bool includePaths)
{
var pageXmlPage = new PageXmlDocument.PageXmlPage
{
@@ -182,16 +216,17 @@ private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
blocks = readingOrderDetector.Get(blocks).ToList();
}
- regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));
+ regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, data, page.Width, page.Height)));
- if (orderedRegions.Count > 0)
+ if (data.OrderedRegions.Count > 0)
{
+ data.GroupOrdersCount++;
pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
{
Item = new PageXmlDocument.PageXmlOrderedGroup()
{
- Items = orderedRegions.ToArray(),
- Id = "g" + groupOrderCount++
+ Items = data.OrderedRegions.ToArray(),
+ Id = "g" + data.GroupOrdersCount
}
};
}
@@ -200,14 +235,14 @@ private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
var images = page.GetImages().ToList();
if (images.Count > 0)
{
- regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Width, page.Height)));
+ regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, data, page.Width, page.Height)));
}
if (includePaths)
{
foreach (var path in page.ExperimentalAccess.Paths)
{
- var graphicalElement = ToPageXmlLineDrawingRegion(path, page.Width, page.Height);
+ var graphicalElement = ToPageXmlLineDrawingRegion(path, data, page.Width, page.Height);
if (graphicalElement != null)
{
@@ -220,40 +255,40 @@ private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
return pageXmlPage;
}
- private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfPath pdfPath, double pageWidth, double pageHeight)
+ private PageXmlDocument.PageXmlLineDrawingRegion ToPageXmlLineDrawingRegion(PdfPath pdfPath, PageXmlData data, double pageWidth, double pageHeight)
{
var bbox = pdfPath.GetBoundingRectangle();
if (bbox.HasValue)
{
- regionCount++;
+ data.RegionsCount++;
return new PageXmlDocument.PageXmlLineDrawingRegion()
{
Coords = ToCoords(bbox.Value, pageWidth, pageHeight),
- Id = "r" + regionCount
+ Id = "r" + data.RegionsCount
};
}
return null;
}
- private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, double pageWidth, double pageHeight)
+ private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, PageXmlData data, double pageWidth, double pageHeight)
{
- regionCount++;
+ data.RegionsCount++;
var bbox = pdfImage.Bounds;
return new PageXmlDocument.PageXmlImageRegion()
{
Coords = ToCoords(bbox, pageWidth, pageHeight),
- Id = "r" + regionCount
+ Id = "r" + data.RegionsCount
};
}
- private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double pageWidth, double pageHeight)
+ private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, PageXmlData data, double pageWidth, double pageHeight)
{
- regionCount++;
- string regionId = "r" + regionCount;
+ data.RegionsCount++;
+ string regionId = "r" + data.RegionsCount;
if (readingOrderDetector != null && textBlock.ReadingOrder > -1)
{
- orderedRegions.Add(new PageXmlDocument.PageXmlRegionRefIndexed()
+ data.OrderedRegions.Add(new PageXmlDocument.PageXmlRegionRefIndexed()
{
RegionRef = regionId,
Index = textBlock.ReadingOrder
@@ -264,40 +299,58 @@ private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBloc
{
Coords = ToCoords(textBlock.BoundingBox, pageWidth, pageHeight),
Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph,
- TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, pageWidth, pageHeight)).ToArray(),
- TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } },
+ TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, data, pageWidth, pageHeight)).ToArray(),
+ TextEquivs = new[]
+ {
+ new PageXmlDocument.PageXmlTextEquiv()
+ {
+ Unicode = invalidCharacterHandler(textBlock.Text)
+ }
+ },
Id = regionId
};
}
- private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double pageWidth, double pageHeight)
+ private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, PageXmlData data, double pageWidth, double pageHeight)
{
- lineCount++;
+ data.LinesCount++;
return new PageXmlDocument.PageXmlTextLine()
{
Coords = ToCoords(textLine.BoundingBox, pageWidth, pageHeight),
Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
- Words = textLine.Words.Select(w => ToPageXmlWord(w, pageWidth, pageHeight)).ToArray(),
- TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } },
- Id = "l" + lineCount
+ Words = textLine.Words.Select(w => ToPageXmlWord(w, data, pageWidth, pageHeight)).ToArray(),
+ TextEquivs = new[]
+ {
+ new PageXmlDocument.PageXmlTextEquiv()
+ {
+ Unicode = invalidCharacterHandler(textLine.Text)
+ }
+ },
+ Id = "l" + data.LinesCount
};
}
- private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double pageWidth, double pageHeight)
+ private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, PageXmlData data, double pageWidth, double pageHeight)
{
- wordCount++;
+ data.WordsCount++;
return new PageXmlDocument.PageXmlWord()
{
Coords = ToCoords(word.BoundingBox, pageWidth, pageHeight),
- Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, pageWidth, pageHeight)).ToArray(),
- TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = word.Text } },
- Id = "w" + wordCount
+ Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, data, pageWidth, pageHeight)).ToArray(),
+ TextEquivs = new[]
+ {
+ new PageXmlDocument.PageXmlTextEquiv()
+ {
+ Unicode = invalidCharacterHandler(word.Text)
+ }
+ },
+ Id = "w" + data.WordsCount
};
}
- private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWidth, double pageHeight)
+ private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, PageXmlData data, double pageWidth, double pageHeight)
{
- glyphCount++;
+ data.GlyphsCount++;
return new PageXmlDocument.PageXmlGlyph()
{
Coords = ToCoords(letter.GlyphRectangle, pageWidth, pageHeight),
@@ -309,8 +362,14 @@ private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWi
FontFamily = letter.FontName,
TextColourRgb = ToRgbEncoded(letter.Color),
},
- TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = letter.Value } },
- Id = "c" + glyphCount
+ TextEquivs = new[]
+ {
+ new PageXmlDocument.PageXmlTextEquiv()
+ {
+ Unicode = invalidCharacterHandler(letter.Value)
+ }
+ },
+ Id = "c" + data.GlyphsCount
};
}
@@ -322,6 +381,7 @@ private string Serialize(PageXmlDocument pageXmlDocument)
Encoding = System.Text.Encoding.UTF8,
Indent = true,
IndentChars = indentChar,
+ CheckCharacters = InvalidCharStrategy != InvalidCharStrategy.DoNotCheck,
};
using (var memoryStream = new System.IO.MemoryStream())
@@ -339,10 +399,34 @@ public static PageXmlDocument Deserialize(string xmlPath)
{
XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
- using (var reader = XmlReader.Create(xmlPath))
+ var settings = new XmlReaderSettings()
+ {
+ CheckCharacters = false
+ };
+
+ using (var reader = XmlReader.Create(xmlPath, settings))
{
return (PageXmlDocument)serializer.Deserialize(reader);
}
}
+
+ ///
+ /// Keeps track of the element counters and ordered regions while exporting a single page.
+ ///
+ private sealed class PageXmlData
+ {
+ public PageXmlData()
+ {
+ OrderedRegions = new List();
+ }
+
+ public int LinesCount { get; set; }
+ public int WordsCount { get; set; }
+ public int GlyphsCount { get; set; }
+ public int RegionsCount { get; set; }
+ public int GroupOrdersCount { get; set; }
+
+ public List OrderedRegions { get; }
+ }
}
}
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs
index 46151923e..4137ac975 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs
@@ -13,38 +13,72 @@
///
/// Exports a page as an SVG.
///
- public class SvgTextExporter : ITextExporter
+ public sealed class SvgTextExporter : ITextExporter
{
- private const int Rounding = 4;
+ private readonly Func invalidCharacterHandler;
private static readonly Dictionary Fonts = new Dictionary()
{
{ "ArialMT", "Arial Rounded MT Bold" }
};
+ ///
+ /// Used to round numbers.
+ ///
+ public int Rounding { get; } = 4;
+
+ ///
+ ///
+ /// Not in use.
+ ///
+ public InvalidCharStrategy InvalidCharStrategy { get; }
+
+ ///
+ /// Svg text exporter.
+ ///
+ /// How to handle invalid characters.
+ public SvgTextExporter(Func invalidCharacterHandler)
+ : this(InvalidCharStrategy.Custom, invalidCharacterHandler)
+ { }
+
+ ///
+ /// Svg text exporter.
+ ///
+ /// How to handle invalid characters.
+ public SvgTextExporter(InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
+ : this(invalidCharacterStrategy, null)
+ { }
+
+ private SvgTextExporter(InvalidCharStrategy invalidCharacterStrategy, Func invalidCharacterHandler)
+ {
+ InvalidCharStrategy = invalidCharacterStrategy;
+
+ if (invalidCharacterHandler is null)
+ {
+ this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
+ }
+ else
+ {
+ this.invalidCharacterHandler = invalidCharacterHandler;
+ }
+ }
///
/// Get the page contents as an SVG.
///
public string Get(Page page)
{
- var builder = new StringBuilder($"