Skip to content

Commit

Permalink
Handle invalid XML characters in text exporters and fix #655
Browse files Browse the repository at this point in the history
  • Loading branch information
BobLd committed Aug 5, 2023
1 parent 9aaf20c commit 45e2171
Show file tree
Hide file tree
Showing 11 changed files with 1,068 additions and 117 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
/// Alto 4.1 (XML) text exporter.
/// <para>See https://github.com/altoxml/schema </para>
/// </summary>
public class AltoXmlTextExporter : ITextExporter
public sealed class AltoXmlTextExporter : ITextExporter
{
private readonly IPageSegmenter pageSegmenter;
private readonly IWordExtractor wordExtractor;

private readonly Func<string, string> invalidCharacterHandler;
private readonly double scale;
private readonly string indentChar;

Expand All @@ -33,20 +33,60 @@ public class AltoXmlTextExporter : ITextExporter
private int stringCount;
private int glyphCount;

/// <inheritdoc/>
public InvalidCharStrategy InvalidCharStrategy { get; }

/// <summary>
/// Alto 4.1 (XML).
/// <para>See https://github.com/altoxml/schema </para>
/// </summary>
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
/// <param name="indent">Character to use for indentation, defaults to tab.</param>
public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1, string indent = "\t")
/// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
/// <param name="invalidCharacterHandler">How to handle invalid characters.</param>
public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale, string indentChar,
Func<string, string> invalidCharacterHandler)
: this(wordExtractor, pageSegmenter, scale, indentChar,
InvalidCharStrategy.Custom, invalidCharacterHandler)
{ }

/// <summary>
/// Alto 4.1 (XML).
/// <para>See https://github.com/altoxml/schema </para>
/// </summary>
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
/// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
/// <param name="invalidCharacterStrategy">How to handle invalid characters.</param>
public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale = 1, string indentChar = "\t",
InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
: this(wordExtractor, pageSegmenter, scale, indentChar,
invalidCharacterStrategy, null)
{ }

private AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale, string indentChar,
InvalidCharStrategy invalidCharacterStrategy,
Func<string, string> invalidCharacterHandler)
{
this.wordExtractor = wordExtractor ?? throw new ArgumentNullException(nameof(wordExtractor));
this.pageSegmenter = pageSegmenter ?? throw new ArgumentNullException(nameof(pageSegmenter));
this.wordExtractor = wordExtractor;
this.pageSegmenter = pageSegmenter;
this.scale = scale;
indentChar = indent ?? string.Empty;
this.indentChar = indentChar ?? string.Empty;
InvalidCharStrategy = invalidCharacterStrategy;

if (invalidCharacterHandler is null)
{
this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
}
else
{
this.invalidCharacterHandler = invalidCharacterHandler;
}
}

/// <summary>
Expand All @@ -57,10 +97,7 @@ public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegm
public string Get(PdfDocument document, bool includePaths = false)
{
var altoDocument = CreateAltoDocument("unknown");
var altoPages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray();

altoDocument.Layout.Pages = altoPages;

altoDocument.Layout.Pages = document.GetPages().Select(x => ToAltoPage(x, includePaths)).ToArray();
return Serialize(altoDocument);
}

Expand Down Expand Up @@ -128,8 +165,8 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
{
Height = (float)Math.Round(page.Height * scale), // TBD
Width = (float)Math.Round(page.Width * scale), // TBD
VerticalPosition = 0f, // TBD
HorizontalPosition = 0f, // TBD
VerticalPosition = 0f, // TBD
HorizontalPosition = 0f, // TBD
ComposedBlocks = null, // TBD
GraphicalElements = null, // TBD
Illustrations = null, // TBD
Expand All @@ -141,9 +178,7 @@ private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
};

var words = page.GetWords(wordExtractor);
var blocks = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray();

altoPage.PrintSpace.TextBlock = blocks;
altoPage.PrintSpace.TextBlock = pageSegmenter.GetBlocks(words).Select(b => ToAltoTextBlock(b, page.Height)).ToArray();

altoPage.PrintSpace.Illustrations = page.GetImages().Select(i => ToAltoIllustration(i, page.Height)).ToArray();

Expand Down Expand Up @@ -222,7 +257,6 @@ private AltoDocument.AltoTextBlockTextLine ToAltoTextLine(TextLine textLine, dou
{
textLineCount++;
var strings = textLine.Words
.Where(x => x.Text.All(XmlConvert.IsXmlChar))
.Select(w => ToAltoString(w, height)).ToArray();

return new AltoDocument.AltoTextBlockTextLine
Expand Down Expand Up @@ -252,7 +286,7 @@ private AltoDocument.AltoString ToAltoString(Word word, double height)
Width = (float)Math.Round(word.BoundingBox.Width * scale),
Glyph = glyphs,
Cc = string.Join("", glyphs.Select(g => 9f * (1f - g.Gc))), // from 0->1 to 9->0
Content = word.Text,
Content = invalidCharacterHandler(word.Text),
Language = null,
StyleRefs = null,
SubsContent = null,
Expand All @@ -272,7 +306,7 @@ private AltoDocument.AltoGlyph ToAltoGlyph(Letter letter, double height)
Height = (float)Math.Round(letter.GlyphRectangle.Height * scale),
Width = (float)Math.Round(letter.GlyphRectangle.Width * scale),
Gc = 1.0f,
Content = letter.Value,
Content = invalidCharacterHandler(letter.Value),
Id = "P" + pageCount + "_ST" + stringCount.ToString("#00000") + "_G" + glyphCount.ToString("#00")
};
}
Expand Down Expand Up @@ -314,8 +348,8 @@ private AltoDocument.AltoDescription GetAltoDescription(string fileName)
Processings = new[] { processing },
SourceImageInformation = new AltoDocument.AltoSourceImageInformation
{
DocumentIdentifiers = new [] { documentIdentifier },
FileIdentifiers = new [] { fileIdentifier },
DocumentIdentifiers = new[] { documentIdentifier },
FileIdentifiers = new[] { fileIdentifier },
FileName = fileName
}
};
Expand All @@ -329,6 +363,7 @@ private string Serialize(AltoDocument altoDocument)
Encoding = System.Text.Encoding.UTF8,
Indent = true,
IndentChars = indentChar,
CheckCharacters = InvalidCharStrategy != InvalidCharStrategy.DoNotCheck,
};

using (var memoryStream = new System.IO.MemoryStream())
Expand All @@ -346,7 +381,12 @@ public static AltoDocument Deserialize(string xmlPath)
{
var serializer = new XmlSerializer(typeof(AltoDocument));

using (var reader = XmlReader.Create(xmlPath))
var settings = new XmlReaderSettings()
{
CheckCharacters = false
};

using (var reader = XmlReader.Create(xmlPath, settings))
{
return (AltoDocument)serializer.Deserialize(reader);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
/// hOCR v1.2 (HTML) text exporter.
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
public class HOcrTextExporter : ITextExporter
public sealed class HOcrTextExporter : ITextExporter
{
private const string XmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
private const string Hocrjs = "<script src='https://unpkg.com/hocrjs'></script>\n";

private readonly IPageSegmenter pageSegmenter;
private readonly IWordExtractor wordExtractor;

private readonly Func<string, string> invalidCharacterHandler;
private readonly double scale;
private readonly string indentChar;

Expand All @@ -32,16 +32,60 @@ public class HOcrTextExporter : ITextExporter
private int paraCount;
private int imageCount;

/// <inheritdoc/>
public InvalidCharStrategy InvalidCharStrategy { get; }

/// <summary>
/// hOCR v1.2 (HTML)
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
/// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
/// <param name="invalidCharacterHandler">How to handle invalid characters.</param>
public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale, string indentChar,
Func<string, string> invalidCharacterHandler)
: this(wordExtractor, pageSegmenter, scale, indentChar,
InvalidCharStrategy.Custom, invalidCharacterHandler)
{ }

/// <summary>
/// hOCR v1.2 (HTML)
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
/// <param name="indentChar">Character to use for indentation, defaults to tab.</param>
/// <param name="invalidCharacterStrategy">How to handle invalid characters.</param>
public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale = 1, string indentChar = "\t",
InvalidCharStrategy invalidCharacterStrategy = InvalidCharStrategy.DoNotCheck)
: this(wordExtractor, pageSegmenter, scale, indentChar,
invalidCharacterStrategy, null)
{ }

private HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
double scale, string indentChar,
InvalidCharStrategy invalidCharacterStrategy,
Func<string, string> invalidCharacterHandler)
{
this.wordExtractor = wordExtractor;
this.pageSegmenter = pageSegmenter;
this.scale = scale;
indentChar = indent;
this.indentChar = indentChar ?? string.Empty;
InvalidCharStrategy = invalidCharacterStrategy;

if (invalidCharacterHandler is null)
{
this.invalidCharacterHandler = TextExporterHelper.GetXmlInvalidCharHandler(InvalidCharStrategy);
}
else
{
this.invalidCharacterHandler = invalidCharacterHandler;
}
}

/// <summary>
Expand Down Expand Up @@ -325,7 +369,7 @@ private string GetCode(Word word, double pageHeight, int level)
}
hocr += "'";

hocr += ">" + word.Text + "</span> ";
hocr += ">" + invalidCharacterHandler(word.Text) + "</span> ";
return hocr;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
{
/// <summary>
/// Selects how a text exporter handles characters that are invalid in its output
/// (for example, characters that are not valid XML characters).
/// </summary>
/// <remarks>
/// The underlying type is <see cref="byte"/> and every member has an explicit value.
/// </remarks>
public enum InvalidCharStrategy : byte
{
/// <summary>
/// Custom strategy.
/// </summary>
/// <remarks>
/// NOTE(review): presumably used when the caller supplies its own invalid-character
/// handler instead of one of the built-in strategies; confirm against the exporters.
/// </remarks>
Custom = 0,

/// <summary>
/// Do not check for invalid characters.
/// </summary>
DoNotCheck = 1,

/// <summary>
/// Remove invalid characters.
/// </summary>
Remove = 2,

/// <summary>
/// Convert each invalid character to its hexadecimal representation.
/// </summary>
/// <remarks>
/// NOTE(review): the exact textual format of the hexadecimal output is decided by the
/// handler implementation, which is not defined in this file; confirm before relying on it.
/// </remarks>
ConvertToHexadecimal = 3
}
}
Loading

0 comments on commit 45e2171

Please sign in to comment.