Skip to content

Commit

Permalink
Add error handling for label extraction from corrupted PDF pages
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtcode committed Jul 18, 2024
1 parent 6cdf452 commit b658eb6
Showing 1 changed file with 21 additions and 16 deletions.
37 changes: 21 additions & 16 deletions src/core/module/page-label.js
Original file line number Diff line number Diff line change
Expand Up @@ -357,25 +357,30 @@ function validateExtractedPageLabels(labels, processedPages) {
}

/**
 * Build the page labels for a PDF document.
 *
 * Attempts to extract a label for each of the first `MAX_PAGES` page indexes,
 * validates the extracted set, and then hands the extracted labels, the
 * catalog's "pageLabels" entry and the page count to `predictPageLabels`.
 *
 * The whole procedure is best-effort: if any step throws (e.g. while reading
 * a corrupted page), the error is logged and an empty array is returned
 * instead of rejecting.
 *
 * @param pdfDocument - Document object; `catalog.numPages` and
 *   `pdfManager.ensureCatalog()` are read from it.
 * @param structuredCharsProvider - Passed through unchanged to `getPageLabel`.
 * @returns {Promise<*>} The value produced by `predictPageLabels`, or `[]`
 *   when an error was caught.
 */
export async function getPageLabels(pdfDocument, structuredCharsProvider) {
  // Fallback result, returned unchanged if anything below throws.
  let pageLabels = [];
  try {
    // Max pages to process
    const MAX_PAGES = 25;
    let { numPages } = pdfDocument.catalog;

    let extractedPageLabels = [];
    let i = 0;
    // NOTE(review): the loop is not bounded by numPages; presumably
    // getPageLabel yields a falsy value for out-of-range indexes — confirm.
    for (; i < MAX_PAGES; i++) {
      let pageLabel = await getPageLabel(pdfDocument, structuredCharsProvider, i);
      if (pageLabel) {
        extractedPageLabels.push(pageLabel);
      }
    }

    // `i` is the number of page indexes that were attempted.
    if (!validateExtractedPageLabels(extractedPageLabels, i)) {
      extractedPageLabels = null;
    }

    let catalogPageLabels = await pdfDocument.pdfManager.ensureCatalog("pageLabels");

    pageLabels = predictPageLabels(extractedPageLabels, catalogPageLabels, numPages);
  } catch (e) {
    // Deliberately swallow the error after logging it so that a failure in
    // label extraction does not propagate to the caller.
    console.log(e);
  }
  return pageLabels;
}

0 comments on commit b658eb6

Please sign in to comment.