Skip to content

Commit

Permalink
Merge pull request #20 from SamEdwardes/feature/page-range
Browse files Browse the repository at this point in the history
Add support for page_range in pdf_reader
  • Loading branch information
SamEdwardes authored Oct 17, 2023
2 parents 168223d + b23866f commit 802ec31
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 7 deletions.
13 changes: 11 additions & 2 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,20 @@

**Changes**

Nonepo
- Support for `page_range` argument ([#16](https://github.com/SamEdwardes/spacypdfreader/issues/16), [#18](https://github.com/SamEdwardes/spacypdfreader/issues/18)).

```python
import spacy
from spacypdfreader import pdf_reader
from spacypdfreader.parsers import pytesseract

nlp = spacy.load("en_core_web_sm")
doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, pytesseract.parser, n_processes=4, page_range=(2, 3))
```

**Fixes**

- Remove `shed` as a depdency. It was removing unused imports that were required ([#17](https://github.com/SamEdwardes/spacypdfreader/issues/17)).
- Remove `shed` as a dependency. It was removing unused imports that were required ([#17](https://github.com/SamEdwardes/spacypdfreader/issues/17)).

## 0.3.0 (2023-05-17)

Expand Down
9 changes: 9 additions & 0 deletions spacypdfreader/parsers/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,15 @@ def parser(pdf_path: str, page_number: int, **kwargs):
details on the implementation of pdfminer. For more details on pdfminer
refer to the [pdfminer docs](https://pdfminersix.readthedocs.io/en/latest/).
"""
# Check whether the user has provided the `page_numbers` kwarg. It is not a
# valid argument here, so fail fast with an actionable message.
# See: https://github.com/SamEdwardes/spacypdfreader/issues/16
if "page_numbers" in kwargs:
    # The message is built from adjacent string literals only (no commas
    # between them) so that ValueError receives a single string argument
    # rather than a tuple of fragments.
    raise ValueError(
        "The `page_numbers` kwarg is not valid when using the pdfminer parser. "
        "Please use the `page_range` argument of `pdf_reader` instead. "
        "For example: `pdf_reader(pdf_path, nlp, page_range=(1, 2))`"
    )

# pdfminer uses zero indexed page numbers. Therefore need to remove 1
# from the page count.
page_number -= 1
Expand Down
42 changes: 37 additions & 5 deletions spacypdfreader/spacypdfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import warnings
from functools import partial
from multiprocessing.pool import ThreadPool as Pool
from typing import Any, Callable, Optional
from typing import Any, Callable, Iterable, Optional

import spacy
from spacy.tokens import Doc, Token
Expand Down Expand Up @@ -40,6 +40,7 @@ def pdf_reader(
pdf_parser: Callable = pdfminer.parser,
verbose: bool = False,
n_processes: Optional[int] = None,
page_range: Optional[Iterable[int]] = None,
**kwargs: Any,
) -> spacy.tokens.Doc:
"""Convert a PDF document to a spaCy Doc object.
Expand All @@ -54,6 +55,9 @@ def pdf_reader(
False.
n_processes: The number of processes to use for multi-processing. If `None`,
multi-processing will not be used.
page_range: The inclusive `(start_page, end_page)` range of PDF pages to convert
to text. Page numbers use one-based indexing (e.g. the first page of the PDF
is page 1, as opposed to page 0). If `None` all pages will be converted.
**kwargs: Arbitrary keyword arguments to pass to the underlying functions
that extract text from the PDFs. If using pdfminer (the default)
`**kwargs` will be passed to
Expand Down Expand Up @@ -112,6 +116,15 @@ def pdf_reader(
>>>
>>> nlp = spacy.load("en_core_web_sm")
>>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, pytesseract.parser, n_processes=4)
To extract a specific range of pages, use the `page_range` argument.
>>> import spacy
>>> from spacypdfreader import pdf_reader
>>> from spacypdfreader.parsers import pytesseract
>>>
>>> nlp = spacy.load("en_core_web_sm")
>>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, pytesseract.parser, n_processes=4, page_range=(2, 3))
"""
# For backwards compatibility, if someone passes in PdfMinerParser or
# PyTesseractParser replace with the correct function
Expand Down Expand Up @@ -141,20 +154,39 @@ def pdf_reader(

pdf_path = os.path.normpath(pdf_path)
num_pages = _get_number_of_pages(pdf_path)

# Get page range:
if page_range:
start_page, end_page = page_range
else:
start_page = 1
end_page = num_pages

# Validate the page_range argument.
if start_page > end_page:
raise ValueError("The start page must be less than or equal to the end page.")
elif start_page < 1:
raise ValueError("The start page must be greater than or equal to 1.")
elif end_page > num_pages:
raise ValueError(
f"The end page must be less than or equal to the number of pages in the PDF ({num_pages})."
)

if verbose:
console.print(f"Extracting text from {num_pages} pdf pages...")
console.print(f"PDF contains {num_pages} pages.")
console.print(f"Extracting text from {start_page} to {end_page}...")

# Handle multiprocessing
if n_processes:
with Pool(n_processes) as p:
partial_worker = partial(pdf_parser, pdf_path, **kwargs)
args = list(range(1, num_pages + 1))
args = list(range(start_page, end_page + 1))
texts = p.map(partial_worker, args)

# Handle non-multiprocessing
else:
texts = []
for page_num in range(1, num_pages + 1):
for page_num in range(start_page, end_page + 1):
text = pdf_parser(pdf_path=pdf_path, page_number=page_num, **kwargs)
texts.append(text)

Expand All @@ -164,7 +196,7 @@ def pdf_reader(

docs = [doc for doc in nlp.pipe(texts)]
for idx, doc in enumerate(docs):
page_num = idx + 1
page_num = idx + start_page
for token in doc:
token._.page_number = page_num

Expand Down
84 changes: 84 additions & 0 deletions tests/test_page_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import pytest
import spacy

from spacypdfreader.parsers import pdfminer, pytesseract
from spacypdfreader.spacypdfreader import pdf_reader


def pdf_assertions(doc: spacy.tokens.Doc):
    """Assert that ``doc`` was built from pages 2 through 3 of the test PDF.

    Checks the page number recorded on the first and last tokens, then the
    document-level extension attributes.
    """
    # Token-level page numbers at both ends of the document.
    opening_token = doc[0]
    assert opening_token._.page_number == 2
    closing_token = doc[-1]
    assert closing_token._.page_number == 3

    # Document-level extension attributes.
    doc_ext = doc._
    assert doc_ext.page_range == (2, 3)
    assert doc_ext.first_page == 2
    assert doc_ext.last_page == 3
    assert doc_ext.pdf_file_name == "tests/data/test_pdf_01.pdf"


def test_page_range_pdfminer_single():
    """pdfminer parser, no multi-processing: pages 2-3 only."""
    language_model = spacy.load("en_core_web_sm")
    pdf_file = "tests/data/test_pdf_01.pdf"
    requested_pages = (2, 3)
    result = pdf_reader(
        pdf_file, language_model, pdfminer.parser, page_range=requested_pages
    )
    pdf_assertions(result)


def test_page_range_pdfminer_multi():
    """pdfminer parser with ``n_processes=2``: pages 2-3 only."""
    language_model = spacy.load("en_core_web_sm")
    pdf_file = "tests/data/test_pdf_01.pdf"
    reader_options = {"page_range": (2, 3), "n_processes": 2}
    result = pdf_reader(pdf_file, language_model, pdfminer.parser, **reader_options)
    pdf_assertions(result)


def test_page_range_pytesseract_single():
    """pytesseract parser, no multi-processing: pages 2-3 only."""
    language_model = spacy.load("en_core_web_sm")
    pdf_file = "tests/data/test_pdf_01.pdf"
    requested_pages = (2, 3)
    result = pdf_reader(
        pdf_file, language_model, pytesseract.parser, page_range=requested_pages
    )
    pdf_assertions(result)


def test_page_range_pytesseract_multi():
    """pytesseract parser with ``n_processes=2``: pages 2-3 only."""
    language_model = spacy.load("en_core_web_sm")
    pdf_file = "tests/data/test_pdf_01.pdf"
    reader_options = {"page_range": (2, 3), "n_processes": 2}
    result = pdf_reader(
        pdf_file, language_model, pytesseract.parser, **reader_options
    )
    pdf_assertions(result)


def test_page_range_logic():
    """Each invalid ``page_range`` must make ``pdf_reader`` raise ValueError."""
    language_model = spacy.load("en_core_web_sm")
    pdf_file = "tests/data/test_pdf_01.pdf"

    # Checked in the same order as before; each case gets its own
    # ``pytest.raises`` context so a missing error is reported per case.
    invalid_ranges = (
        (10, 20),  # presumably past the end of the test PDF -- confirm page count
        (-1, 2),  # start page below 1
        (3, 1),  # start page after end page
    )
    for bad_range in invalid_ranges:
        with pytest.raises(ValueError):
            pdf_reader(
                pdf_file,
                language_model,
                pytesseract.parser,
                page_range=bad_range,
                n_processes=2,
            )
9 changes: 9 additions & 0 deletions tests/test_pdfminer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
import spacy

from spacypdfreader.parsers import pdfminer
Expand Down Expand Up @@ -59,3 +60,11 @@ def test_pdfminer_multi_same_as_single():
)
doc_single = pdf_reader("tests/data/test_pdf_01.pdf", nlp, pdfminer.parser)
assert doc_multi.text == doc_single.text


def test_pdfminer_rejects_n_pages():
    """Passing the ``page_numbers`` kwarg with the pdfminer parser raises ValueError."""
    language_model = spacy.load("en_core_web_sm")
    pdf_file = "tests/data/test_pdf_01.pdf"
    with pytest.raises(ValueError):
        pdf_reader(pdf_file, language_model, pdfminer.parser, page_numbers=[1, 2])

0 comments on commit 802ec31

Please sign in to comment.