Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite PSBaseParser and add an optimized in-memory version #1041

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ Pipfile.lock
.vscode/
poetry.lock
.eggs
*~
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Changed

- Using absolute instead of relative imports ([[#995](https://github.com/pdfminer/pdfminer.six/pull/995)])
- Reimplement optimized parsers (really lexers) for file versus in-memory input ([#1041](https://github.com/pdfminer/pdfminer.six/pull/1041))

### Deprecated

Expand Down
5 changes: 2 additions & 3 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import sys
from typing import (
Any,
BinaryIO,
Dict,
Iterable,
Iterator,
Expand Down Expand Up @@ -278,8 +277,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:


class CMapParser(PSStackParser[PSKeyword]):
def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp)
def __init__(self, cmap: CMapBase, data: bytes) -> None:
super().__init__(data)
self.cmap = cmap
# some ToUnicode maps don't have "begincmap" keyword.
self._in_cmap = True
Expand Down
2 changes: 1 addition & 1 deletion pdfminer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import Literal
except ImportError:
# Literal was introduced in Python 3.8
from typing_extensions import Literal # type: ignore[assignment]
from typing_extensions import Literal # type: ignore

from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
from pdfminer.layout import LTImage
Expand Down
8 changes: 8 additions & 0 deletions pdfminer/pdfdevice.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@ def render_string_horizontal(
x -= obj * dxscale
needcharspace = True
else:
if isinstance(obj, str):
obj = utils.make_compat_bytes(obj)
if not isinstance(obj, bytes):
continue
for cid in font.decode(obj):
if needcharspace:
x += charspace
Expand Down Expand Up @@ -208,6 +212,10 @@ def render_string_vertical(
y -= obj * dxscale
needcharspace = True
else:
if isinstance(obj, str):
obj = utils.make_compat_bytes(obj)
if not isinstance(obj, bytes):
continue
for cid in font.decode(obj):
if needcharspace:
y += charspace
Expand Down
7 changes: 5 additions & 2 deletions pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,7 @@ def getobj(self, objid: int) -> object:
if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid]
else:
obj = None
for xref in self.xrefs:
try:
(strmid, index, genno) = xref.get_pos(objid)
Expand All @@ -856,7 +857,7 @@ def getobj(self, objid: int) -> object:
break
except (PSEOF, PDFSyntaxError):
continue
else:
if obj is None:
raise PDFObjectNotFound(objid)
log.debug("register: objid=%r: %r", objid, obj)
if self.caching:
Expand Down Expand Up @@ -891,7 +892,9 @@ def get_page_labels(self) -> Iterator[str]:
If the document includes page labels, generates strings, one per page.
If not, raises PDFNoPageLabels.

The resulting iteration is unbounded.
The resulting iterator is unbounded, so it is recommended to
zip it with the iterator over actual pages returned by `get_pages`.

"""
assert self.catalog is not None

Expand Down
18 changes: 9 additions & 9 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ class Type1FontHeaderParser(PSStackParser[int]):
KEYWORD_READONLY = KWD(b"readonly")
KEYWORD_FOR = KWD(b"for")

def __init__(self, data: BinaryIO) -> None:
PSStackParser.__init__(self, data)
def __init__(self, data: bytes) -> None:
super().__init__(data)
self._cid2unicode: Dict[int, str] = {}

def get_encoding(self) -> Dict[int, str]:
Expand Down Expand Up @@ -899,8 +899,8 @@ def is_vertical(self) -> bool:
def is_multibyte(self) -> bool:
return False

def decode(self, bytes: bytes) -> Iterable[int]:
return bytearray(bytes) # map(ord, bytes)
def decode(self, data: bytes) -> Iterable[int]:
return data

def get_ascent(self) -> float:
"""Ascent above the baseline, in text space units"""
Expand Down Expand Up @@ -969,7 +969,7 @@ def __init__(
if "ToUnicode" in spec:
strm = stream_value(spec["ToUnicode"])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
CMapParser(self.unicode_map, strm.get_data()).run()
PDFFont.__init__(self, descriptor, widths)

def to_unichr(self, cid: int) -> str:
Expand Down Expand Up @@ -1009,7 +1009,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> No
self.fontfile = stream_value(descriptor.get("FontFile"))
length1 = int_value(self.fontfile["Length1"])
data = self.fontfile.get_data()[:length1]
parser = Type1FontHeaderParser(BytesIO(data))
parser = Type1FontHeaderParser(data)
self.cid2unicode = parser.get_encoding()

def __repr__(self) -> str:
Expand Down Expand Up @@ -1080,7 +1080,7 @@ def __init__(
if isinstance(spec["ToUnicode"], PDFStream):
strm = stream_value(spec["ToUnicode"])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
CMapParser(self.unicode_map, strm.get_data()).run()
else:
cmap_name = literal_name(spec["ToUnicode"])
encoding = literal_name(spec["Encoding"])
Expand Down Expand Up @@ -1173,8 +1173,8 @@ def is_vertical(self) -> bool:
def is_multibyte(self) -> bool:
return True

def decode(self, bytes: bytes) -> Iterable[int]:
return self.cmap.decode(bytes)
def decode(self, data: bytes) -> Iterable[int]:
return self.cmap.decode(data)

def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
"""Returns an integer for horizontal fonts, a tuple for vertical fonts."""
Expand Down
138 changes: 62 additions & 76 deletions pdfminer/pdfinterp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import logging
import re
from io import BytesIO
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast

from pdfminer import settings
Expand All @@ -18,6 +16,7 @@
PDFType3Font,
)
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFSyntaxError
from pdfminer.pdftypes import (
LITERALS_ASCII85_DECODE,
PDFObjRef,
Expand All @@ -31,6 +30,7 @@
from pdfminer.psparser import (
KWD,
LIT,
PSBaseParserToken,
PSKeyword,
PSLiteral,
PSStackParser,
Expand Down Expand Up @@ -248,85 +248,52 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
return font


KEYWORD_BI = KWD(b"BI")
KEYWORD_ID = KWD(b"ID")
KEYWORD_EI = KWD(b"EI")


class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
def __init__(self, streams: Sequence[object]) -> None:
self.streams = streams
self.istream = 0
# PSStackParser.__init__(fp=None) is safe only because we've overloaded
# all the methods that would attempt to access self.fp without first
# calling self.fillfp().
PSStackParser.__init__(self, None) # type: ignore[arg-type]

def fillfp(self) -> None:
if not self.fp:
if self.istream < len(self.streams):
strm = stream_value(self.streams[self.istream])
self.istream += 1
else:
raise PSEOF("Unexpected EOF, file truncated?")
self.fp = BytesIO(strm.get_data())
"""Parse the concatenation of multiple content streams, as
described in the spec (PDF 1.7, p.86):

...the effect shall be as if all of the streams in the array were
concatenated, in order, to form a single stream. Conforming
writers can create image objects and other resources as they
occur, even though they interrupt the content stream. The division
between streams may occur only at the boundaries between lexical
tokens (see 7.2, "Lexical Conventions") but shall be unrelated to
the page’s logical content or organization.
"""

def seek(self, pos: int) -> None:
self.fillfp()
PSStackParser.seek(self, pos)
def __init__(self, streams: Sequence[object]) -> None:
self.streamiter = iter(streams)
try:
stream = stream_value(next(self.streamiter))
except StopIteration:
raise PSEOF
log.debug("PDFContentParser starting stream %r", stream)
super().__init__(stream.get_data())

def fillbuf(self) -> None:
if self.charpos < len(self.buf):
return
while 1:
self.fillfp()
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if self.buf:
break
self.fp = None # type: ignore[assignment]
self.charpos = 0

def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
self.seek(pos)
i = 0
data = b""
while i <= len(target):
self.fillbuf()
if i:
ci = self.buf[self.charpos]
c = bytes((ci,))
data += c
self.charpos += 1
if (
len(target) <= i
and c.isspace()
or i < len(target)
and c == (bytes((target[i],)))
):
i += 1
else:
i = 0
else:
try:
j = self.buf.index(target[0], self.charpos)
data += self.buf[self.charpos : j + 1]
self.charpos = j + 1
i = 1
except ValueError:
data += self.buf[self.charpos :]
self.charpos = len(self.buf)
data = data[: -(len(target) + 1)] # strip the last part
data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
return (pos, data)
def __next__(self) -> Tuple[int, PSBaseParserToken]:
while True:
try:
return super().__next__()
except StopIteration:
# next() itself raises StopIteration once no streams remain,
# which is exactly what we want: it ends iteration over the parser
stream = stream_value(next(self.streamiter))
log.debug("PDFContentParser starting stream %r", stream)
self.reinit(stream.get_data())

def flush(self) -> None:
self.add_results(*self.popall())

KEYWORD_BI = KWD(b"BI")
KEYWORD_ID = KWD(b"ID")
KEYWORD_EI = KWD(b"EI")

def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_BI:
if token is KEYWORD_BI:
# inline image within a content stream
self.start_type(pos, "inline")
elif token is self.KEYWORD_ID:
elif token is KEYWORD_ID:
try:
(_, objs) = self.end_type("inline")
if len(objs) % 2 != 0:
Expand All @@ -340,13 +307,32 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
filter = [filter]
if filter[0] in LITERALS_ASCII85_DECODE:
eos = b"~>"
(pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
if eos != b"EI": # it may be necessary for decoding
data += eos
# PDF 1.7 p. 215: Unless the image uses ASCIIHexDecode
# or ASCII85Decode as one of its filters, the ID
# operator shall be followed by a single white-space
# character, and the next character shall be
# interpreted as the first byte of image data.
if eos == b"EI":
self.seek(pos + len(token.name) + 1)
(pos, data) = self.get_inline_data(target=eos)
# FIXME: it is totally unspecified what to do with
# a newline between the end of the data and "EI",
# since there is no explicit stream length. (PDF
# 1.7 p. 756: There should be an end-of-line
# marker after the data and before endstream; this
# marker shall not be included in the stream
# length.)
data = data[: -len(eos)]
else:
self.seek(pos + len(token.name))
(pos, data) = self.get_inline_data(target=eos)
if pos == -1:
raise PDFSyntaxError("End of inline stream %r not found" % eos)
obj = PDFStream(d, data)
self.push((pos, obj))
if eos == b"EI": # otherwise it is still in the stream
self.push((pos, self.KEYWORD_EI))
# This was included in the data but we need to "parse" it
if eos == b"EI":
self.push((pos, KEYWORD_EI))
except PSTypeError:
if settings.STRICT:
raise
Expand Down
19 changes: 11 additions & 8 deletions pdfminer/pdfparser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
from io import BytesIO
from typing import TYPE_CHECKING, BinaryIO, Optional, Union

from pdfminer import settings
Expand Down Expand Up @@ -36,8 +35,8 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):

"""

def __init__(self, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp)
def __init__(self, data: Union[BinaryIO, bytes]) -> None:
super().__init__(data)
self.doc: Optional[PDFDocument] = None
self.fallback = False

Expand Down Expand Up @@ -92,10 +91,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
raise PDFSyntaxError("Unexpected EOF")
return
pos += len(line)
self.fp.seek(pos)
data = bytearray(self.fp.read(objlen))
data = bytearray(self.read(pos, objlen))
self.seek(pos + objlen)
while 1:
while True:
try:
(linepos, line) = self.nextline()
except PSEOF:
Expand Down Expand Up @@ -138,7 +136,7 @@ class PDFStreamParser(PDFParser):
"""

def __init__(self, data: bytes) -> None:
PDFParser.__init__(self, BytesIO(data))
super().__init__(data)

def flush(self) -> None:
self.add_results(*self.popall())
Expand All @@ -148,7 +146,12 @@ def flush(self) -> None:
def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_R:
# reference to indirect object
(_, _object_id), _ = self.pop(2)
try:
(_, _object_id), _ = self.pop(2)
except ValueError:
raise PDFSyntaxError(
"Expected generation and object id in indirect object reference"
)
object_id = safe_int(_object_id)
if object_id is not None:
obj = PDFObjRef(self.doc, object_id)
Expand Down
Loading
Loading