From 0a1ab082d1730bfd04d0467d75f609ea2b53e282 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 18 Sep 2024 12:41:36 -0400 Subject: [PATCH 1/4] Rewrite PSBaseParser and add an optimized in-memory version --- .gitignore | 1 + CHANGELOG.md | 1 + pdfminer/cmapdb.py | 5 +- pdfminer/image.py | 2 +- pdfminer/pdfdocument.py | 7 +- pdfminer/pdffont.py | 10 +- pdfminer/pdfinterp.py | 138 +++-- pdfminer/pdfparser.py | 12 +- pdfminer/psparser.py | 926 ++++++++++++++++++++++---------- pdfminer/utils.py | 7 +- tests/test_pdfminer_psparser.py | 270 +++++++++- tools/dumppdf.py | 1 - 12 files changed, 985 insertions(+), 395 deletions(-) diff --git a/.gitignore b/.gitignore index 7f27b7ae..f136d472 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ Pipfile.lock .vscode/ poetry.lock .eggs +*~ diff --git a/CHANGELOG.md b/CHANGELOG.md index 5425c5d3..ac94d845 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed - Using absolute instead of relative imports ([[#995](https://github.com/pdfminer/pdfminer.six/pull/995)]) +- Reimplement optimized parsers (really lexers) for file versus in-memory input ### Deprecated diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 87d9870e..93be1266 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -18,7 +18,6 @@ import sys from typing import ( Any, - BinaryIO, Dict, Iterable, Iterator, @@ -278,8 +277,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: class CMapParser(PSStackParser[PSKeyword]): - def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) + def __init__(self, cmap: CMapBase, data: bytes) -> None: + super().__init__(data) self.cmap = cmap # some ToUnicode maps don't have "begincmap" keyword. 
self._in_cmap = True diff --git a/pdfminer/image.py b/pdfminer/image.py index 355c7fb7..a6c26497 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -8,7 +8,7 @@ from typing import Literal except ImportError: # Literal was introduced in Python 3.8 - from typing_extensions import Literal # type: ignore[assignment] + from typing_extensions import Literal # type: ignore from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter from pdfminer.layout import LTImage diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 1c063359..dc8bd661 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -837,6 +837,7 @@ def getobj(self, objid: int) -> object: if objid in self._cached_objs: (obj, genno) = self._cached_objs[objid] else: + obj = None for xref in self.xrefs: try: (strmid, index, genno) = xref.get_pos(objid) @@ -856,7 +857,7 @@ def getobj(self, objid: int) -> object: break except (PSEOF, PDFSyntaxError): continue - else: + if obj is None: raise PDFObjectNotFound(objid) log.debug("register: objid=%r: %r", objid, obj) if self.caching: @@ -891,7 +892,9 @@ def get_page_labels(self) -> Iterator[str]: If the document includes page labels, generates strings, one per page. If not, raises PDFNoPageLabels. - The resulting iteration is unbounded. + The resulting iterator is unbounded, so it is recommended to + zip it with the iterator over actual pages returned by `get_pages`. 
+ """ assert self.catalog is not None diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index e1df40c2..e3c51d73 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -115,8 +115,8 @@ class Type1FontHeaderParser(PSStackParser[int]): KEYWORD_READONLY = KWD(b"readonly") KEYWORD_FOR = KWD(b"for") - def __init__(self, data: BinaryIO) -> None: - PSStackParser.__init__(self, data) + def __init__(self, data: bytes) -> None: + super().__init__(data) self._cid2unicode: Dict[int, str] = {} def get_encoding(self) -> Dict[int, str]: @@ -969,7 +969,7 @@ def __init__( if "ToUnicode" in spec: strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + CMapParser(self.unicode_map, strm.get_data()).run() PDFFont.__init__(self, descriptor, widths) def to_unichr(self, cid: int) -> str: @@ -1009,7 +1009,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> No self.fontfile = stream_value(descriptor.get("FontFile")) length1 = int_value(self.fontfile["Length1"]) data = self.fontfile.get_data()[:length1] - parser = Type1FontHeaderParser(BytesIO(data)) + parser = Type1FontHeaderParser(data) self.cid2unicode = parser.get_encoding() def __repr__(self) -> str: @@ -1080,7 +1080,7 @@ def __init__( if isinstance(spec["ToUnicode"], PDFStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + CMapParser(self.unicode_map, strm.get_data()).run() else: cmap_name = literal_name(spec["ToUnicode"]) encoding = literal_name(spec["Encoding"]) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index ae1c46a7..9f222931 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -1,6 +1,4 @@ import logging -import re -from io import BytesIO from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast from pdfminer import settings @@ -18,6 +16,7 @@ PDFType3Font, ) from 
pdfminer.pdfpage import PDFPage +from pdfminer.pdfparser import PDFSyntaxError from pdfminer.pdftypes import ( LITERALS_ASCII85_DECODE, PDFObjRef, @@ -31,6 +30,7 @@ from pdfminer.psparser import ( KWD, LIT, + PSBaseParserToken, PSKeyword, PSLiteral, PSStackParser, @@ -248,85 +248,52 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont: return font +KEYWORD_BI = KWD(b"BI") +KEYWORD_ID = KWD(b"ID") +KEYWORD_EI = KWD(b"EI") + + class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): - def __init__(self, streams: Sequence[object]) -> None: - self.streams = streams - self.istream = 0 - # PSStackParser.__init__(fp=None) is safe only because we've overloaded - # all the methods that would attempt to access self.fp without first - # calling self.fillfp(). - PSStackParser.__init__(self, None) # type: ignore[arg-type] - - def fillfp(self) -> None: - if not self.fp: - if self.istream < len(self.streams): - strm = stream_value(self.streams[self.istream]) - self.istream += 1 - else: - raise PSEOF("Unexpected EOF, file truncated?") - self.fp = BytesIO(strm.get_data()) + """Parse the concatenation of multiple content streams, as + described in the spec (PDF 1.7, p.86): + + ...the effect shall be as if all of the streams in the array were + concatenated, in order, to form a single stream. Conforming + writers can create image objects and other resources as they + occur, even though they interrupt the content stream. The division + between streams may occur only at the boundaries between lexical + tokens (see 7.2, "Lexical Conventions") but shall be unrelated to + the page’s logical content or organization. 
+ """ - def seek(self, pos: int) -> None: - self.fillfp() - PSStackParser.seek(self, pos) + def __init__(self, streams: Sequence[object]) -> None: + self.streamiter = iter(streams) + try: + stream = stream_value(next(self.streamiter)) + except StopIteration: + raise PSEOF + log.debug("PDFContentParser starting stream %r", stream) + super().__init__(stream.get_data()) - def fillbuf(self) -> None: - if self.charpos < len(self.buf): - return - while 1: - self.fillfp() - self.bufpos = self.fp.tell() - self.buf = self.fp.read(self.BUFSIZ) - if self.buf: - break - self.fp = None # type: ignore[assignment] - self.charpos = 0 - - def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]: - self.seek(pos) - i = 0 - data = b"" - while i <= len(target): - self.fillbuf() - if i: - ci = self.buf[self.charpos] - c = bytes((ci,)) - data += c - self.charpos += 1 - if ( - len(target) <= i - and c.isspace() - or i < len(target) - and c == (bytes((target[i],))) - ): - i += 1 - else: - i = 0 - else: - try: - j = self.buf.index(target[0], self.charpos) - data += self.buf[self.charpos : j + 1] - self.charpos = j + 1 - i = 1 - except ValueError: - data += self.buf[self.charpos :] - self.charpos = len(self.buf) - data = data[: -(len(target) + 1)] # strip the last part - data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data) - return (pos, data) + def __next__(self) -> Tuple[int, PSBaseParserToken]: + while True: + try: + return super().__next__() + except StopIteration: + # Will also raise StopIteration if there are no more, + # which is exactly what we want + stream = stream_value(next(self.streamiter)) + log.debug("PDFContentParser starting stream %r", stream) + self.reinit(stream.get_data()) def flush(self) -> None: self.add_results(*self.popall()) - KEYWORD_BI = KWD(b"BI") - KEYWORD_ID = KWD(b"ID") - KEYWORD_EI = KWD(b"EI") - def do_keyword(self, pos: int, token: PSKeyword) -> None: - if token is self.KEYWORD_BI: + if token is KEYWORD_BI: # inline image within a 
content stream self.start_type(pos, "inline") - elif token is self.KEYWORD_ID: + elif token is KEYWORD_ID: try: (_, objs) = self.end_type("inline") if len(objs) % 2 != 0: @@ -340,13 +307,32 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: filter = [filter] if filter[0] in LITERALS_ASCII85_DECODE: eos = b"~>" - (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos) - if eos != b"EI": # it may be necessary for decoding - data += eos + # PDF 1.7 p. 215: Unless the image uses ASCIIHexDecode + # or ASCII85Decode as one of its filters, the ID + # operator shall be followed by a single white-space + # character, and the next character shall be + # interpreted as the first byte of image data. + if eos == b"EI": + self.seek(pos + len(token.name) + 1) + (pos, data) = self.get_inline_data(target=eos) + # FIXME: it is totally unspecified what to do with + # a newline between the end of the data and "EI", + # since there is no explicit stream length. (PDF + # 1.7 p. 756: There should be an end-of-line + # marker after the data and before endstream; this + # marker shall not be included in the stream + # length.) 
+ data = data[: -len(eos)] + else: + self.seek(pos + len(token.name)) + (pos, data) = self.get_inline_data(target=eos) + if pos == -1: + raise PDFSyntaxError("End of inline stream %r not found" % eos) obj = PDFStream(d, data) self.push((pos, obj)) - if eos == b"EI": # otherwise it is still in the stream - self.push((pos, self.KEYWORD_EI)) + # This was included in the data but we need to "parse" it + if eos == b"EI": + self.push((pos, KEYWORD_EI)) except PSTypeError: if settings.STRICT: raise diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index b00c2b35..645e0dec 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -1,5 +1,4 @@ import logging -from io import BytesIO from typing import TYPE_CHECKING, BinaryIO, Optional, Union from pdfminer import settings @@ -36,8 +35,8 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): """ - def __init__(self, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) + def __init__(self, data: Union[BinaryIO, bytes]) -> None: + super().__init__(data) self.doc: Optional[PDFDocument] = None self.fallback = False @@ -92,10 +91,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: raise PDFSyntaxError("Unexpected EOF") return pos += len(line) - self.fp.seek(pos) - data = bytearray(self.fp.read(objlen)) + data = bytearray(self.read(pos, objlen)) self.seek(pos + objlen) - while 1: + while True: try: (linepos, line) = self.nextline() except PSEOF: @@ -138,7 +136,7 @@ class PDFStreamParser(PDFParser): """ def __init__(self, data: bytes) -> None: - PDFParser.__init__(self, BytesIO(data)) + super().__init__(data) def flush(self) -> None: self.add_results(*self.popall()) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index b4869560..0075b345 100755 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -2,9 +2,12 @@ import io import logging import re +from binascii import unhexlify +from collections import deque from typing import ( Any, BinaryIO, + Deque, Dict, 
Generic, Iterator, @@ -109,6 +112,7 @@ def intern(self, name: PSLiteral.NameType) -> _SymbolT: KEYWORD_ARRAY_END = KWD(b"]") KEYWORD_DICT_BEGIN = KWD(b"<<") KEYWORD_DICT_END = KWD(b">>") +KEYWORD_GT = KWD(b">") def literal_name(x: Any) -> str: @@ -136,17 +140,14 @@ def keyword_name(x: Any) -> Any: return name -EOL = re.compile(rb"[\r\n]") -SPC = re.compile(rb"\s") -NONSPC = re.compile(rb"\S") -HEX = re.compile(rb"[0-9a-fA-F]") -END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]") -END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]") -HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.") -END_NUMBER = re.compile(rb"[^0-9]") -END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]") -END_STRING = re.compile(rb"[()\134]") -OCT_STRING = re.compile(rb"[0-7]") +EOL = b"\r\n" +WHITESPACE = b" \t\n\r\f\v" +NUMBER = b"0123456789" +HEX = NUMBER + b"abcdef" + b"ABCDEF" +NOTLITERAL = b"#/%[]()<>{}" + WHITESPACE +NOTKEYWORD = b"#/%[]()<>{}" + WHITESPACE +NOTSTRING = b"()\\" +OCTAL = b"01234567" ESC_STRING = { b"b": 8, b"t": 9, @@ -162,90 +163,60 @@ def keyword_name(x: Any) -> Any: PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] -class PSBaseParser: - """Most basic PostScript parser that performs only tokenization.""" - - BUFSIZ = 4096 +class PSFileParser: + """ + Parser (actually a lexer) for PDF data from a buffered file object. 
+ """ def __init__(self, fp: BinaryIO) -> None: self.fp = fp + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() self.seek(0) - def __repr__(self) -> str: - return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos) - - def flush(self) -> None: - pass - - def close(self) -> None: - self.flush() - - def tell(self) -> int: - return self.bufpos + self.charpos - - def poll(self, pos: Optional[int] = None, n: int = 80) -> None: - pos0 = self.fp.tell() - if not pos: - pos = self.bufpos + self.charpos - self.fp.seek(pos) - log.debug("poll(%d): %r", pos, self.fp.read(n)) - self.fp.seek(pos0) + def reinit(self, fp: BinaryIO) -> None: + """Reinitialize parser with a new file.""" + self.fp = fp + self.seek(0) def seek(self, pos: int) -> None: - """Seeks the parser to the given position.""" - log.debug("seek: %r", pos) + """Seek to a position and reinitialize parser state.""" self.fp.seek(pos) - # reset the status for nextline() - self.bufpos = pos - self.buf = b"" - self.charpos = 0 - # reset the status for nexttoken() self._parse1 = self._parse_main self._curtoken = b"" self._curtokenpos = 0 - self._tokens: List[Tuple[int, PSBaseParserToken]] = [] - - def fillbuf(self) -> None: - if self.charpos < len(self.buf): - return - # fetch next chunk. 
- self.bufpos = self.fp.tell() - self.buf = self.fp.read(self.BUFSIZ) - if not self.buf: - raise PSEOF("Unexpected EOF") - self.charpos = 0 + self._tokens.clear() - def nextline(self) -> Tuple[int, bytes]: - """Fetches a next line that ends either with \\r or \\n.""" - linebuf = b"" - linepos = self.bufpos + self.charpos - eol = False - while 1: - self.fillbuf() - if eol: - c = self.buf[self.charpos : self.charpos + 1] - # handle b'\r\n' - if c == b"\n": - linebuf += c - self.charpos += 1 - break - m = EOL.search(self.buf, self.charpos) - if m: - linebuf += self.buf[self.charpos : m.end(0)] - self.charpos = m.end(0) - if linebuf[-1:] == b"\r": - eol = True - else: - break - else: - linebuf += self.buf[self.charpos :] - self.charpos = len(self.buf) - log.debug("nextline: %r, %r", linepos, linebuf) + def tell(self) -> int: + """Get the current position in the file.""" + return self.fp.tell() - return (linepos, linebuf) + def read(self, pos: int, objlen: int) -> bytes: + """Read data from a specified position, moving the current + position to the end of this data.""" + self.fp.seek(pos) + return self.fp.read(objlen) + + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or + \r\n.""" + linepos = self.fp.tell() + # readline() is implemented on BinarIO so just use that + # (except that it only accepts \n as a separator) + line_or_lines = self.fp.readline() + if line_or_lines == b"": + raise PSEOF + first, sep, rest = line_or_lines.partition(b"\r") + if len(rest) == 0: + return (linepos, line_or_lines) + elif rest != b"\n": + self.fp.seek(linepos + len(first) + 1) + return (linepos, first + sep) + else: + self.fp.seek(linepos + len(first) + 2) + return (linepos, first + b"\r\n") def revreadlines(self) -> Iterator[bytes]: - """Fetches a next line backword. + """Fetches a next line backwards. This is used to locate the trailers at the end of a file. 
""" @@ -253,261 +224,592 @@ def revreadlines(self) -> Iterator[bytes]: pos = self.fp.tell() buf = b"" while pos > 0: - prevpos = pos - pos = max(0, pos - self.BUFSIZ) + # NOTE: This can obviously be optimized to use regular + # expressions on the (known to exist) buffer in + # self.fp... + pos -= 1 self.fp.seek(pos) - s = self.fp.read(prevpos - pos) - if not s: - break - while 1: - n = max(s.rfind(b"\r"), s.rfind(b"\n")) - if n == -1: - buf = s + buf + c = self.fp.read(1) + if c in b"\r\n": + yield buf + buf = c + if c == b"\n" and pos > 0: + self.fp.seek(pos - 1) + cc = self.fp.read(1) + if cc == b"\r": + pos -= 1 + buf = cc + buf + else: + buf = c + buf + yield buf + + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = 4096 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. + """ + # PDF 1.7, p. 216: The bytes between the ID and EI operators + # shall be treated the same as a stream object’s data (see + # 7.3.8, "Stream Objects"), even though they do not follow the + # standard stream syntax. + data = [] # list of blocks + partial = b"" # partially seen target + pos = 0 + while True: + # Did we see part of the target at the end of the last + # block? 
Then scan ahead and try to find the rest (we + # assume the stream is buffered) + if partial: + extra_len = len(target) - len(partial) + extra = self.fp.read(extra_len) + if partial + extra == target: + pos -= len(partial) + data.append(extra) break - yield s[n:] + buf - s = s[:n] - buf = b"" - - def _parse_main(self, s: bytes, i: int) -> int: - m = NONSPC.search(s, i) - if not m: - return len(s) - j = m.start(0) - c = s[j : j + 1] - self._curtokenpos = self.bufpos + j + # Put it back (assume buffering!) + self.fp.seek(-extra_len, io.SEEK_CUR) + partial = b"" + # Fall through (the target could be at the beginning) + buf = self.fp.read(blocksize) + if not buf: + return (-1, b"") + tpos = buf.find(target) + if tpos != -1: + data.append(buf[: tpos + len(target)]) + # Put the extra back (assume buffering!) + self.fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR) + pos += tpos + break + else: + pos += len(buf) + # look for the longest partial match at the end + plen = len(target) - 1 + while plen > 0: + ppos = len(buf) - plen + if buf[ppos:] == target[:plen]: + partial = buf[ppos:] + break + plen -= 1 + data.append(buf) + return (pos, b"".join(data)) + + def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over tokens.""" + return self + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + while True: + c = self._parse1() + # print(c, self._curtoken, self._parse1) + if self._tokens or c == b"": + break + if not self._tokens: + raise StopIteration + return self._tokens.popleft() + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising PSEOF when done.""" + try: + return self.__next__() + except StopIteration: + raise PSEOF + + def _parse_main(self) -> bytes: + """Initial/default state for the lexer.""" + c = self.fp.read(1) + # note that b"" (EOF) is in everything, which is fine + if c in WHITESPACE: + return c + 
self._curtokenpos = self.fp.tell() - 1 if c == b"%": self._curtoken = b"%" self._parse1 = self._parse_comment - return j + 1 elif c == b"/": self._curtoken = b"" self._parse1 = self._parse_literal - return j + 1 - elif c in b"-+" or c.isdigit(): + elif c in b"-+" or c in NUMBER: self._curtoken = c self._parse1 = self._parse_number - return j + 1 elif c == b".": self._curtoken = c self._parse1 = self._parse_float - return j + 1 elif c.isalpha(): self._curtoken = c self._parse1 = self._parse_keyword - return j + 1 elif c == b"(": self._curtoken = b"" self.paren = 1 self._parse1 = self._parse_string - return j + 1 elif c == b"<": self._curtoken = b"" self._parse1 = self._parse_wopen - return j + 1 elif c == b">": self._curtoken = b"" self._parse1 = self._parse_wclose - return j + 1 elif c == b"\x00": - return j + 1 + pass else: self._add_token(KWD(c)) - return j + 1 + return c def _add_token(self, obj: PSBaseParserToken) -> None: + """Add a succesfully parsed token.""" self._tokens.append((self._curtokenpos, obj)) - def _parse_comment(self, s: bytes, i: int) -> int: - m = EOL.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - self._parse1 = self._parse_main - # We ignore comments. - # self._tokens.append(self._curtoken) - return j - - def _parse_literal(self, s: bytes, i: int) -> int: - m = END_LITERAL.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] + def _parse_comment(self) -> bytes: + """Comment state for the lexer""" + c = self.fp.read(1) + if c in EOL: # this includes b"", i.e. EOF + self._parse1 = self._parse_main + # We ignore comments. 
+ # self._tokens.append(self._curtoken) + else: + self._curtoken += c + return c + + def _parse_literal(self) -> bytes: + """Literal (keyword) state for the lexer.""" + c = self.fp.read(1) if c == b"#": self.hex = b"" self._parse1 = self._parse_literal_hex - return j + 1 - try: - name: Union[str, bytes] = str(self._curtoken, "utf-8") - except Exception: - name = self._curtoken - self._add_token(LIT(name)) - self._parse1 = self._parse_main - return j + elif c in NOTLITERAL: + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(LIT(self._curtoken.decode("utf-8"))) + except UnicodeDecodeError: + self._add_token(LIT(self._curtoken)) + self._parse1 = self._parse_main + else: + self._curtoken += c + return c - def _parse_literal_hex(self, s: bytes, i: int) -> int: - c = s[i : i + 1] - if HEX.match(c) and len(self.hex) < 2: + def _parse_literal_hex(self) -> bytes: + """State for escaped hex characters in literal names""" + # Consume a hex digit only if we can ... consume a hex digit + c = self.fp.read(1) + if c and c in HEX and len(self.hex) < 2: self.hex += c - return i + 1 - if self.hex: - self._curtoken += bytes((int(self.hex, 16),)) - self._parse1 = self._parse_literal - return i - - def _parse_number(self, s: bytes, i: int) -> int: - m = END_NUMBER.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b".": + else: + if c: + self.fp.seek(-1, io.SEEK_CUR) + if self.hex: + self._curtoken += bytes((int(self.hex, 16),)) + self._parse1 = self._parse_literal + return c + + def _parse_number(self) -> bytes: + """State for numeric objects.""" + c = self.fp.read(1) + if c and c in NUMBER: + self._curtoken += c + elif c == b".": self._curtoken += c self._parse1 = self._parse_float - return j + 1 - try: - self._add_token(int(self._curtoken)) - except ValueError: - pass - self._parse1 = self._parse_main - return j - - def _parse_float(self, s: bytes, i: int) -> int: - m = 
END_NUMBER.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - try: - self._add_token(float(self._curtoken)) - except ValueError: - pass - self._parse1 = self._parse_main - return j - - def _parse_keyword(self, s: bytes, i: int) -> int: - m = END_KEYWORD.search(s, i) - if m: - j = m.start(0) - self._curtoken += s[i:j] else: - # Use the rest of the stream if no non-keyword character is found. This - # can happen if the keyword is the final bytes of the stream - # (https://github.com/pdfminer/pdfminer.six/issues/884). - j = len(s) - self._curtoken += s[i:] - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(int(self._curtoken)) + except ValueError: + log.warning("Invalid int literal: %r", self._curtoken) + self._parse1 = self._parse_main + return c + + def _parse_float(self) -> bytes: + """State for fractional part of numeric objects.""" + c = self.fp.read(1) + # b"" is in everything so we have to add an extra check + if not c or c not in NUMBER: + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(float(self._curtoken)) + except ValueError: + log.warning("Invalid float literal: %r", self._curtoken) + self._parse1 = self._parse_main + else: + self._curtoken += c + return c + + def _parse_keyword(self) -> bytes: + """State for keywords.""" + c = self.fp.read(1) + if c in NOTKEYWORD: # includes EOF + if c: + self.fp.seek(-1, io.SEEK_CUR) + if self._curtoken == b"true": + self._add_token(True) + elif self._curtoken == b"false": + self._add_token(False) + else: + self._add_token(KWD(self._curtoken)) + self._parse1 = self._parse_main else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j - - def _parse_string(self, s: bytes, i: int) -> int: - m = END_STRING.search(s, i) - if not m: - self._curtoken += s[i:] - return 
len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b"\\": - self.oct = b"" - self._parse1 = self._parse_string_1 - return j + 1 - if c == b"(": - self.paren += 1 self._curtoken += c - return j + 1 - if c == b")": - self.paren -= 1 - if self.paren: - # WTF, they said balanced parens need no special treatment. + return c + + def _parse_string(self) -> bytes: + """State for string objects.""" + c = self.fp.read(1) + if c and c in NOTSTRING: # does not include EOF + if c == b"\\": + self._parse1 = self._parse_string_esc + return c + elif c == b"(": + self.paren += 1 self._curtoken += c - return j + 1 - self._add_token(self._curtoken) - self._parse1 = self._parse_main - return j + 1 - - def _parse_string_1(self, s: bytes, i: int) -> int: - """Parse literal strings + return c + elif c == b")": + self.paren -= 1 + if self.paren: + self._curtoken += c + return c + # We saw the last parenthesis and fell through (it will be + # consumed, but not added to self._curtoken) + self._add_token(self._curtoken) + self._parse1 = self._parse_main + elif c == b"\r": + # PDF 1.7 page 15: An end-of-line marker appearing within + # a literal string without a preceding REVERSE SOLIDUS + # shall be treated as a byte value of (0Ah), irrespective + # of whether the end-of-line marker was a CARRIAGE RETURN + # (0Dh), a LINE FEED (0Ah), or both. + cc = self.fp.read(1) + # Put it back if it isn't \n + if cc and cc != b"\n": + self.fp.seek(-1, io.SEEK_CUR) + self._curtoken += b"\n" + else: + self._curtoken += c + return c + + def _parse_string_esc(self) -> bytes: + """State for escapes in literal strings. 
We have seen a + backslash and nothing else.""" + c = self.fp.read(1) + if c and c in OCTAL: # exclude EOF + self.oct = c + self._parse1 = self._parse_string_octal + return c + elif c and c in ESC_STRING: + self._curtoken += bytes((ESC_STRING[c],)) + elif c == b"\n": # Skip newline after backslash + pass + elif c == b"\r": # Also skip CRLF after + cc = self.fp.read(1) + # Put it back if it isn't \n + if cc and cc != b"\n": + self.fp.seek(-1, io.SEEK_CUR) + elif c == b"": + log.warning("EOF inside escape %r", self._curtoken) + else: + log.warning("Unrecognized escape %r", c) + self._curtoken += c + self._parse1 = self._parse_string + return c - PDF Reference 3.2.3 - """ - c = s[i : i + 1] - if OCT_STRING.match(c) and len(self.oct) < 3: + def _parse_string_octal(self) -> bytes: + """State for an octal escape.""" + c = self.fp.read(1) + if c and c in OCTAL: # exclude EOF self.oct += c - return i + 1 - - elif self.oct: + done = len(self.oct) >= 3 # it can't be > though + else: + if c: + self.fp.seek(-1, io.SEEK_CUR) + else: + log.warning("EOF in octal escape %r", self._curtoken) + done = True + if done: chrcode = int(self.oct, 8) - assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode) - self._curtoken += bytes((chrcode,)) + if chrcode >= 256: + # PDF1.7 p.16: "high-order overflow shall be ignored." 
+ log.warning("Invalid octal %r (%d)", self.oct, chrcode) + else: + self._curtoken += bytes((chrcode,)) + # Back to normal string parsing self._parse1 = self._parse_string - return i - - elif c in ESC_STRING: - self._curtoken += bytes((ESC_STRING[c],)) - - elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n": - # If current and next character is \r\n skip both because enters - # after a \ are ignored - i += 1 + return c - # default action - self._parse1 = self._parse_string - return i + 1 - - def _parse_wopen(self, s: bytes, i: int) -> int: - c = s[i : i + 1] + def _parse_wopen(self) -> bytes: + """State for start of dictionary or hex string.""" + c = self.fp.read(1) if c == b"<": self._add_token(KEYWORD_DICT_BEGIN) self._parse1 = self._parse_main - i += 1 else: + if c: + self.fp.seek(-1, io.SEEK_CUR) self._parse1 = self._parse_hexstring - return i + return c - def _parse_wclose(self, s: bytes, i: int) -> int: - c = s[i : i + 1] + def _parse_wclose(self) -> bytes: + """State for end of dictionary (accessed from initial state only)""" + c = self.fp.read(1) if c == b">": self._add_token(KEYWORD_DICT_END) - i += 1 - self._parse1 = self._parse_main - return i - - def _parse_hexstring(self, s: bytes, i: int) -> int: - m = END_HEX_STRING.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - token = HEX_PAIR.sub( - lambda m: bytes((int(m.group(0), 16),)), - SPC.sub(b"", self._curtoken), - ) - self._add_token(token) + else: + # Assuming this is a keyword (which means nothing) + self._add_token(KEYWORD_GT) + if c: + self.fp.seek(-1, io.SEEK_CUR) self._parse1 = self._parse_main - return j + return c + + def _parse_hexstring(self) -> bytes: + """State for parsing hexadecimal literal strings.""" + c = self.fp.read(1) + if not c: + log.warning("EOF in hex string %r", self._curtoken) + elif c in WHITESPACE: + pass + elif c in HEX: + self._curtoken += c + elif c == b">": + if len(self._curtoken) % 2 == 1: + 
self._curtoken += b"0" + token = unhexlify(self._curtoken) + self._add_token(token) + self._parse1 = self._parse_main + else: + log.warning("unexpected character %r in hex string %r", c, self._curtoken) + return c + + +LEXER = re.compile( + rb"""(?: + (?P \s+) + | (?P %[^\r\n]*[\r\n]) + | (?P /(?: \#[A-Fa-f\d][A-Fa-f\d] | [^#/%\[\]()<>{}\s])+ ) + | (?P [-+]? (?: \d*\.\d+ | \d+ ) ) + | (?P [A-Za-z] [^#/%\[\]()<>{}\s]*) + | (?P \([^()\\]*) + | (?P <[A-Fa-f\d\s]+>) + | (?P <<) + | (?P >>) + | (?P .) +) +""", + re.VERBOSE, +) +STRLEXER = re.compile( + rb"""(?: + (?P \\[0-7]{1,3}) + | (?P \\(?:\r\n?|\n)) + | (?P \\.) + | (?P \() + | (?P \)) + | (?P \r\n?|\n) + | (?P .) +)""", + re.VERBOSE, +) +HEXDIGIT = re.compile(rb"#([A-Fa-f\d][A-Fa-f\d])") +EOLR = re.compile(rb"\r\n?|\n") +SPC = re.compile(rb"\s") + + +class PSInMemoryParser: + """ + Parser for in-memory data streams. + """ + + def __init__(self, data: bytes) -> None: + self.data = data + self.pos = 0 + self.end = len(data) + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() + + def reinit(self, data: bytes) -> None: + """Reinitialize parser with a new buffer.""" + self.data = data + self.seek(0) + + def seek(self, pos: int) -> None: + """Seek to a position and reinitialize parser state.""" + self.pos = pos + self._curtoken = b"" + self._curtokenpos = 0 + self._tokens.clear() + + def tell(self) -> int: + """Get the current position in the buffer.""" + return self.pos + + def read(self, pos: int, objlen: int) -> bytes: + """Read data from a specified position, moving the current + position to the end of this data.""" + self.pos = min(pos + objlen, len(self.data)) + return self.data[pos : self.pos] + + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or \r\n.""" + if self.pos == self.end: + raise PSEOF + linepos = self.pos + m = EOLR.search(self.data, self.pos) + if m is None: + self.pos = self.end + else: + self.pos = m.end() + return (linepos, 
self.data[linepos : self.pos]) + + def revreadlines(self) -> Iterator[bytes]: + """Fetches a next line backwards. + + This is used to locate the trailers at the end of a file. So, + it isn't actually used in PSInMemoryParser, but is here for + completeness. + """ + endline = pos = self.end + while True: + nidx = self.data.rfind(b"\n", 0, pos) + ridx = self.data.rfind(b"\r", 0, pos) + best = max(nidx, ridx) + if best == -1: + yield self.data[:endline] + break + yield self.data[best + 1 : endline] + endline = best + 1 + pos = best + if pos > 0 and self.data[pos - 1 : pos + 1] == b"\r\n": + pos -= 1 + + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = -1 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. 
+ """ + tpos = self.data.find(target, self.pos) + if tpos != -1: + nextpos = tpos + len(target) + result = (tpos, self.data[self.pos : nextpos]) + self.pos = nextpos + return result + return (-1, b"") + + def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over tokens.""" + return self def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - while not self._tokens: - self.fillbuf() - self.charpos = self._parse1(self.buf, self.charpos) - token = self._tokens.pop(0) - log.debug("nexttoken: %r", token) - return token + """Get the next token in iteration, raising PSEOF when done.""" + try: + return self.__next__() + except StopIteration: + raise PSEOF + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + while True: + m = LEXER.match(self.data, self.pos) + if m is None: # can only happen at EOS + raise StopIteration + self._curtokenpos = m.start() + self.pos = m.end() + if m.lastgroup not in ("whitespace", "comment"): # type: ignore + # Okay, we got a token or something + break + self._curtoken = m[0] + if m.lastgroup == "name": # type: ignore + self._curtoken = m[0][1:] + self._curtoken = HEXDIGIT.sub( + lambda x: bytes((int(x[1], 16),)), self._curtoken + ) + try: + tok = LIT(self._curtoken.decode("utf-8")) + except UnicodeDecodeError: + tok = LIT(self._curtoken) + return (self._curtokenpos, tok) + if m.lastgroup == "number": # type: ignore + if b"." 
in self._curtoken: + return (self._curtokenpos, float(self._curtoken)) + else: + return (self._curtokenpos, int(self._curtoken)) + if m.lastgroup == "startdict": # type: ignore + return (self._curtokenpos, KEYWORD_DICT_BEGIN) + if m.lastgroup == "enddict": # type: ignore + return (self._curtokenpos, KEYWORD_DICT_END) + if m.lastgroup == "startstr": # type: ignore + return self._parse_endstr(self.data[m.start() + 1 : m.end()], m.end()) + if m.lastgroup == "hexstr": # type: ignore + self._curtoken = SPC.sub(b"", self._curtoken[1:-1]) + if len(self._curtoken) % 2 == 1: + self._curtoken += b"0" + return (self._curtokenpos, unhexlify(self._curtoken)) + # Anything else is treated as a keyword (whether explicitly matched or not) + if self._curtoken == b"true": + return (self._curtokenpos, True) + elif self._curtoken == b"false": + return (self._curtokenpos, False) + else: + return (self._curtokenpos, KWD(self._curtoken)) + + def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]: + """Parse the remainder of a string.""" + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts = [EOLR.sub(b"\n", start)] + paren = 1 + for m in STRLEXER.finditer(self.data, pos): + self.pos = m.end() + if m.lastgroup == "parenright": # type: ignore + paren -= 1 + if paren == 0: + # By far the most common situation! + break + parts.append(m[0]) + elif m.lastgroup == "parenleft": # type: ignore + parts.append(m[0]) + paren += 1 + elif m.lastgroup == "escape": # type: ignore + chr = m[0][1:2] + if chr not in ESC_STRING: + log.warning("Unrecognized escape %r", m[0]) + parts.append(chr) + else: + parts.append(bytes((ESC_STRING[chr],))) + elif m.lastgroup == "octal": # type: ignore + chrcode = int(m[0][1:], 8) + if chrcode >= 256: + # PDF1.7 p.16: "high-order overflow shall be + # ignored." 
+ log.warning("Invalid octal %r (%d)", m[0][1:], chrcode) + else: + parts.append(bytes((chrcode,))) + elif m.lastgroup == "newline": # type: ignore + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts.append(b"\n") + elif m.lastgroup == "linebreak": # type: ignore + pass + else: + parts.append(m[0]) + if paren != 0: + log.warning("Unterminated string at %d", pos) + raise StopIteration + return (self._curtokenpos, b"".join(parts)) # Stack slots may by occupied by any of: @@ -521,35 +823,53 @@ def nexttoken(self) -> Tuple[int, PSBaseParserToken]: PSStackEntry = Tuple[int, PSStackType[ExtraT]] -class PSStackParser(PSBaseParser, Generic[ExtraT]): - def __init__(self, fp: BinaryIO) -> None: - PSBaseParser.__init__(self, fp) +class PSStackParser(Generic[ExtraT]): + """Basic parser for PDF objects, can take a file or a `bytes` as + input.""" + + def __init__(self, reader: Union[BinaryIO, bytes]) -> None: + self.reinit(reader) + + def reinit(self, reader: Union[BinaryIO, bytes]) -> None: + """Reinitialize parser with a new file or buffer.""" + if isinstance(reader, bytes): + self._parser: Union[PSInMemoryParser, PSFileParser] = PSInMemoryParser( + reader + ) + else: + self._parser = PSFileParser(reader) self.reset() def reset(self) -> None: + """Reset parser state.""" self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = [] self.curtype: Optional[str] = None self.curstack: List[PSStackEntry[ExtraT]] = [] self.results: List[PSStackEntry[ExtraT]] = [] def seek(self, pos: int) -> None: - PSBaseParser.seek(self, pos) + """Seek to a position and reset parser state.""" + self._parser.seek(pos) self.reset() def push(self, *objs: PSStackEntry[ExtraT]) -> None: + """Push some objects onto the stack.""" self.curstack.extend(objs) def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: + """Pop some objects off the stack.""" objs = self.curstack[-n:] self.curstack[-n:] = [] return objs def popall(self) -> List[PSStackEntry[ExtraT]]: + """Pop all 
the things off the stack.""" objs = self.curstack self.curstack = [] return objs def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: + """Move some objects to the output.""" try: log.debug("add_results: %r", objs) except Exception: @@ -557,11 +877,13 @@ def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: self.results.extend(objs) def start_type(self, pos: int, type: str) -> None: + """Start a composite object (array, dict, etc).""" self.context.append((pos, self.curtype, self.curstack)) (self.curtype, self.curstack) = (type, []) log.debug("start_type: pos=%r, type=%r", pos, type) def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: + """End a composite object (array, dict, etc).""" if self.curtype != type: raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}") objs = [obj for (_, obj) in self.curstack] @@ -570,6 +892,11 @@ def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: return (pos, objs) def do_keyword(self, pos: int, token: PSKeyword) -> None: + """Handle a PDF keyword.""" + pass + + def flush(self) -> None: + """Get everything off the stack and into the output?""" pass def nextobject(self) -> PSStackEntry[ExtraT]: @@ -644,10 +971,49 @@ def nextobject(self) -> PSStackEntry[ExtraT]: if self.context: continue else: - self.flush() + self.flush() # Does nothing here, but in subclasses... (ugh) obj = self.results.pop(0) try: log.debug("nextobject: %r", obj) except Exception: log.debug("nextobject: (unprintable object)") return obj + + # Delegation follows + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or + \r\n.""" + return self._parser.nextline() + + def revreadlines(self) -> Iterator[bytes]: + """Fetches a next line backwards. + + This is used to locate the trailers at the end of a file. 
+ """ + return self._parser.revreadlines() + + def read(self, pos: int, objlen: int) -> bytes: + """Read data from a specified position, moving the current + position to the end of this data.""" + return self._parser.read(pos, objlen) + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising PSEOF when done.""" + try: + return self.__next__() + except StopIteration: + raise PSEOF + + def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker.""" + return self._parser.get_inline_data(target) + + def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over tokens.""" + return self + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + return self._parser.__next__() diff --git a/pdfminer/utils.py b/pdfminer/utils.py index a5b53852..88b44b98 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -75,11 +75,10 @@ def make_compat_bytes(in_str: str) -> bytes: def make_compat_str(o: object) -> str: """Converts everything to string, if bytes guessing the encoding.""" if isinstance(o, bytes): - enc = charset_normalizer.detect(o) - try: - return o.decode(enc["encoding"]) - except UnicodeDecodeError: + result = charset_normalizer.from_bytes(o) + if result is None: return str(o) + return str(result.best()) else: return str(o) diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py index a1599184..9d869ea9 100644 --- a/tests/test_pdfminer_psparser.py +++ b/tests/test_pdfminer_psparser.py @@ -1,13 +1,23 @@ import logging +import tempfile from io import BytesIO +from typing import Any, List, Tuple from pdfminer.psexceptions import PSEOF -from pdfminer.psparser import KWD, LIT, PSBaseParser, PSStackParser +from pdfminer.psparser import ( + KEYWORD_DICT_BEGIN, + KEYWORD_DICT_END, + KWD, + LIT, + PSFileParser, + 
PSInMemoryParser, + PSStackParser, +) logger = logging.getLogger(__name__) -class TestPSBaseParser: +class TestPSFileParser: """Simplistic Test cases""" TESTDATA = rb"""%!PS @@ -57,7 +67,7 @@ class TestPSBaseParser: (191, b""), (194, b" "), (199, b"@@ "), - (211, b"\xab\xcd\x00\x124\x05"), + (211, b"\xab\xcd\x00\x124\x50"), (226, KWD(b"func")), (230, LIT("a")), (232, LIT("b")), @@ -99,7 +109,7 @@ class TestPSBaseParser: (191, b""), (194, b" "), (199, b"@@ "), - (211, b"\xab\xcd\x00\x124\x05"), + (211, b"\xab\xcd\x00\x124\x50"), (230, LIT("a")), (232, LIT("b")), (234, [b"c"]), @@ -108,9 +118,7 @@ class TestPSBaseParser: ] def get_tokens(self, s): - from io import BytesIO - - class MyParser(PSBaseParser): + class MyParser(PSFileParser): def flush(self): self.add_results(*self.popall()) @@ -124,8 +132,6 @@ def flush(self): return r def get_objects(self, s): - from io import BytesIO - class MyParser(PSStackParser): def flush(self): self.add_results(*self.popall()) @@ -149,11 +155,243 @@ def test_2(self): logger.info(objs) assert objs == self.OBJS - def test_3(self): - """Regression test for streams that end with a keyword. 
- See: https://github.com/pdfminer/pdfminer.six/issues/884 - """ - parser = PSBaseParser(BytesIO(b"Do")) - parser._parse_keyword(b"Do", 0) - assert parser._tokens == [(0, KWD(b"Do"))] +TESTDATA = b""" +ugh +foo\r +bar\rbaz +quxx +bog""" +EXPECTED = [ + (0, b"\n"), + (1, b"ugh\n"), + (5, b"foo\r\n"), + (10, b"bar\r"), + (14, b"baz\n"), + (18, b"quxx\n"), + (23, b"bog"), +] + + +def run_parsers(data: bytes, expected: List[Any], makefunc: Any) -> None: + """Test stuff on both BytesIO and BinaryIO.""" + bp = PSInMemoryParser(data) + output = [] + func = makefunc(bp) + while True: + try: + output.append(func()) + except PSEOF: + break + assert output == expected + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + func = makefunc(fp) + output = [] + while True: + try: + output.append(func()) + except PSEOF: + break + assert output == expected + + +def test_nextline() -> None: + """Verify that we replicate the old nextline method.""" + run_parsers(TESTDATA, EXPECTED, lambda foo: foo.nextline) + + +def test_revreadlines() -> None: + """Verify that we replicate the old revreadlines method.""" + expected = list(reversed([line for pos, line in EXPECTED])) + + def make_next(parser: Any) -> Any: + itor = parser.revreadlines() + + def nextor() -> Any: + try: + line = next(itor) + except StopIteration: + raise PSEOF + return line + + return nextor + + run_parsers(TESTDATA, expected, make_next) + + +SIMPLE1 = b"""1 0 obj +<< + /Type /Catalog + /Outlines 2 0 R + /Pages 3 0 R +>> +endobj +""" +SIMPLETOK = [ + 1, + 0, + KWD(b"obj"), + KEYWORD_DICT_BEGIN, + LIT("Type"), + LIT("Catalog"), + LIT("Outlines"), + 2, + 0, + KWD(b"R"), + LIT("Pages"), + 3, + 0, + KWD(b"R"), + KEYWORD_DICT_END, + KWD(b"endobj"), +] + + +def list_parsers(data: bytes, expected: List[Any], discard_pos: bool = False) -> None: + bp = PSInMemoryParser(data) + if discard_pos: + tokens: List[Any] = [tok 
for pos, tok in list(bp)] + else: + tokens = list(bp) + assert tokens == expected + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + if discard_pos: + tokens = [tok for pos, tok in list(fp)] + else: + tokens = list(fp) + assert tokens == expected + + +def test_new_parser() -> None: + # Do a lot of them to make sure buffering works correctly + list_parsers(SIMPLE1 * 100, SIMPLETOK * 100, discard_pos=True) + + +def test_new_parser_eof() -> None: + # Make sure we get a keyword at eof + list_parsers(SIMPLE1[:-1], SIMPLETOK, discard_pos=True) + + +PAGE17 = b""" + /A;Name_With-Various***Characters? + /lime#20Green + /paired#28#29parentheses +""" + + +def test_new_parser1() -> None: + list_parsers(b"123.456", [(0, 123.456)]) + list_parsers(b"+.013", [(0, 0.013)]) + list_parsers(b"123", [(0, 123)]) + list_parsers(b"true false", [(0, True), (5, False)]) + list_parsers(b"(foobie bletch)", [(0, b"foobie bletch")]) + list_parsers(b"(foo", []) + + +def test_new_parser_names() -> None: + # Examples from PDF 1.7 page 17 + list_parsers( + PAGE17, + [ + (5, LIT("A;Name_With-Various***Characters?")), + (44, LIT("lime Green")), + (62, LIT("paired()parentheses")), + ], + ) + + +def test_new_parser_strings() -> None: + list_parsers( + rb"( Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . )", + [ + ( + 0, + rb" Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . ", + ) + ], + ) + list_parsers(b"()", [(0, b"")]) + list_parsers( + rb"""( These \ +two strings \ +are the same . ) + """, + [(0, b" These two strings are the same . 
")], + ) + list_parsers(b"(foo\rbar)", [(0, b"foo\nbar")]) + list_parsers(b"(foo\r)", [(0, b"foo\n")]) + list_parsers(b"(foo\r\nbar\r\nbaz)", [(0, b"foo\nbar\nbaz")]) + list_parsers(b"(foo\n)", [(0, b"foo\n")]) + list_parsers(rb"(foo\r\nbaz)", [(0, b"foo\r\nbaz")]) + list_parsers(rb"(foo\r\nbar\r\nbaz)", [(0, b"foo\r\nbar\r\nbaz")]) + list_parsers( + rb"( This string contains \245two octal characters\307 . )", + [(0, b" This string contains \245two octal characters\307 . ")], + ) + list_parsers(rb"(\0053 \053 \53)", [(0, b"\0053 \053 +")]) + list_parsers( + rb"< 4E6F762073686D6F7A206B6120706F702E >", [(0, b"Nov shmoz ka pop.")] + ) + list_parsers(rb"<73 686 D6F7A2>", [(0, b"shmoz ")]) + list_parsers(rb"(\400)", [(0, b"")]) + + +def test_invalid_strings_eof() -> None: + list_parsers(rb"(\00", []) + list_parsers(rb"(abracadab", []) + + +def inline_parsers( + data: bytes, + expected: Tuple[int, bytes], + target: bytes = b"EI", + nexttoken: Any = None, + blocksize: int = 16, +) -> None: + bp = PSInMemoryParser(data) + assert bp.get_inline_data(target=target, blocksize=blocksize) == expected + if nexttoken is not None: + assert bp.nexttoken() == nexttoken + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + assert fp.get_inline_data(target=target, blocksize=blocksize) == expected + if nexttoken is not None: + assert fp.nexttoken() == nexttoken + + +def test_get_inline_data() -> None: + kwd_eio = KWD(b"EIO") + kwd_omg = KWD(b"OMG") + inline_parsers(b"""0123456789""", (-1, b"")) + inline_parsers(b"""0123456789EI""", (10, b"0123456789EI")) + inline_parsers( + b"""0123456789EIEIO""", (10, b"0123456789EI"), nexttoken=(12, kwd_eio) + ) + inline_parsers(b"""012EIEIO""", (3, b"012EI"), nexttoken=(5, kwd_eio), blocksize=4) + inline_parsers( + b"""0123012EIEIO""", (7, b"0123012EI"), nexttoken=(9, kwd_eio), blocksize=4 + ) + for blocksize in range(1, 8): + 
inline_parsers( + b"""012EIEIOOMG""", + ( + 3, + b"012EIEIO", + ), + target=b"EIEIO", + nexttoken=(8, kwd_omg), + blocksize=blocksize, + ) diff --git a/tools/dumppdf.py b/tools/dumppdf.py index f88389a9..630e4f8d 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -203,7 +203,6 @@ def resolve_dest(dest: object) -> Any: outfp.write("\n") except PDFNoOutlines: pass - parser.close() fp.close() From 1bb4cae8bb2016bdde6fcdd807b2f8f64b993757 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 19 Sep 2024 07:59:09 -0400 Subject: [PATCH 2/4] fix: make sure it is really bytes in font.decode --- pdfminer/pdfdevice.py | 8 ++++++++ pdfminer/pdffont.py | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 2374601c..6e0c58b9 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -168,6 +168,10 @@ def render_string_horizontal( x -= obj * dxscale needcharspace = True else: + if isinstance(obj, str): + obj = utils.make_compat_bytes(obj) + if not isinstance(obj, bytes): + continue for cid in font.decode(obj): if needcharspace: x += charspace @@ -208,6 +212,10 @@ def render_string_vertical( y -= obj * dxscale needcharspace = True else: + if isinstance(obj, str): + obj = utils.make_compat_bytes(obj) + if not isinstance(obj, bytes): + continue for cid in font.decode(obj): if needcharspace: y += charspace diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index e3c51d73..5b0402fd 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -899,8 +899,8 @@ def is_vertical(self) -> bool: def is_multibyte(self) -> bool: return False - def decode(self, bytes: bytes) -> Iterable[int]: - return bytearray(bytes) # map(ord, bytes) + def decode(self, data: bytes) -> Iterable[int]: + return data def get_ascent(self) -> float: """Ascent above the baseline, in text space units""" @@ -1173,8 +1173,8 @@ def is_vertical(self) -> bool: def is_multibyte(self) -> bool: return True - def decode(self, 
bytes: bytes) -> Iterable[int]: - return self.cmap.decode(bytes) + def decode(self, data: bytes) -> Iterable[int]: + return self.cmap.decode(data) def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]: """Returns an integer for horizontal fonts, a tuple for vertical fonts.""" From 4c7d494d70bdc5e68e51c76820c1a7b7db861cb4 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 19 Sep 2024 10:00:26 -0400 Subject: [PATCH 3/4] fix: a couple of invalid PDF fuzz cases --- pdfminer/pdfparser.py | 7 ++++++- pdfminer/utils.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 645e0dec..dae1af59 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -146,7 +146,12 @@ def flush(self) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_R: # reference to indirect object - (_, _object_id), _ = self.pop(2) + try: + (_, _object_id), _ = self.pop(2) + except ValueError: + raise PDFSyntaxError( + "Expected generation and object id in indirect object reference" + ) object_id = safe_int(_object_id) if object_id is not None: obj = PDFObjRef(self.doc, object_id) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 88b44b98..277cfe15 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -245,7 +245,7 @@ def parse_rect(o: Any) -> Rect: try: (x0, y0, x1, y1) = o return float(x0), float(y0), float(x1), float(y1) - except ValueError: + except (ValueError, TypeError): raise PDFValueError("Could not parse rectangle") From 6e9d73f6c42fd2ad6772896c8f4ce1336af12b06 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 19 Sep 2024 17:34:36 -0400 Subject: [PATCH 4/4] fix: match behaviour between PSFile / PSInMemory parser --- pdfminer/psparser.py | 23 +++++++++++++++++------ tests/test_pdfminer_psparser.py | 8 +++++--- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 
0075b345..9e1feeb4 100755
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@@ -399,15 +399,26 @@ def _parse_literal(self) -> bytes:
     def _parse_literal_hex(self) -> bytes:
         """State for escaped hex characters in literal names"""
         # Consume a hex digit only if we can ... consume a hex digit
+        if len(self.hex) >= 2:  # it actually can't exceed 2
+            self._curtoken += bytes((int(self.hex, 16),))
+            self._parse1 = self._parse_literal
+            return b"/"
         c = self.fp.read(1)
-        if c and c in HEX and len(self.hex) < 2:
+        if c and c in HEX:
             self.hex += c
         else:
-            if c:
+            if c:  # not EOF, but not hex either
+                log.warning("Invalid hex digit %r in literal", c)
                 self.fp.seek(-1, io.SEEK_CUR)
-        if self.hex:
-            self._curtoken += bytes((int(self.hex, 16),))
-        self._parse1 = self._parse_literal
+            # Add the intervening junk, just in case
+            try:
+                tok = LIT(self._curtoken.decode("utf-8"))
+            except UnicodeDecodeError:
+                tok = LIT(self._curtoken)
+            self._add_token(tok)
+            self._curtokenpos = self.tell() - 1 - len(self.hex)
+            self._add_token(KWD(b"#" + self.hex))
+            self._parse1 = self._parse_main
         return c
 
     def _parse_number(self) -> bytes:
@@ -597,7 +608,7 @@ def _parse_hexstring(self) -> bytes:
     | (?P<number> [-+]? (?: \d*\.\d+ | \d+ ) )
     | (?P<keyword> [A-Za-z] [^#/%\[\]()<>{}\s]*)
     | (?P<startstr> \([^()\\]*)
-    | (?P<hexstr> <[A-Fa-f\d\s]+>)
+    | (?P<hexstr> <[A-Fa-f\d\s]*>)
     | (?P<startdict> <<)
     | (?P<enddict> >>)
     | (?P<other> .)
diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py index 9d869ea9..ad375383 100644 --- a/tests/test_pdfminer_psparser.py +++ b/tests/test_pdfminer_psparser.py @@ -49,7 +49,9 @@ class TestPSFileParser: (23, LIT("a")), (25, LIT("BCD")), (30, LIT("Some_Name")), - (41, LIT("foo_xbaa")), + (41, LIT("foo_")), + (48, KWD(b"#")), + (49, KWD(b"xbaa")), (54, 0), (56, 1), (59, -2), @@ -91,7 +93,7 @@ class TestPSFileParser: (23, LIT("a")), (25, LIT("BCD")), (30, LIT("Some_Name")), - (41, LIT("foo_xbaa")), + (41, LIT("foo_")), (54, 0), (56, 1), (59, -2), @@ -136,7 +138,7 @@ class MyParser(PSStackParser): def flush(self): self.add_results(*self.popall()) - parser = MyParser(BytesIO(s)) + parser = MyParser(s) r = [] try: while True: