From 0a1ab082d1730bfd04d0467d75f609ea2b53e282 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 18 Sep 2024 12:41:36 -0400 Subject: [PATCH 1/4] Rewrite PSBaseParser and add an optimized in-memory version --- .gitignore | 1 + CHANGELOG.md | 1 + pdfminer/cmapdb.py | 5 +- pdfminer/image.py | 2 +- pdfminer/pdfdocument.py | 7 +- pdfminer/pdffont.py | 10 +- pdfminer/pdfinterp.py | 138 +++-- pdfminer/pdfparser.py | 12 +- pdfminer/psparser.py | 926 ++++++++++++++++++++++---------- pdfminer/utils.py | 7 +- tests/test_pdfminer_psparser.py | 270 +++++++++- tools/dumppdf.py | 1 - 12 files changed, 985 insertions(+), 395 deletions(-) diff --git a/.gitignore b/.gitignore index 7f27b7ae..f136d472 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ Pipfile.lock .vscode/ poetry.lock .eggs +*~ diff --git a/CHANGELOG.md b/CHANGELOG.md index 5425c5d3..ac94d845 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Changed - Using absolute instead of relative imports ([[#995](https://github.com/pdfminer/pdfminer.six/pull/995)]) +- Reimplement optimized parsers (really lexers) for file versus in-memory input ### Deprecated diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 87d9870e..93be1266 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -18,7 +18,6 @@ import sys from typing import ( Any, - BinaryIO, Dict, Iterable, Iterator, @@ -278,8 +277,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: class CMapParser(PSStackParser[PSKeyword]): - def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) + def __init__(self, cmap: CMapBase, data: bytes) -> None: + super().__init__(data) self.cmap = cmap # some ToUnicode maps don't have "begincmap" keyword. 
self._in_cmap = True diff --git a/pdfminer/image.py b/pdfminer/image.py index 355c7fb7..a6c26497 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -8,7 +8,7 @@ from typing import Literal except ImportError: # Literal was introduced in Python 3.8 - from typing_extensions import Literal # type: ignore[assignment] + from typing_extensions import Literal # type: ignore from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter from pdfminer.layout import LTImage diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index 1c063359..dc8bd661 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -837,6 +837,7 @@ def getobj(self, objid: int) -> object: if objid in self._cached_objs: (obj, genno) = self._cached_objs[objid] else: + obj = None for xref in self.xrefs: try: (strmid, index, genno) = xref.get_pos(objid) @@ -856,7 +857,7 @@ def getobj(self, objid: int) -> object: break except (PSEOF, PDFSyntaxError): continue - else: + if obj is None: raise PDFObjectNotFound(objid) log.debug("register: objid=%r: %r", objid, obj) if self.caching: @@ -891,7 +892,9 @@ def get_page_labels(self) -> Iterator[str]: If the document includes page labels, generates strings, one per page. If not, raises PDFNoPageLabels. - The resulting iteration is unbounded. + The resulting iterator is unbounded, so it is recommended to + zip it with the iterator over actual pages returned by `get_pages`. 
+ """ assert self.catalog is not None diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index e1df40c2..e3c51d73 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -115,8 +115,8 @@ class Type1FontHeaderParser(PSStackParser[int]): KEYWORD_READONLY = KWD(b"readonly") KEYWORD_FOR = KWD(b"for") - def __init__(self, data: BinaryIO) -> None: - PSStackParser.__init__(self, data) + def __init__(self, data: bytes) -> None: + super().__init__(data) self._cid2unicode: Dict[int, str] = {} def get_encoding(self) -> Dict[int, str]: @@ -969,7 +969,7 @@ def __init__( if "ToUnicode" in spec: strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + CMapParser(self.unicode_map, strm.get_data()).run() PDFFont.__init__(self, descriptor, widths) def to_unichr(self, cid: int) -> str: @@ -1009,7 +1009,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> No self.fontfile = stream_value(descriptor.get("FontFile")) length1 = int_value(self.fontfile["Length1"]) data = self.fontfile.get_data()[:length1] - parser = Type1FontHeaderParser(BytesIO(data)) + parser = Type1FontHeaderParser(data) self.cid2unicode = parser.get_encoding() def __repr__(self) -> str: @@ -1080,7 +1080,7 @@ def __init__( if isinstance(spec["ToUnicode"], PDFStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() - CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() + CMapParser(self.unicode_map, strm.get_data()).run() else: cmap_name = literal_name(spec["ToUnicode"]) encoding = literal_name(spec["Encoding"]) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index ae1c46a7..9f222931 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -1,6 +1,4 @@ import logging -import re -from io import BytesIO from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast from pdfminer import settings @@ -18,6 +16,7 @@ PDFType3Font, ) from 
pdfminer.pdfpage import PDFPage +from pdfminer.pdfparser import PDFSyntaxError from pdfminer.pdftypes import ( LITERALS_ASCII85_DECODE, PDFObjRef, @@ -31,6 +30,7 @@ from pdfminer.psparser import ( KWD, LIT, + PSBaseParserToken, PSKeyword, PSLiteral, PSStackParser, @@ -248,85 +248,52 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont: return font +KEYWORD_BI = KWD(b"BI") +KEYWORD_ID = KWD(b"ID") +KEYWORD_EI = KWD(b"EI") + + class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): - def __init__(self, streams: Sequence[object]) -> None: - self.streams = streams - self.istream = 0 - # PSStackParser.__init__(fp=None) is safe only because we've overloaded - # all the methods that would attempt to access self.fp without first - # calling self.fillfp(). - PSStackParser.__init__(self, None) # type: ignore[arg-type] - - def fillfp(self) -> None: - if not self.fp: - if self.istream < len(self.streams): - strm = stream_value(self.streams[self.istream]) - self.istream += 1 - else: - raise PSEOF("Unexpected EOF, file truncated?") - self.fp = BytesIO(strm.get_data()) + """Parse the concatenation of multiple content streams, as + described in the spec (PDF 1.7, p.86): + + ...the effect shall be as if all of the streams in the array were + concatenated, in order, to form a single stream. Conforming + writers can create image objects and other resources as they + occur, even though they interrupt the content stream. The division + between streams may occur only at the boundaries between lexical + tokens (see 7.2, "Lexical Conventions") but shall be unrelated to + the page’s logical content or organization. 
+ """ - def seek(self, pos: int) -> None: - self.fillfp() - PSStackParser.seek(self, pos) + def __init__(self, streams: Sequence[object]) -> None: + self.streamiter = iter(streams) + try: + stream = stream_value(next(self.streamiter)) + except StopIteration: + raise PSEOF + log.debug("PDFContentParser starting stream %r", stream) + super().__init__(stream.get_data()) - def fillbuf(self) -> None: - if self.charpos < len(self.buf): - return - while 1: - self.fillfp() - self.bufpos = self.fp.tell() - self.buf = self.fp.read(self.BUFSIZ) - if self.buf: - break - self.fp = None # type: ignore[assignment] - self.charpos = 0 - - def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]: - self.seek(pos) - i = 0 - data = b"" - while i <= len(target): - self.fillbuf() - if i: - ci = self.buf[self.charpos] - c = bytes((ci,)) - data += c - self.charpos += 1 - if ( - len(target) <= i - and c.isspace() - or i < len(target) - and c == (bytes((target[i],))) - ): - i += 1 - else: - i = 0 - else: - try: - j = self.buf.index(target[0], self.charpos) - data += self.buf[self.charpos : j + 1] - self.charpos = j + 1 - i = 1 - except ValueError: - data += self.buf[self.charpos :] - self.charpos = len(self.buf) - data = data[: -(len(target) + 1)] # strip the last part - data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data) - return (pos, data) + def __next__(self) -> Tuple[int, PSBaseParserToken]: + while True: + try: + return super().__next__() + except StopIteration: + # Will also raise StopIteration if there are no more, + # which is exactly what we want + stream = stream_value(next(self.streamiter)) + log.debug("PDFContentParser starting stream %r", stream) + self.reinit(stream.get_data()) def flush(self) -> None: self.add_results(*self.popall()) - KEYWORD_BI = KWD(b"BI") - KEYWORD_ID = KWD(b"ID") - KEYWORD_EI = KWD(b"EI") - def do_keyword(self, pos: int, token: PSKeyword) -> None: - if token is self.KEYWORD_BI: + if token is KEYWORD_BI: # inline image within a 
content stream self.start_type(pos, "inline") - elif token is self.KEYWORD_ID: + elif token is KEYWORD_ID: try: (_, objs) = self.end_type("inline") if len(objs) % 2 != 0: @@ -340,13 +307,32 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: filter = [filter] if filter[0] in LITERALS_ASCII85_DECODE: eos = b"~>" - (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos) - if eos != b"EI": # it may be necessary for decoding - data += eos + # PDF 1.7 p. 215: Unless the image uses ASCIIHexDecode + # or ASCII85Decode as one of its filters, the ID + # operator shall be followed by a single white-space + # character, and the next character shall be + # interpreted as the first byte of image data. + if eos == b"EI": + self.seek(pos + len(token.name) + 1) + (pos, data) = self.get_inline_data(target=eos) + # FIXME: it is totally unspecified what to do with + # a newline between the end of the data and "EI", + # since there is no explicit stream length. (PDF + # 1.7 p. 756: There should be an end-of-line + # marker after the data and before endstream; this + # marker shall not be included in the stream + # length.) 
+ data = data[: -len(eos)] + else: + self.seek(pos + len(token.name)) + (pos, data) = self.get_inline_data(target=eos) + if pos == -1: + raise PDFSyntaxError("End of inline stream %r not found" % eos) obj = PDFStream(d, data) self.push((pos, obj)) - if eos == b"EI": # otherwise it is still in the stream - self.push((pos, self.KEYWORD_EI)) + # This was included in the data but we need to "parse" it + if eos == b"EI": + self.push((pos, KEYWORD_EI)) except PSTypeError: if settings.STRICT: raise diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index b00c2b35..645e0dec 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -1,5 +1,4 @@ import logging -from io import BytesIO from typing import TYPE_CHECKING, BinaryIO, Optional, Union from pdfminer import settings @@ -36,8 +35,8 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): """ - def __init__(self, fp: BinaryIO) -> None: - PSStackParser.__init__(self, fp) + def __init__(self, data: Union[BinaryIO, bytes]) -> None: + super().__init__(data) self.doc: Optional[PDFDocument] = None self.fallback = False @@ -92,10 +91,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: raise PDFSyntaxError("Unexpected EOF") return pos += len(line) - self.fp.seek(pos) - data = bytearray(self.fp.read(objlen)) + data = bytearray(self.read(pos, objlen)) self.seek(pos + objlen) - while 1: + while True: try: (linepos, line) = self.nextline() except PSEOF: @@ -138,7 +136,7 @@ class PDFStreamParser(PDFParser): """ def __init__(self, data: bytes) -> None: - PDFParser.__init__(self, BytesIO(data)) + super().__init__(data) def flush(self) -> None: self.add_results(*self.popall()) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index b4869560..0075b345 100755 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -2,9 +2,12 @@ import io import logging import re +from binascii import unhexlify +from collections import deque from typing import ( Any, BinaryIO, + Deque, Dict, 
Generic, Iterator, @@ -109,6 +112,7 @@ def intern(self, name: PSLiteral.NameType) -> _SymbolT: KEYWORD_ARRAY_END = KWD(b"]") KEYWORD_DICT_BEGIN = KWD(b"<<") KEYWORD_DICT_END = KWD(b">>") +KEYWORD_GT = KWD(b">") def literal_name(x: Any) -> str: @@ -136,17 +140,14 @@ def keyword_name(x: Any) -> Any: return name -EOL = re.compile(rb"[\r\n]") -SPC = re.compile(rb"\s") -NONSPC = re.compile(rb"\S") -HEX = re.compile(rb"[0-9a-fA-F]") -END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]") -END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]") -HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.") -END_NUMBER = re.compile(rb"[^0-9]") -END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]") -END_STRING = re.compile(rb"[()\134]") -OCT_STRING = re.compile(rb"[0-7]") +EOL = b"\r\n" +WHITESPACE = b" \t\n\r\f\v" +NUMBER = b"0123456789" +HEX = NUMBER + b"abcdef" + b"ABCDEF" +NOTLITERAL = b"#/%[]()<>{}" + WHITESPACE +NOTKEYWORD = b"#/%[]()<>{}" + WHITESPACE +NOTSTRING = b"()\\" +OCTAL = b"01234567" ESC_STRING = { b"b": 8, b"t": 9, @@ -162,90 +163,60 @@ def keyword_name(x: Any) -> Any: PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] -class PSBaseParser: - """Most basic PostScript parser that performs only tokenization.""" - - BUFSIZ = 4096 +class PSFileParser: + """ + Parser (actually a lexer) for PDF data from a buffered file object. 
+ """ def __init__(self, fp: BinaryIO) -> None: self.fp = fp + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() self.seek(0) - def __repr__(self) -> str: - return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos) - - def flush(self) -> None: - pass - - def close(self) -> None: - self.flush() - - def tell(self) -> int: - return self.bufpos + self.charpos - - def poll(self, pos: Optional[int] = None, n: int = 80) -> None: - pos0 = self.fp.tell() - if not pos: - pos = self.bufpos + self.charpos - self.fp.seek(pos) - log.debug("poll(%d): %r", pos, self.fp.read(n)) - self.fp.seek(pos0) + def reinit(self, fp: BinaryIO) -> None: + """Reinitialize parser with a new file.""" + self.fp = fp + self.seek(0) def seek(self, pos: int) -> None: - """Seeks the parser to the given position.""" - log.debug("seek: %r", pos) + """Seek to a position and reinitialize parser state.""" self.fp.seek(pos) - # reset the status for nextline() - self.bufpos = pos - self.buf = b"" - self.charpos = 0 - # reset the status for nexttoken() self._parse1 = self._parse_main self._curtoken = b"" self._curtokenpos = 0 - self._tokens: List[Tuple[int, PSBaseParserToken]] = [] - - def fillbuf(self) -> None: - if self.charpos < len(self.buf): - return - # fetch next chunk. 
- self.bufpos = self.fp.tell() - self.buf = self.fp.read(self.BUFSIZ) - if not self.buf: - raise PSEOF("Unexpected EOF") - self.charpos = 0 + self._tokens.clear() - def nextline(self) -> Tuple[int, bytes]: - """Fetches a next line that ends either with \\r or \\n.""" - linebuf = b"" - linepos = self.bufpos + self.charpos - eol = False - while 1: - self.fillbuf() - if eol: - c = self.buf[self.charpos : self.charpos + 1] - # handle b'\r\n' - if c == b"\n": - linebuf += c - self.charpos += 1 - break - m = EOL.search(self.buf, self.charpos) - if m: - linebuf += self.buf[self.charpos : m.end(0)] - self.charpos = m.end(0) - if linebuf[-1:] == b"\r": - eol = True - else: - break - else: - linebuf += self.buf[self.charpos :] - self.charpos = len(self.buf) - log.debug("nextline: %r, %r", linepos, linebuf) + def tell(self) -> int: + """Get the current position in the file.""" + return self.fp.tell() - return (linepos, linebuf) + def read(self, pos: int, objlen: int) -> bytes: + """Read data from a specified position, moving the current + position to the end of this data.""" + self.fp.seek(pos) + return self.fp.read(objlen) + + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or + \r\n.""" + linepos = self.fp.tell() + # readline() is implemented on BinarIO so just use that + # (except that it only accepts \n as a separator) + line_or_lines = self.fp.readline() + if line_or_lines == b"": + raise PSEOF + first, sep, rest = line_or_lines.partition(b"\r") + if len(rest) == 0: + return (linepos, line_or_lines) + elif rest != b"\n": + self.fp.seek(linepos + len(first) + 1) + return (linepos, first + sep) + else: + self.fp.seek(linepos + len(first) + 2) + return (linepos, first + b"\r\n") def revreadlines(self) -> Iterator[bytes]: - """Fetches a next line backword. + """Fetches a next line backwards. This is used to locate the trailers at the end of a file. 
""" @@ -253,261 +224,592 @@ def revreadlines(self) -> Iterator[bytes]: pos = self.fp.tell() buf = b"" while pos > 0: - prevpos = pos - pos = max(0, pos - self.BUFSIZ) + # NOTE: This can obviously be optimized to use regular + # expressions on the (known to exist) buffer in + # self.fp... + pos -= 1 self.fp.seek(pos) - s = self.fp.read(prevpos - pos) - if not s: - break - while 1: - n = max(s.rfind(b"\r"), s.rfind(b"\n")) - if n == -1: - buf = s + buf + c = self.fp.read(1) + if c in b"\r\n": + yield buf + buf = c + if c == b"\n" and pos > 0: + self.fp.seek(pos - 1) + cc = self.fp.read(1) + if cc == b"\r": + pos -= 1 + buf = cc + buf + else: + buf = c + buf + yield buf + + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = 4096 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. + """ + # PDF 1.7, p. 216: The bytes between the ID and EI operators + # shall be treated the same as a stream object’s data (see + # 7.3.8, "Stream Objects"), even though they do not follow the + # standard stream syntax. + data = [] # list of blocks + partial = b"" # partially seen target + pos = 0 + while True: + # Did we see part of the target at the end of the last + # block? 
Then scan ahead and try to find the rest (we + # assume the stream is buffered) + if partial: + extra_len = len(target) - len(partial) + extra = self.fp.read(extra_len) + if partial + extra == target: + pos -= len(partial) + data.append(extra) break - yield s[n:] + buf - s = s[:n] - buf = b"" - - def _parse_main(self, s: bytes, i: int) -> int: - m = NONSPC.search(s, i) - if not m: - return len(s) - j = m.start(0) - c = s[j : j + 1] - self._curtokenpos = self.bufpos + j + # Put it back (assume buffering!) + self.fp.seek(-extra_len, io.SEEK_CUR) + partial = b"" + # Fall through (the target could be at the beginning) + buf = self.fp.read(blocksize) + if not buf: + return (-1, b"") + tpos = buf.find(target) + if tpos != -1: + data.append(buf[: tpos + len(target)]) + # Put the extra back (assume buffering!) + self.fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR) + pos += tpos + break + else: + pos += len(buf) + # look for the longest partial match at the end + plen = len(target) - 1 + while plen > 0: + ppos = len(buf) - plen + if buf[ppos:] == target[:plen]: + partial = buf[ppos:] + break + plen -= 1 + data.append(buf) + return (pos, b"".join(data)) + + def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over tokens.""" + return self + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + while True: + c = self._parse1() + # print(c, self._curtoken, self._parse1) + if self._tokens or c == b"": + break + if not self._tokens: + raise StopIteration + return self._tokens.popleft() + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising PSEOF when done.""" + try: + return self.__next__() + except StopIteration: + raise PSEOF + + def _parse_main(self) -> bytes: + """Initial/default state for the lexer.""" + c = self.fp.read(1) + # note that b"" (EOF) is in everything, which is fine + if c in WHITESPACE: + return c + 
self._curtokenpos = self.fp.tell() - 1 if c == b"%": self._curtoken = b"%" self._parse1 = self._parse_comment - return j + 1 elif c == b"/": self._curtoken = b"" self._parse1 = self._parse_literal - return j + 1 - elif c in b"-+" or c.isdigit(): + elif c in b"-+" or c in NUMBER: self._curtoken = c self._parse1 = self._parse_number - return j + 1 elif c == b".": self._curtoken = c self._parse1 = self._parse_float - return j + 1 elif c.isalpha(): self._curtoken = c self._parse1 = self._parse_keyword - return j + 1 elif c == b"(": self._curtoken = b"" self.paren = 1 self._parse1 = self._parse_string - return j + 1 elif c == b"<": self._curtoken = b"" self._parse1 = self._parse_wopen - return j + 1 elif c == b">": self._curtoken = b"" self._parse1 = self._parse_wclose - return j + 1 elif c == b"\x00": - return j + 1 + pass else: self._add_token(KWD(c)) - return j + 1 + return c def _add_token(self, obj: PSBaseParserToken) -> None: + """Add a succesfully parsed token.""" self._tokens.append((self._curtokenpos, obj)) - def _parse_comment(self, s: bytes, i: int) -> int: - m = EOL.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - self._parse1 = self._parse_main - # We ignore comments. - # self._tokens.append(self._curtoken) - return j - - def _parse_literal(self, s: bytes, i: int) -> int: - m = END_LITERAL.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] + def _parse_comment(self) -> bytes: + """Comment state for the lexer""" + c = self.fp.read(1) + if c in EOL: # this includes b"", i.e. EOF + self._parse1 = self._parse_main + # We ignore comments. 
+ # self._tokens.append(self._curtoken) + else: + self._curtoken += c + return c + + def _parse_literal(self) -> bytes: + """Literal (keyword) state for the lexer.""" + c = self.fp.read(1) if c == b"#": self.hex = b"" self._parse1 = self._parse_literal_hex - return j + 1 - try: - name: Union[str, bytes] = str(self._curtoken, "utf-8") - except Exception: - name = self._curtoken - self._add_token(LIT(name)) - self._parse1 = self._parse_main - return j + elif c in NOTLITERAL: + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(LIT(self._curtoken.decode("utf-8"))) + except UnicodeDecodeError: + self._add_token(LIT(self._curtoken)) + self._parse1 = self._parse_main + else: + self._curtoken += c + return c - def _parse_literal_hex(self, s: bytes, i: int) -> int: - c = s[i : i + 1] - if HEX.match(c) and len(self.hex) < 2: + def _parse_literal_hex(self) -> bytes: + """State for escaped hex characters in literal names""" + # Consume a hex digit only if we can ... consume a hex digit + c = self.fp.read(1) + if c and c in HEX and len(self.hex) < 2: self.hex += c - return i + 1 - if self.hex: - self._curtoken += bytes((int(self.hex, 16),)) - self._parse1 = self._parse_literal - return i - - def _parse_number(self, s: bytes, i: int) -> int: - m = END_NUMBER.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b".": + else: + if c: + self.fp.seek(-1, io.SEEK_CUR) + if self.hex: + self._curtoken += bytes((int(self.hex, 16),)) + self._parse1 = self._parse_literal + return c + + def _parse_number(self) -> bytes: + """State for numeric objects.""" + c = self.fp.read(1) + if c and c in NUMBER: + self._curtoken += c + elif c == b".": self._curtoken += c self._parse1 = self._parse_float - return j + 1 - try: - self._add_token(int(self._curtoken)) - except ValueError: - pass - self._parse1 = self._parse_main - return j - - def _parse_float(self, s: bytes, i: int) -> int: - m = 
END_NUMBER.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - try: - self._add_token(float(self._curtoken)) - except ValueError: - pass - self._parse1 = self._parse_main - return j - - def _parse_keyword(self, s: bytes, i: int) -> int: - m = END_KEYWORD.search(s, i) - if m: - j = m.start(0) - self._curtoken += s[i:j] else: - # Use the rest of the stream if no non-keyword character is found. This - # can happen if the keyword is the final bytes of the stream - # (https://github.com/pdfminer/pdfminer.six/issues/884). - j = len(s) - self._curtoken += s[i:] - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(int(self._curtoken)) + except ValueError: + log.warning("Invalid int literal: %r", self._curtoken) + self._parse1 = self._parse_main + return c + + def _parse_float(self) -> bytes: + """State for fractional part of numeric objects.""" + c = self.fp.read(1) + # b"" is in everything so we have to add an extra check + if not c or c not in NUMBER: + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(float(self._curtoken)) + except ValueError: + log.warning("Invalid float literal: %r", self._curtoken) + self._parse1 = self._parse_main + else: + self._curtoken += c + return c + + def _parse_keyword(self) -> bytes: + """State for keywords.""" + c = self.fp.read(1) + if c in NOTKEYWORD: # includes EOF + if c: + self.fp.seek(-1, io.SEEK_CUR) + if self._curtoken == b"true": + self._add_token(True) + elif self._curtoken == b"false": + self._add_token(False) + else: + self._add_token(KWD(self._curtoken)) + self._parse1 = self._parse_main else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j - - def _parse_string(self, s: bytes, i: int) -> int: - m = END_STRING.search(s, i) - if not m: - self._curtoken += s[i:] - return 
len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b"\\": - self.oct = b"" - self._parse1 = self._parse_string_1 - return j + 1 - if c == b"(": - self.paren += 1 self._curtoken += c - return j + 1 - if c == b")": - self.paren -= 1 - if self.paren: - # WTF, they said balanced parens need no special treatment. + return c + + def _parse_string(self) -> bytes: + """State for string objects.""" + c = self.fp.read(1) + if c and c in NOTSTRING: # does not include EOF + if c == b"\\": + self._parse1 = self._parse_string_esc + return c + elif c == b"(": + self.paren += 1 self._curtoken += c - return j + 1 - self._add_token(self._curtoken) - self._parse1 = self._parse_main - return j + 1 - - def _parse_string_1(self, s: bytes, i: int) -> int: - """Parse literal strings + return c + elif c == b")": + self.paren -= 1 + if self.paren: + self._curtoken += c + return c + # We saw the last parenthesis and fell through (it will be + # consumed, but not added to self._curtoken) + self._add_token(self._curtoken) + self._parse1 = self._parse_main + elif c == b"\r": + # PDF 1.7 page 15: An end-of-line marker appearing within + # a literal string without a preceding REVERSE SOLIDUS + # shall be treated as a byte value of (0Ah), irrespective + # of whether the end-of-line marker was a CARRIAGE RETURN + # (0Dh), a LINE FEED (0Ah), or both. + cc = self.fp.read(1) + # Put it back if it isn't \n + if cc and cc != b"\n": + self.fp.seek(-1, io.SEEK_CUR) + self._curtoken += b"\n" + else: + self._curtoken += c + return c + + def _parse_string_esc(self) -> bytes: + """State for escapes in literal strings. 
We have seen a + backslash and nothing else.""" + c = self.fp.read(1) + if c and c in OCTAL: # exclude EOF + self.oct = c + self._parse1 = self._parse_string_octal + return c + elif c and c in ESC_STRING: + self._curtoken += bytes((ESC_STRING[c],)) + elif c == b"\n": # Skip newline after backslash + pass + elif c == b"\r": # Also skip CRLF after + cc = self.fp.read(1) + # Put it back if it isn't \n + if cc and cc != b"\n": + self.fp.seek(-1, io.SEEK_CUR) + elif c == b"": + log.warning("EOF inside escape %r", self._curtoken) + else: + log.warning("Unrecognized escape %r", c) + self._curtoken += c + self._parse1 = self._parse_string + return c - PDF Reference 3.2.3 - """ - c = s[i : i + 1] - if OCT_STRING.match(c) and len(self.oct) < 3: + def _parse_string_octal(self) -> bytes: + """State for an octal escape.""" + c = self.fp.read(1) + if c and c in OCTAL: # exclude EOF self.oct += c - return i + 1 - - elif self.oct: + done = len(self.oct) >= 3 # it can't be > though + else: + if c: + self.fp.seek(-1, io.SEEK_CUR) + else: + log.warning("EOF in octal escape %r", self._curtoken) + done = True + if done: chrcode = int(self.oct, 8) - assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode) - self._curtoken += bytes((chrcode,)) + if chrcode >= 256: + # PDF1.7 p.16: "high-order overflow shall be ignored." 
+ log.warning("Invalid octal %r (%d)", self.oct, chrcode) + else: + self._curtoken += bytes((chrcode,)) + # Back to normal string parsing self._parse1 = self._parse_string - return i - - elif c in ESC_STRING: - self._curtoken += bytes((ESC_STRING[c],)) - - elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n": - # If current and next character is \r\n skip both because enters - # after a \ are ignored - i += 1 + return c - # default action - self._parse1 = self._parse_string - return i + 1 - - def _parse_wopen(self, s: bytes, i: int) -> int: - c = s[i : i + 1] + def _parse_wopen(self) -> bytes: + """State for start of dictionary or hex string.""" + c = self.fp.read(1) if c == b"<": self._add_token(KEYWORD_DICT_BEGIN) self._parse1 = self._parse_main - i += 1 else: + if c: + self.fp.seek(-1, io.SEEK_CUR) self._parse1 = self._parse_hexstring - return i + return c - def _parse_wclose(self, s: bytes, i: int) -> int: - c = s[i : i + 1] + def _parse_wclose(self) -> bytes: + """State for end of dictionary (accessed from initial state only)""" + c = self.fp.read(1) if c == b">": self._add_token(KEYWORD_DICT_END) - i += 1 - self._parse1 = self._parse_main - return i - - def _parse_hexstring(self, s: bytes, i: int) -> int: - m = END_HEX_STRING.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - token = HEX_PAIR.sub( - lambda m: bytes((int(m.group(0), 16),)), - SPC.sub(b"", self._curtoken), - ) - self._add_token(token) + else: + # Assuming this is a keyword (which means nothing) + self._add_token(KEYWORD_GT) + if c: + self.fp.seek(-1, io.SEEK_CUR) self._parse1 = self._parse_main - return j + return c + + def _parse_hexstring(self) -> bytes: + """State for parsing hexadecimal literal strings.""" + c = self.fp.read(1) + if not c: + log.warning("EOF in hex string %r", self._curtoken) + elif c in WHITESPACE: + pass + elif c in HEX: + self._curtoken += c + elif c == b">": + if len(self._curtoken) % 2 == 1: + 
self._curtoken += b"0" + token = unhexlify(self._curtoken) + self._add_token(token) + self._parse1 = self._parse_main + else: + log.warning("unexpected character %r in hex string %r", c, self._curtoken) + return c + + +LEXER = re.compile( + rb"""(?: + (?P \s+) + | (?P %[^\r\n]*[\r\n]) + | (?P /(?: \#[A-Fa-f\d][A-Fa-f\d] | [^#/%\[\]()<>{}\s])+ ) + | (?P [-+]? (?: \d*\.\d+ | \d+ ) ) + | (?P [A-Za-z] [^#/%\[\]()<>{}\s]*) + | (?P \([^()\\]*) + | (?P <[A-Fa-f\d\s]+>) + | (?P <<) + | (?P >>) + | (?P .) +) +""", + re.VERBOSE, +) +STRLEXER = re.compile( + rb"""(?: + (?P \\[0-7]{1,3}) + | (?P \\(?:\r\n?|\n)) + | (?P \\.) + | (?P \() + | (?P \)) + | (?P \r\n?|\n) + | (?P .) +)""", + re.VERBOSE, +) +HEXDIGIT = re.compile(rb"#([A-Fa-f\d][A-Fa-f\d])") +EOLR = re.compile(rb"\r\n?|\n") +SPC = re.compile(rb"\s") + + +class PSInMemoryParser: + """ + Parser for in-memory data streams. + """ + + def __init__(self, data: bytes) -> None: + self.data = data + self.pos = 0 + self.end = len(data) + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() + + def reinit(self, data: bytes) -> None: + """Reinitialize parser with a new buffer.""" + self.data = data + self.seek(0) + + def seek(self, pos: int) -> None: + """Seek to a position and reinitialize parser state.""" + self.pos = pos + self._curtoken = b"" + self._curtokenpos = 0 + self._tokens.clear() + + def tell(self) -> int: + """Get the current position in the buffer.""" + return self.pos + + def read(self, pos: int, objlen: int) -> bytes: + """Read data from a specified position, moving the current + position to the end of this data.""" + self.pos = min(pos + objlen, len(self.data)) + return self.data[pos : self.pos] + + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or \r\n.""" + if self.pos == self.end: + raise PSEOF + linepos = self.pos + m = EOLR.search(self.data, self.pos) + if m is None: + self.pos = self.end + else: + self.pos = m.end() + return (linepos, 
self.data[linepos : self.pos]) + + def revreadlines(self) -> Iterator[bytes]: + """Fetches a next line backwards. + + This is used to locate the trailers at the end of a file. So, + it isn't actually used in PSInMemoryParser, but is here for + completeness. + """ + endline = pos = self.end + while True: + nidx = self.data.rfind(b"\n", 0, pos) + ridx = self.data.rfind(b"\r", 0, pos) + best = max(nidx, ridx) + if best == -1: + yield self.data[:endline] + break + yield self.data[best + 1 : endline] + endline = best + 1 + pos = best + if pos > 0 and self.data[pos - 1 : pos + 1] == b"\r\n": + pos -= 1 + + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = -1 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. + + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. 
+ """ + tpos = self.data.find(target, self.pos) + if tpos != -1: + nextpos = tpos + len(target) + result = (tpos, self.data[self.pos : nextpos]) + self.pos = nextpos + return result + return (-1, b"") + + def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over tokens.""" + return self def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - while not self._tokens: - self.fillbuf() - self.charpos = self._parse1(self.buf, self.charpos) - token = self._tokens.pop(0) - log.debug("nexttoken: %r", token) - return token + """Get the next token in iteration, raising PSEOF when done.""" + try: + return self.__next__() + except StopIteration: + raise PSEOF + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + while True: + m = LEXER.match(self.data, self.pos) + if m is None: # can only happen at EOS + raise StopIteration + self._curtokenpos = m.start() + self.pos = m.end() + if m.lastgroup not in ("whitespace", "comment"): # type: ignore + # Okay, we got a token or something + break + self._curtoken = m[0] + if m.lastgroup == "name": # type: ignore + self._curtoken = m[0][1:] + self._curtoken = HEXDIGIT.sub( + lambda x: bytes((int(x[1], 16),)), self._curtoken + ) + try: + tok = LIT(self._curtoken.decode("utf-8")) + except UnicodeDecodeError: + tok = LIT(self._curtoken) + return (self._curtokenpos, tok) + if m.lastgroup == "number": # type: ignore + if b"." 
in self._curtoken: + return (self._curtokenpos, float(self._curtoken)) + else: + return (self._curtokenpos, int(self._curtoken)) + if m.lastgroup == "startdict": # type: ignore + return (self._curtokenpos, KEYWORD_DICT_BEGIN) + if m.lastgroup == "enddict": # type: ignore + return (self._curtokenpos, KEYWORD_DICT_END) + if m.lastgroup == "startstr": # type: ignore + return self._parse_endstr(self.data[m.start() + 1 : m.end()], m.end()) + if m.lastgroup == "hexstr": # type: ignore + self._curtoken = SPC.sub(b"", self._curtoken[1:-1]) + if len(self._curtoken) % 2 == 1: + self._curtoken += b"0" + return (self._curtokenpos, unhexlify(self._curtoken)) + # Anything else is treated as a keyword (whether explicitly matched or not) + if self._curtoken == b"true": + return (self._curtokenpos, True) + elif self._curtoken == b"false": + return (self._curtokenpos, False) + else: + return (self._curtokenpos, KWD(self._curtoken)) + + def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]: + """Parse the remainder of a string.""" + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts = [EOLR.sub(b"\n", start)] + paren = 1 + for m in STRLEXER.finditer(self.data, pos): + self.pos = m.end() + if m.lastgroup == "parenright": # type: ignore + paren -= 1 + if paren == 0: + # By far the most common situation! + break + parts.append(m[0]) + elif m.lastgroup == "parenleft": # type: ignore + parts.append(m[0]) + paren += 1 + elif m.lastgroup == "escape": # type: ignore + chr = m[0][1:2] + if chr not in ESC_STRING: + log.warning("Unrecognized escape %r", m[0]) + parts.append(chr) + else: + parts.append(bytes((ESC_STRING[chr],))) + elif m.lastgroup == "octal": # type: ignore + chrcode = int(m[0][1:], 8) + if chrcode >= 256: + # PDF1.7 p.16: "high-order overflow shall be + # ignored." 
+ log.warning("Invalid octal %r (%d)", m[0][1:], chrcode) + else: + parts.append(bytes((chrcode,))) + elif m.lastgroup == "newline": # type: ignore + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts.append(b"\n") + elif m.lastgroup == "linebreak": # type: ignore + pass + else: + parts.append(m[0]) + if paren != 0: + log.warning("Unterminated string at %d", pos) + raise StopIteration + return (self._curtokenpos, b"".join(parts)) # Stack slots may by occupied by any of: @@ -521,35 +823,53 @@ def nexttoken(self) -> Tuple[int, PSBaseParserToken]: PSStackEntry = Tuple[int, PSStackType[ExtraT]] -class PSStackParser(PSBaseParser, Generic[ExtraT]): - def __init__(self, fp: BinaryIO) -> None: - PSBaseParser.__init__(self, fp) +class PSStackParser(Generic[ExtraT]): + """Basic parser for PDF objects, can take a file or a `bytes` as + input.""" + + def __init__(self, reader: Union[BinaryIO, bytes]) -> None: + self.reinit(reader) + + def reinit(self, reader: Union[BinaryIO, bytes]) -> None: + """Reinitialize parser with a new file or buffer.""" + if isinstance(reader, bytes): + self._parser: Union[PSInMemoryParser, PSFileParser] = PSInMemoryParser( + reader + ) + else: + self._parser = PSFileParser(reader) self.reset() def reset(self) -> None: + """Reset parser state.""" self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = [] self.curtype: Optional[str] = None self.curstack: List[PSStackEntry[ExtraT]] = [] self.results: List[PSStackEntry[ExtraT]] = [] def seek(self, pos: int) -> None: - PSBaseParser.seek(self, pos) + """Seek to a position and reset parser state.""" + self._parser.seek(pos) self.reset() def push(self, *objs: PSStackEntry[ExtraT]) -> None: + """Push some objects onto the stack.""" self.curstack.extend(objs) def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: + """Pop some objects off the stack.""" objs = self.curstack[-n:] self.curstack[-n:] = [] return objs def popall(self) -> List[PSStackEntry[ExtraT]]: + """Pop all 
the things off the stack.""" objs = self.curstack self.curstack = [] return objs def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: + """Move some objects to the output.""" try: log.debug("add_results: %r", objs) except Exception: @@ -557,11 +877,13 @@ def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: self.results.extend(objs) def start_type(self, pos: int, type: str) -> None: + """Start a composite object (array, dict, etc).""" self.context.append((pos, self.curtype, self.curstack)) (self.curtype, self.curstack) = (type, []) log.debug("start_type: pos=%r, type=%r", pos, type) def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: + """End a composite object (array, dict, etc).""" if self.curtype != type: raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}") objs = [obj for (_, obj) in self.curstack] @@ -570,6 +892,11 @@ def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: return (pos, objs) def do_keyword(self, pos: int, token: PSKeyword) -> None: + """Handle a PDF keyword.""" + pass + + def flush(self) -> None: + """Get everything off the stack and into the output?""" pass def nextobject(self) -> PSStackEntry[ExtraT]: @@ -644,10 +971,49 @@ def nextobject(self) -> PSStackEntry[ExtraT]: if self.context: continue else: - self.flush() + self.flush() # Does nothing here, but in subclasses... (ugh) obj = self.results.pop(0) try: log.debug("nextobject: %r", obj) except Exception: log.debug("nextobject: (unprintable object)") return obj + + # Delegation follows + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or + \r\n.""" + return self._parser.nextline() + + def revreadlines(self) -> Iterator[bytes]: + """Fetches a next line backwards. + + This is used to locate the trailers at the end of a file. 
+ """ + return self._parser.revreadlines() + + def read(self, pos: int, objlen: int) -> bytes: + """Read data from a specified position, moving the current + position to the end of this data.""" + return self._parser.read(pos, objlen) + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising PSEOF when done.""" + try: + return self.__next__() + except StopIteration: + raise PSEOF + + def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker.""" + return self._parser.get_inline_data(target) + + def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over tokens.""" + return self + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + return self._parser.__next__() diff --git a/pdfminer/utils.py b/pdfminer/utils.py index a5b53852..88b44b98 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -75,11 +75,10 @@ def make_compat_bytes(in_str: str) -> bytes: def make_compat_str(o: object) -> str: """Converts everything to string, if bytes guessing the encoding.""" if isinstance(o, bytes): - enc = charset_normalizer.detect(o) - try: - return o.decode(enc["encoding"]) - except UnicodeDecodeError: + result = charset_normalizer.from_bytes(o) + if result is None: return str(o) + return str(result.best()) else: return str(o) diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py index a1599184..9d869ea9 100644 --- a/tests/test_pdfminer_psparser.py +++ b/tests/test_pdfminer_psparser.py @@ -1,13 +1,23 @@ import logging +import tempfile from io import BytesIO +from typing import Any, List, Tuple from pdfminer.psexceptions import PSEOF -from pdfminer.psparser import KWD, LIT, PSBaseParser, PSStackParser +from pdfminer.psparser import ( + KEYWORD_DICT_BEGIN, + KEYWORD_DICT_END, + KWD, + LIT, + PSFileParser, + 
PSInMemoryParser, + PSStackParser, +) logger = logging.getLogger(__name__) -class TestPSBaseParser: +class TestPSFileParser: """Simplistic Test cases""" TESTDATA = rb"""%!PS @@ -57,7 +67,7 @@ class TestPSBaseParser: (191, b""), (194, b" "), (199, b"@@ "), - (211, b"\xab\xcd\x00\x124\x05"), + (211, b"\xab\xcd\x00\x124\x50"), (226, KWD(b"func")), (230, LIT("a")), (232, LIT("b")), @@ -99,7 +109,7 @@ class TestPSBaseParser: (191, b""), (194, b" "), (199, b"@@ "), - (211, b"\xab\xcd\x00\x124\x05"), + (211, b"\xab\xcd\x00\x124\x50"), (230, LIT("a")), (232, LIT("b")), (234, [b"c"]), @@ -108,9 +118,7 @@ class TestPSBaseParser: ] def get_tokens(self, s): - from io import BytesIO - - class MyParser(PSBaseParser): + class MyParser(PSFileParser): def flush(self): self.add_results(*self.popall()) @@ -124,8 +132,6 @@ def flush(self): return r def get_objects(self, s): - from io import BytesIO - class MyParser(PSStackParser): def flush(self): self.add_results(*self.popall()) @@ -149,11 +155,243 @@ def test_2(self): logger.info(objs) assert objs == self.OBJS - def test_3(self): - """Regression test for streams that end with a keyword. 
- See: https://github.com/pdfminer/pdfminer.six/issues/884 - """ - parser = PSBaseParser(BytesIO(b"Do")) - parser._parse_keyword(b"Do", 0) - assert parser._tokens == [(0, KWD(b"Do"))] +TESTDATA = b""" +ugh +foo\r +bar\rbaz +quxx +bog""" +EXPECTED = [ + (0, b"\n"), + (1, b"ugh\n"), + (5, b"foo\r\n"), + (10, b"bar\r"), + (14, b"baz\n"), + (18, b"quxx\n"), + (23, b"bog"), +] + + +def run_parsers(data: bytes, expected: List[Any], makefunc: Any) -> None: + """Test stuff on both BytesIO and BinaryIO.""" + bp = PSInMemoryParser(data) + output = [] + func = makefunc(bp) + while True: + try: + output.append(func()) + except PSEOF: + break + assert output == expected + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + func = makefunc(fp) + output = [] + while True: + try: + output.append(func()) + except PSEOF: + break + assert output == expected + + +def test_nextline() -> None: + """Verify that we replicate the old nextline method.""" + run_parsers(TESTDATA, EXPECTED, lambda foo: foo.nextline) + + +def test_revreadlines() -> None: + """Verify that we replicate the old revreadlines method.""" + expected = list(reversed([line for pos, line in EXPECTED])) + + def make_next(parser: Any) -> Any: + itor = parser.revreadlines() + + def nextor() -> Any: + try: + line = next(itor) + except StopIteration: + raise PSEOF + return line + + return nextor + + run_parsers(TESTDATA, expected, make_next) + + +SIMPLE1 = b"""1 0 obj +<< + /Type /Catalog + /Outlines 2 0 R + /Pages 3 0 R +>> +endobj +""" +SIMPLETOK = [ + 1, + 0, + KWD(b"obj"), + KEYWORD_DICT_BEGIN, + LIT("Type"), + LIT("Catalog"), + LIT("Outlines"), + 2, + 0, + KWD(b"R"), + LIT("Pages"), + 3, + 0, + KWD(b"R"), + KEYWORD_DICT_END, + KWD(b"endobj"), +] + + +def list_parsers(data: bytes, expected: List[Any], discard_pos: bool = False) -> None: + bp = PSInMemoryParser(data) + if discard_pos: + tokens: List[Any] = [tok 
for pos, tok in list(bp)] + else: + tokens = list(bp) + assert tokens == expected + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + if discard_pos: + tokens = [tok for pos, tok in list(fp)] + else: + tokens = list(fp) + assert tokens == expected + + +def test_new_parser() -> None: + # Do a lot of them to make sure buffering works correctly + list_parsers(SIMPLE1 * 100, SIMPLETOK * 100, discard_pos=True) + + +def test_new_parser_eof() -> None: + # Make sure we get a keyword at eof + list_parsers(SIMPLE1[:-1], SIMPLETOK, discard_pos=True) + + +PAGE17 = b""" + /A;Name_With-Various***Characters? + /lime#20Green + /paired#28#29parentheses +""" + + +def test_new_parser1() -> None: + list_parsers(b"123.456", [(0, 123.456)]) + list_parsers(b"+.013", [(0, 0.013)]) + list_parsers(b"123", [(0, 123)]) + list_parsers(b"true false", [(0, True), (5, False)]) + list_parsers(b"(foobie bletch)", [(0, b"foobie bletch")]) + list_parsers(b"(foo", []) + + +def test_new_parser_names() -> None: + # Examples from PDF 1.7 page 17 + list_parsers( + PAGE17, + [ + (5, LIT("A;Name_With-Various***Characters?")), + (44, LIT("lime Green")), + (62, LIT("paired()parentheses")), + ], + ) + + +def test_new_parser_strings() -> None: + list_parsers( + rb"( Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . )", + [ + ( + 0, + rb" Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . ", + ) + ], + ) + list_parsers(b"()", [(0, b"")]) + list_parsers( + rb"""( These \ +two strings \ +are the same . ) + """, + [(0, b" These two strings are the same . 
")], + ) + list_parsers(b"(foo\rbar)", [(0, b"foo\nbar")]) + list_parsers(b"(foo\r)", [(0, b"foo\n")]) + list_parsers(b"(foo\r\nbar\r\nbaz)", [(0, b"foo\nbar\nbaz")]) + list_parsers(b"(foo\n)", [(0, b"foo\n")]) + list_parsers(rb"(foo\r\nbaz)", [(0, b"foo\r\nbaz")]) + list_parsers(rb"(foo\r\nbar\r\nbaz)", [(0, b"foo\r\nbar\r\nbaz")]) + list_parsers( + rb"( This string contains \245two octal characters\307 . )", + [(0, b" This string contains \245two octal characters\307 . ")], + ) + list_parsers(rb"(\0053 \053 \53)", [(0, b"\0053 \053 +")]) + list_parsers( + rb"< 4E6F762073686D6F7A206B6120706F702E >", [(0, b"Nov shmoz ka pop.")] + ) + list_parsers(rb"<73 686 D6F7A2>", [(0, b"shmoz ")]) + list_parsers(rb"(\400)", [(0, b"")]) + + +def test_invalid_strings_eof() -> None: + list_parsers(rb"(\00", []) + list_parsers(rb"(abracadab", []) + + +def inline_parsers( + data: bytes, + expected: Tuple[int, bytes], + target: bytes = b"EI", + nexttoken: Any = None, + blocksize: int = 16, +) -> None: + bp = PSInMemoryParser(data) + assert bp.get_inline_data(target=target, blocksize=blocksize) == expected + if nexttoken is not None: + assert bp.nexttoken() == nexttoken + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as outfh: + outfh.write(data) + with open(tf.name, "rb") as infh: + fp = PSFileParser(infh) + assert fp.get_inline_data(target=target, blocksize=blocksize) == expected + if nexttoken is not None: + assert fp.nexttoken() == nexttoken + + +def test_get_inline_data() -> None: + kwd_eio = KWD(b"EIO") + kwd_omg = KWD(b"OMG") + inline_parsers(b"""0123456789""", (-1, b"")) + inline_parsers(b"""0123456789EI""", (10, b"0123456789EI")) + inline_parsers( + b"""0123456789EIEIO""", (10, b"0123456789EI"), nexttoken=(12, kwd_eio) + ) + inline_parsers(b"""012EIEIO""", (3, b"012EI"), nexttoken=(5, kwd_eio), blocksize=4) + inline_parsers( + b"""0123012EIEIO""", (7, b"0123012EI"), nexttoken=(9, kwd_eio), blocksize=4 + ) + for blocksize in range(1, 8): + 
inline_parsers( + b"""012EIEIOOMG""", + ( + 3, + b"012EIEIO", + ), + target=b"EIEIO", + nexttoken=(8, kwd_omg), + blocksize=blocksize, + ) diff --git a/tools/dumppdf.py b/tools/dumppdf.py index f88389a9..630e4f8d 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -203,7 +203,6 @@ def resolve_dest(dest: object) -> Any: outfp.write("\n") except PDFNoOutlines: pass - parser.close() fp.close() From 1bb4cae8bb2016bdde6fcdd807b2f8f64b993757 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 19 Sep 2024 07:59:09 -0400 Subject: [PATCH 2/4] fix: make sure it is really bytes in font.decode --- pdfminer/pdfdevice.py | 8 ++++++++ pdfminer/pdffont.py | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 2374601c..6e0c58b9 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -168,6 +168,10 @@ def render_string_horizontal( x -= obj * dxscale needcharspace = True else: + if isinstance(obj, str): + obj = utils.make_compat_bytes(obj) + if not isinstance(obj, bytes): + continue for cid in font.decode(obj): if needcharspace: x += charspace @@ -208,6 +212,10 @@ def render_string_vertical( y -= obj * dxscale needcharspace = True else: + if isinstance(obj, str): + obj = utils.make_compat_bytes(obj) + if not isinstance(obj, bytes): + continue for cid in font.decode(obj): if needcharspace: y += charspace diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index e3c51d73..5b0402fd 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -899,8 +899,8 @@ def is_vertical(self) -> bool: def is_multibyte(self) -> bool: return False - def decode(self, bytes: bytes) -> Iterable[int]: - return bytearray(bytes) # map(ord, bytes) + def decode(self, data: bytes) -> Iterable[int]: + return data def get_ascent(self) -> float: """Ascent above the baseline, in text space units""" @@ -1173,8 +1173,8 @@ def is_vertical(self) -> bool: def is_multibyte(self) -> bool: return True - def decode(self, 
bytes: bytes) -> Iterable[int]: - return self.cmap.decode(bytes) + def decode(self, data: bytes) -> Iterable[int]: + return self.cmap.decode(data) def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]: """Returns an integer for horizontal fonts, a tuple for vertical fonts.""" From 4c7d494d70bdc5e68e51c76820c1a7b7db861cb4 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 19 Sep 2024 10:00:26 -0400 Subject: [PATCH 3/4] fix: a couple of invalid PDF fuzz cases --- pdfminer/pdfparser.py | 7 ++++++- pdfminer/utils.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 645e0dec..dae1af59 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -146,7 +146,12 @@ def flush(self) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_R: # reference to indirect object - (_, _object_id), _ = self.pop(2) + try: + (_, _object_id), _ = self.pop(2) + except ValueError: + raise PDFSyntaxError( + "Expected generation and object id in indirect object reference" + ) object_id = safe_int(_object_id) if object_id is not None: obj = PDFObjRef(self.doc, object_id) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 88b44b98..277cfe15 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -245,7 +245,7 @@ def parse_rect(o: Any) -> Rect: try: (x0, y0, x1, y1) = o return float(x0), float(y0), float(x1), float(y1) - except ValueError: + except (ValueError, TypeError): raise PDFValueError("Could not parse rectangle") From 6e9d73f6c42fd2ad6772896c8f4ce1336af12b06 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 19 Sep 2024 17:34:36 -0400 Subject: [PATCH 4/4] fix: match behaviour between PSFile / PSInMemory parser --- pdfminer/psparser.py | 23 +++++++++++++++++------ tests/test_pdfminer_psparser.py | 8 +++++--- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 
0075b345..9e1feeb4 100755
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@@ -399,15 +399,26 @@ def _parse_literal(self) -> bytes:
     def _parse_literal_hex(self) -> bytes:
         """State for escaped hex characters in literal names"""
         # Consume a hex digit only if we can ... consume a hex digit
+        if len(self.hex) >= 2:  # it actually can't exceed 2
+            self._curtoken += bytes((int(self.hex, 16),))
+            self._parse1 = self._parse_literal
+            return b"/"
         c = self.fp.read(1)
-        if c and c in HEX and len(self.hex) < 2:
+        if c and c in HEX:
             self.hex += c
         else:
-            if c:
+            if c:  # not EOF, but not hex either
+                log.warning("Invalid hex digit %r in literal", c)
                 self.fp.seek(-1, io.SEEK_CUR)
-        if self.hex:
-            self._curtoken += bytes((int(self.hex, 16),))
-        self._parse1 = self._parse_literal
+            # Add the intervening junk, just in case
+            try:
+                tok = LIT(self._curtoken.decode("utf-8"))
+            except UnicodeDecodeError:
+                tok = LIT(self._curtoken)
+            self._add_token(tok)
+            self._curtokenpos = self.tell() - 1 - len(self.hex)
+            self._add_token(KWD(b"#" + self.hex))
+            self._parse1 = self._parse_main
         return c
 
     def _parse_number(self) -> bytes:
@@ -597,7 +608,7 @@ def _parse_hexstring(self) -> bytes:
     | (?P<number> [-+]? (?: \d*\.\d+ | \d+ ) )
     | (?P<keyword> [A-Za-z] [^#/%\[\]()<>{}\s]*)
     | (?P<startstr> \([^()\\]*)
-    | (?P<hexstr> <[A-Fa-f\d\s]+>)
+    | (?P<hexstr> <[A-Fa-f\d\s]*>)
     | (?P<startdict> <<)
     | (?P<enddict> >>)
     | (?P<other> .)
diff --git a/tests/test_pdfminer_psparser.py b/tests/test_pdfminer_psparser.py index 9d869ea9..ad375383 100644 --- a/tests/test_pdfminer_psparser.py +++ b/tests/test_pdfminer_psparser.py @@ -49,7 +49,9 @@ class TestPSFileParser: (23, LIT("a")), (25, LIT("BCD")), (30, LIT("Some_Name")), - (41, LIT("foo_xbaa")), + (41, LIT("foo_")), + (48, KWD(b"#")), + (49, KWD(b"xbaa")), (54, 0), (56, 1), (59, -2), @@ -91,7 +93,7 @@ class TestPSFileParser: (23, LIT("a")), (25, LIT("BCD")), (30, LIT("Some_Name")), - (41, LIT("foo_xbaa")), + (41, LIT("foo_")), (54, 0), (56, 1), (59, -2), @@ -136,7 +138,7 @@ class MyParser(PSStackParser): def flush(self): self.add_results(*self.popall()) - parser = MyParser(BytesIO(s)) + parser = MyParser(s) r = [] try: while True: