""" Implementation of PDF object types and other generic functionality. The internals were imported from PyPDF2, with modifications. See :ref:`here ` for the original license of the PyPDF2 project. """ import binascii import codecs import decimal import enum import logging import os import re import typing from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone from io import BytesIO from typing import Any, Callable, Iterator, Optional, Tuple, Union from .misc import ( IndirectObjectExpected, PdfError, PdfReadError, PdfStreamError, PdfStrictReadError, PdfWriteError, is_regular_character, read_non_whitespace, read_until_delimiter, read_until_regex, skip_over_whitespace, ) if typing.TYPE_CHECKING: from .crypt.api import SecurityHandler __all__ = [ 'Dereferenceable', 'Reference', 'TrailerReference', 'PdfObject', 'IndirectObject', 'NullObject', 'BooleanObject', 'FloatObject', 'NumberObject', 'ByteStringObject', 'TextStringObject', 'NameObject', 'ArrayObject', 'DictionaryObject', 'StreamObject', 'read_object', 'pdf_name', 'pdf_string', 'pdf_date', 'TextStringEncoding', 'EncryptedObjAccess', 'DecryptedObjectProxy', ] OBJECT_PREFIXES = b'/<[tf(n%' NUMBER_SIGNS = b'+-' INDIRECT_PATTERN = re.compile(r"(\d+)\s+(\d+)\s+R[^a-zA-Z]".encode('ascii')) logger = logging.getLogger(__name__) class EncryptedObjAccess(enum.Enum): """ Defines what to do when an encrypted object is encountered when retrieving an object from a container. """ PROXY = 0 """ Return the proxy object as-is, and leave further encryption/decryption handling to the caller. """ TRANSPARENT = 1 """ Transparently decrypt the proxy's content (similarly wrapping any sub-containers in :class:`.DecryptedObjectProxy`, so this applies recursively). .. note:: This is the default in most situations, since it's the least likely to get in the way of any APIs that are not explicitly aware of content encryption concerns. """ RAW = 2 """ Return the underlying raw object as written, without attempting or deferring decryption. """ def _deproxy_decrypt(obj, eoa: EncryptedObjAccess): if isinstance(obj, DecryptedObjectProxy): if eoa == EncryptedObjAccess.TRANSPARENT: return obj.decrypted elif eoa == EncryptedObjAccess.RAW: return obj.raw_object return obj class Dereferenceable: """ Represents an opaque reference to a PDF object associated with a PDF Handler (see :class:`PdfHandler <.rw_common.PdfHandler>`). This can either be a reference to an object with an object ID (see :class:`.Reference`) or a reference to the trailer of a PDF document (see :class:`.TrailerReference`). """ def get_object(self) -> 'PdfObject': """Retrieve the PDF object backing this dereferenceable. :return: A :class:`.PdfObject`. """ raise NotImplementedError def get_pdf_handler(self): """Return the PDF handler associated with this dereferenceable. :return: a :class:`~.rw_common.PdfHandler`. """ raise NotImplementedError class TrailerReference(Dereferenceable): """ A reference to the trailer of a PDF document. .. warning:: Since the trailer does not have a well-defined object ID in files with "classical" cross-reference tables (as opposed to cross-reference streams), this is not a subclass of :class:`.Reference`. :param reader: a :class:`~pyhanko.pdf_utils.reader.PdfFileReader` """ def __init__(self, reader): self.reader = reader def get_object(self) -> 'PdfObject': return self.reader.trailer def get_pdf_handler(self): return self.reader @dataclass(frozen=True) class Reference(Dereferenceable): """ A reference to an object with a certain ID and generation number, with a PDF handler attached to it. .. warning:: Contrary to what one might expect, the generation number does *not* indicate the document revision in which the object was modified. In fact, nonzero generation numbers are exceedingly rare these days; in most real-world PDF files, objects are simply overridden without ever increasing the generation number. Except in very specific circumstances, dereferencing a :class:`.Reference` will return the most recent version of the object with the stated object ID and generation number. """ idnum: int """ The object's ID. """ generation: int = 0 """ The object's generation number (usually `0`) """ pdf: object = field(repr=False, hash=False, compare=False, default=None) """ The PDF handler associated with this reference, an instance of :class:`~.rw_common.PdfHandler`. .. warning:: This field is ignored when hashing or comparing :class:`.Reference` objects, so it is the API user's responsibility to not mix up references originating from unrelated PDF handlers. """ def get_object(self) -> 'PdfObject': if self.pdf is None: return NullObject() from pyhanko.pdf_utils.rw_common import PdfHandler assert isinstance(self.pdf, PdfHandler) return self.pdf.get_object(self).get_object() def get_pdf_handler(self): return self.pdf def read_object( stream, container_ref: 'Dereferenceable', as_metadata_stream: bool = False ) -> 'PdfObject': """ Read a PDF object from an input stream. .. note:: The `container_ref` parameter tells the API which reference to register when the returned object is modified in an incremental update. See also here :ref:`here ` for further information. :param stream: An input stream. :param container_ref: A reference to an object containing this one. *Note:* It is perfectly possible (and common) for `container_ref` to resolve to the return value of this function. :param as_metadata_stream: Whether to dereference the object as an XMP metadata stream. :return: A :class:`.PdfObject`. """ tok = stream.read(1) stream.seek(-1, os.SEEK_CUR) # reset to start idx = OBJECT_PREFIXES.find(tok) if idx == 0: # name object result = NameObject.read_from_stream(stream) elif idx == 1: # hexadecimal string OR dictionary peek = stream.read(2) stream.seek(-2, os.SEEK_CUR) # reset to start if peek == b'<<': result = DictionaryObject.read_from_stream( stream, container_ref, as_metadata_stream=as_metadata_stream ) else: result = read_hex_string_from_stream(stream) elif idx == 2: # array object result = ArrayObject.read_from_stream(stream, container_ref) elif idx == 3 or idx == 4: # boolean object result = BooleanObject.read_from_stream(stream) elif idx == 5: # string object result = read_string_from_stream(stream) elif idx == 6: # null object result = NullObject.read_from_stream(stream) elif idx == 7: # comment while tok not in (b'\r', b'\n'): tok = stream.read(1) read_non_whitespace(stream) stream.seek(-1, os.SEEK_CUR) result = read_object(stream, container_ref) else: # number object OR indirect reference if tok in NUMBER_SIGNS: # number result = NumberObject.read_from_stream(stream) else: peek = stream.read(20) stream.seek(-len(peek), os.SEEK_CUR) # reset to start if INDIRECT_PATTERN.match(peek) is not None: result = IndirectObject.read_from_stream(stream, container_ref) else: result = NumberObject.read_from_stream(stream) result.container_ref = container_ref return result class PdfObject: """Superclass for all PDF objects.""" container_ref: Optional[Dereferenceable] = None """ For objects read from a file, `container_ref` points to the unique addressable object containing this object. .. _container-ref-example: .. note:: Consider the following object definition in a PDF file: .. code-block:: text 4 0 obj << /Foo (Bar) >> This declares a dictionary with ID `4`, but the values ``/Foo`` and ``(Bar)`` are also PDF objects (a name and a string, respectively). All of these will have `container_ref` given by a :class:`.Reference` with object ID `4` and generation number `0`. If an object is part of the trailer of a PDF file, `container_ref` will be a :class:`.TrailerReference`. For newly created objects (i.e. those not read from a file), `container_ref` is always ``None``. """ # TODO simplify a number of modification routines using this new API def get_container_ref(self) -> Dereferenceable: """ Return a reference to the closest parent object containing this object. Raises an error if no such reference can be found. """ ref = self.container_ref if ref is None: # pragma: nocover raise PdfReadError( 'No container reference available. This object probably ' 'wasn\'t read from a file.' ) return ref def get_object(self): """Resolves indirect references. :return: `self`, unless an instance of :class:`.IndirectObject`. """ return self def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref: Optional[Reference] = None, ): """ Abstract method to render this object to an output stream. :param stream: An output stream. :param container_ref: Local encryption key. :param handler: Security handler """ raise NotImplementedError class NullObject(PdfObject): """ PDF `null` object. All instances are treated as equal and falsy. """ def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): stream.write(b"null") @staticmethod def read_from_stream(stream): nulltxt = stream.read(4) if nulltxt != b"null": raise PdfReadError("Could not read Null object") return NullObject() def __eq__(self, other): return self is other or isinstance(other, NullObject) def __hash__(self): return hash(None) def __bool__(self): return False class BooleanObject(PdfObject): """PDF boolean value.""" def __init__(self, value): self.value = value def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): if self.value: stream.write(b"true") else: stream.write(b"false") @staticmethod def read_from_stream(stream): word = stream.read(4) if word == b"true": return BooleanObject(True) elif word == b"fals": if stream.read(1) == b"e": return BooleanObject(False) raise PdfReadError('Could not read Boolean object') def __bool__(self): return bool(self.value) def __eq__(self, other): return isinstance(other, (BooleanObject, bool)) and bool(self) == bool( other ) def __str__(self): return str(bool(self)) def __repr__(self): return str(self) class ArrayObject(list, PdfObject): """ PDF array object. This class extends from Python's list class, and supports its interface. .. warning:: Contrary to the case of dictionary objects, PyPDF2 does not transparently dereference array entries when accessed using :meth:`__getitem__`. For usability & consistency reasons, I decided to depart from that and dereference automatically. This makes the behaviour of :class:`.ArrayObject` consistent with :class:`.DictionaryObject`. That said, some vestiges of the old PyPDF2 behaviour may linger in the codebase. I'll fix those as I get to them. """ def __getitem__(self, index): return self.raw_get(index).get_object() def raw_get( self, index, decrypt: EncryptedObjAccess = EncryptedObjAccess.TRANSPARENT, ): """ .. versionchanged:: 0.14.0 ``decrypt`` parameter is no longer boolean Get a value from an array without dereferencing. In other words, if the value corresponding to the given key is of type :class:`.IndirectObject`, the indirect reference will not be resolved. :param index: Key to look up in the dictionary. :param decrypt: What to do when retrieving encrypted objects; see :class:`.EncryptedObjAccess`. The default is :attr:`.EncryptedObjAccess.TRANSPARENT`. :return: A :class:`.PdfObject`. """ val = list.__getitem__(self, index) return _deproxy_decrypt(val, decrypt) def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): stream.write(b"[") for data in self: stream.write(b" ") data.write_to_stream( stream, handler=handler, container_ref=container_ref ) stream.write(b" ]") @staticmethod def read_from_stream(stream, container_ref): arr = ArrayObject() tmp = stream.read(1) if tmp != b"[": raise PdfReadError("Could not read array") while True: # skip leading whitespace & check for array ending peekahead = read_non_whitespace(stream) if peekahead == b"]": break stream.seek(-1, os.SEEK_CUR) # read and append obj arr.append(read_object(stream, container_ref)) return arr class IndirectObject(PdfObject, Dereferenceable): """ Thin wrapper around a :class:`.Reference`, implementing both the :class:`.Dereferenceable` and :class:`.PdfObject` interfaces. .. warning:: For many purposes, this class is functionally interchangeable with :class:`.Reference`, with one important exception: :class:`.IndirectObject` instances pointing to the same reference but occurring at different locations in the file may have distinct `container_ref` values. """ def __init__(self, idnum, generation, pdf): self.reference = Reference(idnum, generation, pdf) def get_object(self): """ :return: The PDF object this reference points to. """ obj = self.reference.get_object() # there are few legitimate use cases for indirect references # pointing to indirect references, but the standard doesn't forbid # them, so we have to support them. # TODO protect against reference loops? return obj.get_object() if isinstance(obj, IndirectObject) else obj def get_pdf_handler(self): return self.reference.get_pdf_handler() @property def idnum(self) -> int: """ :return: the object ID of this reference. """ return self.reference.idnum @property def generation(self): """ :return: the generation number of this reference. """ return self.reference.generation def __repr__(self): return "IndirectObject(%r, %r)" % (self.idnum, self.generation) # TODO I'm starting to think that making indirect objects hashable # is a bad idea. Think about that for a bit, I might just be getting # overly pedantic. def __hash__(self): return hash((self.idnum, self.generation)) def __eq__(self, other): return ( other is not None and isinstance(other, IndirectObject) and self.reference == other.reference ) def __ne__(self, other): return not self.__eq__(other) def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): stream.write(b"%d %d R" % (self.idnum, self.generation)) @staticmethod def read_from_stream(stream, container_ref: 'Dereferenceable'): idnum_str = b"" while True: tok = stream.read(1) if not tok: # stream has truncated prematurely raise PdfStreamError("Stream has ended unexpectedly") if tok.isspace(): if not idnum_str: continue break idnum_str += tok generation_str = b"" while True: tok = stream.read(1) if not tok: # stream has truncated prematurely raise PdfStreamError("Stream has ended unexpectedly") if tok.isspace(): if not generation_str: continue break generation_str += tok r = read_non_whitespace(stream) if r != b"R": pos = hex(stream.tell()) raise PdfReadError( "Error reading indirect object reference at byte %s" % pos ) try: idnum, generation = int(idnum_str), int(generation_str) if not (idnum > 0 and generation >= 0): raise ValueError except ValueError: pos = hex(stream.tell()) raise PdfReadError( f"Parse error on indirect object reference around {pos}" ) return IndirectObject( int(idnum_str), int(generation_str), container_ref.get_pdf_handler() ) class FloatObject(decimal.Decimal, PdfObject): """ PDF Float object. Internally, these are treated as decimals (and therefore actually fixed-point objects, to be precise). """ def __new__(cls, value="0"): return decimal.Decimal.__new__(cls, str(value)) def __repr__(self): if self == self.to_integral(): return str(self.quantize(decimal.Decimal(1))) else: return str(self) def as_numeric(self): """ :return: a Python ``float`` value for this object. """ return float(self) def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): stream.write(repr(self).encode('ascii')) class NumberObject(int, PdfObject): """ PDF number object. This is the PDF type for integer values. """ NumberPattern = re.compile(b'[^+-.0-9]') ByteDot = b"." # noinspection PyArgumentList def __new__(cls, value): val = int(value) return int.__new__(cls, val) def as_numeric(self): """ :return: a Python ``int`` value for this object. """ return int(self) def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): stream.write(repr(self).encode('ascii')) @staticmethod def read_from_stream(stream): num = read_until_regex( stream, regex=NumberObject.NumberPattern, # for consistency with other read_object() output ignore_eof=True, ) if num.find(NumberObject.ByteDot) != -1: return FloatObject(num.decode('ascii')) else: return NumberObject(num.decode('ascii')) # TODO: not sure I like this behaviour of PyPDF2. Review. def pdf_string( string: Union[str, bytes, bytearray] ) -> Union['ByteStringObject', 'TextStringObject']: """ Encode a string as a :class:`.TextStringObject` if possible, or a :class:`.ByteStringObject` otherwise. :param string: A Python string. """ if isinstance(string, str): return TextStringObject(string) elif isinstance(string, (bytes, bytearray)): guessed = _guess_enc_by_bom(string) try: retval = TextStringObject(guessed.decode(string)) retval.autodetected_encoding = guessed return retval except UnicodeDecodeError: return ByteStringObject(string) else: raise TypeError("pdf_string should have str or bytes arg") HEX_DIGITS = b'0123456789abcdefABCDEF' def read_hex_string_from_stream( stream, ) -> Union['ByteStringObject', 'TextStringObject']: """ Read a hex string from a stream into a PDF string object. :param stream: An input stream. """ stream.read(1) odd = False def read_tokens(): nonlocal odd while True: tok = read_non_whitespace(stream) if tok == b">": return elif tok not in HEX_DIGITS: raise PdfStreamError( "Unexpected token in hex string: " + repr(tok) ) yield tok odd = not odd result = binascii.unhexlify( b''.join(read_tokens()) + (b'0' if odd else b'') ) return pdf_string(result) def _read_string_literal_bytes(stream) -> bytes: stream.read(1) parens = 1 txt = BytesIO() while True: tok = stream.read(1) if not tok: # stream has truncated prematurely raise PdfStreamError("Stream has ended unexpectedly") if tok == b"(": parens += 1 elif tok == b")": parens -= 1 if parens == 0: break elif tok == b"\\": tok = stream.read(1) if tok in b"() /%<>[]#_&$\\": pass # simply use the second byte we read elif tok == b"n": tok = b"\n" elif tok == b"r": tok = b"\r" elif tok == b"t": tok = b"\t" elif tok == b"b": tok = b"\b" elif tok == b"f": tok = b"\f" elif tok.isdigit(): # "The number ddd may consist of one, two, or three # octal digits; high-order overflow shall be ignored. # Three octal digits shall be used, with leading zeros # as needed, if the next character of the string is also # a digit." (PDF reference 7.3.4.2, p 16) for i in range(2): ntok = stream.read(1) if ntok.isdigit(): tok += ntok else: # premature end, seek back stream.seek(-1, os.SEEK_CUR) break octal = int(tok, base=8) # interpret as byte tok = bytes((octal,)) elif tok in b"\n\r": # This case is hit when a backslash followed by a line # break occurs. If it's a multi-char EOL, consume the # second character: tok = stream.read(1) if tok not in b"\n\r": stream.seek(-1, os.SEEK_CUR) # Then don't add anything to the actual string, since this # line break was escaped: tok = b'' else: raise PdfReadError("Unexpected escaped string: " + repr(tok)) txt.write(tok) return txt.getvalue() def read_string_from_stream( stream, ) -> Union['ByteStringObject', 'TextStringObject']: """ Read a PDF string literal from a stream. Attempt to decode it into a text string by autodetecting the encoding, or failing that, return it as a byte string instead. :param stream: An input stream. """ return pdf_string(_read_string_literal_bytes(stream)) class ByteStringObject(bytes, PdfObject): """PDF bytestring class.""" original_bytes = property(lambda self: self) """ For compatibility with :attr:`.TextStringObject.original_bytes` """ def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): bytearr: bytes = self if handler is not None and container_ref is not None: cf = handler.get_string_filter() local_key = cf.derive_object_key( container_ref.idnum, container_ref.generation ) bytearr = cf.encrypt(local_key, bytearr) stream.write(b"<") stream.write(binascii.hexlify(bytearr)) stream.write(b">") class TextStringEncoding(enum.Enum): """ Encodings for PDF text strings. """ PDF_DOC = None """ PDFDocEncoding (one-byte character codes; PDF-specific). """ UTF16BE = (codecs.BOM_UTF16_BE, 'utf-16be') """ UTF-16BE encoding. """ UTF8 = (codecs.BOM_UTF8, 'utf-8') """ UTF-8 encoding (PDF 2.0) """ UTF16LE = (codecs.BOM_UTF16_LE, 'utf-16le') """ UTF-16LE encoding. .. note:: This is strictly speaking invalid in PDF 2.0, but some authoring tools output such strings anyway (presumably due to the fact that it's the default wide character encoding on Windows). """ def encode(self, string: str) -> bytes: """ Encode a string with BOM. :param string: The string to encode. :return: The encoded string. """ if self == TextStringEncoding.PDF_DOC: return encode_pdfdocencoding(string) else: bom, enc = self.value return bom + string.encode(enc) def decode(self, string: Union[bytes, bytearray]) -> str: """ Decode a string with BOM. :param string: The string to encode. :return: The encoded string. :raise UnicodeDecodeError: Raised if decoding fails. """ if self == TextStringEncoding.PDF_DOC: return decode_pdfdocencoding(string) elif self == TextStringEncoding.UTF8: return string.decode('utf-8-sig') else: return string.decode('utf-16') def _guess_enc_by_bom(encoded: Union[bytes, bytearray]) -> TextStringEncoding: if encoded.startswith(codecs.BOM_UTF16_BE): return TextStringEncoding.UTF16BE elif encoded.startswith(codecs.BOM_UTF16_LE): return TextStringEncoding.UTF16LE elif encoded.startswith(codecs.BOM_UTF8): return TextStringEncoding.UTF8 else: # This is probably a big performance hit here, but we need to # convert string objects into the text/unicode-aware version if # possible... and the only way to check if that's possible is # to try. Some strings are strings, some are just byte arrays. return TextStringEncoding.PDF_DOC class TextStringObject(str, PdfObject): """ PDF text string object. """ autodetected_encoding: Optional[TextStringEncoding] = None """ Autodetected encoding when parsing the file. """ force_output_encoding: Optional[TextStringEncoding] = None """ Output encoding to use when serialising the string. The default is to try PDFDocEncoding first, and fall back to UTF-16BE. """ @property def original_bytes(self): """ Retrieve the original bytes of the string as specified in the source file. This may be necessary if this string was misidentified as a text string. """ # We're a text string object, but the library is trying to get our raw # bytes. This can happen if we auto-detected this string as text, but # we were wrong. It's pretty common. Return the original bytes that # would have been used to create this object, based upon the autodetect # method. if self.autodetected_encoding: return self.autodetected_encoding.encode(self) else: raise PdfError("No information about original bytes") def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): encoded: bytes if self.force_output_encoding is not None: encoded = self.force_output_encoding.encode(self) else: # Try to write the string out as a PDFDocEncoding encoded string. # It's nicer to look at in the PDF file. Sadly, we take a # performance hit here for trying... try: encoded = encode_pdfdocencoding(self) except UnicodeEncodeError: # fall back to UTF-16BE by default, since it's the only # valid pre-2.0 Unicode encoding. encoded = codecs.BOM_UTF16_BE + self.encode("utf-16be") cf = None if handler is not None and container_ref is not None: cf_name = handler.crypt_filter_config.string_filter_name # apply default processing if the filter is the identity filter cf = None if cf_name == '/Identity' else handler.get_string_filter() if cf is not None: local_key = cf.derive_object_key( container_ref.idnum, container_ref.generation ) encoded = cf.encrypt(local_key, encoded) obj = ByteStringObject(encoded) obj.write_to_stream(stream) else: stream.write(b"(") for c in encoded: c_ = bytes([c]) if not c_.isalnum() and c != 0x20: stream.write(b"\\%03o" % c) else: stream.write(c_) stream.write(b")") def _as_hex_digit(ascii_char): if 0x30 <= ascii_char <= 0x39: return ascii_char - 0x30 elif 0x41 <= ascii_char <= 0x46: return ascii_char - 0x37 elif 0x61 <= ascii_char <= 0x66: return ascii_char - 0x57 else: raise PdfReadError( "Numeric escape in PDF name must use hexadecimal digits" ) def _decode_name(name_bytes: bytes) -> 'NameObject': """ Decode the bytes that make up a name object (minus the initial /), expanding all escapes along the way. """ result = BytesIO() result.write(b'/') name_iter = iter(name_bytes) try: while True: cur_byte = next(name_iter) if cur_byte == 0x23: # '#' is the 2-digit escape prefix # escape sequence: grab next two bytes try: digit1 = next(name_iter) digit2 = next(name_iter) except StopIteration: raise PdfReadError( f"Unterminated escape in PDF name /{repr(name_bytes)}" ) cur_byte = _as_hex_digit(digit1) * 16 + _as_hex_digit(digit2) elif not (0x21 <= cur_byte <= 0x7E) or not is_regular_character( cur_byte ): raise PdfReadError( f"Byte (0x{cur_byte:02x}) must be escaped in a PDF name" ) result.write(bytes((cur_byte,))) except StopIteration: pass name_bytes = result.getvalue() # NOTE: we assume UTF-8, but the PDF spec actually doesn't prescribe # a character encoding for names, they're just byte sequences. # This doesn't matter in 99.99% of cases (since names are not supposed # to contain renderable text, and are typically 7-bit ASCII anyhow), # but it's not 100% correct. I don't see a way to fix this without causing # massive non-obvious API breakage (since NameObject inherits from 'str' as # in PyPDF2), i.e. the correctness benefit is vastly outweighed by the # risks (for now) encodings_to_try = ('utf8', 'latin1') # latin1 should never trigger decoding errors, since Python's implementation # maps even unassigned values to corresponding unicode codepoints name_str = None for enc in encodings_to_try: try: name_str = name_bytes.decode(enc) break except ValueError: pass assert name_str is not None return NameObject(name_str) class NameObject(str, PdfObject): """ PDF name object. These are valid Python strings, but names and strings are treated differently in the PDF specification, so proper care is required. """ def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): byte_iter = iter(self.encode('utf8')) if not next(byte_iter) == 0x2F: raise PdfWriteError( f"Could not serialise name object {repr(self)}, " f"must start with /" ) stream.write(b'/') for cur_byte in byte_iter: if ( cur_byte == 0x23 or not (0x21 <= cur_byte <= 0x7E) or not is_regular_character(cur_byte) ): stream.write('#{:X}'.format(cur_byte).encode('ascii')) else: # no convenient syntax for writing a single byte... as_bytes = bytes((cur_byte,)) stream.write(as_bytes) @staticmethod def read_from_stream(stream): name_start = stream.read(1) if name_start != b'/': raise PdfReadError("Name object should start with /") name_bytes = read_until_delimiter(stream) return _decode_name(name_bytes) def _normalise_key(key): if not isinstance(key, NameObject): if isinstance(key, str): return NameObject(key) else: raise ValueError("key must be a name object") return key class DictionaryObject(dict, PdfObject): """ A PDF dictionary object. Keys in a PDF dictionary are PDF names, and values are PDF objects. When accessing a key using the standard :meth:`__getitem__` syntax, :class:`.IndirectObject` references will be resolved. """ def __init__(self, dict_data=None): if dict_data is not None: super().__init__( {_normalise_key(k): v for k, v in dict_data.items()} ) else: super().__init__() def raw_get( self, key: Union[NameObject, str], decrypt: EncryptedObjAccess = EncryptedObjAccess.TRANSPARENT, ): """ .. versionchanged:: 0.14.0 ``decrypt`` parameter is no longer boolean Get a value from a dictionary without dereferencing. In other words, if the value corresponding to the given key is of type :class:`.IndirectObject`, the indirect reference will not be resolved. :param key: Key to look up in the dictionary. :param decrypt: What to do when retrieving encrypted objects; see :class:`.EncryptedObjAccess`. The default is :attr:`.EncryptedObjAccess.TRANSPARENT`. :return: A :class:`.PdfObject`. """ val = dict.__getitem__(self, key) return _deproxy_decrypt(val, decrypt) def __setitem__(self, key, value): key = _normalise_key(key) if not isinstance(value, PdfObject): raise ValueError("value must be PdfObject") if self.container_ref is not None: value.container_ref = self.container_ref return dict.__setitem__(self, key, value) def setdefault(self, key, value=None): key = _normalise_key(key) if not isinstance(value, PdfObject): raise ValueError("value must be PdfObject") if self.container_ref is not None: value.container_ref = self.container_ref return dict.setdefault(self, key, value) def __getitem__(self, key): raw_obj = dict.__getitem__(self, key) if key == '/Metadata' and isinstance(raw_obj, IndirectObject): from pyhanko.pdf_utils.rw_common import PdfHandler handler = raw_obj.get_pdf_handler() assert isinstance(handler, PdfHandler) return handler.get_object( raw_obj.reference, as_metadata_stream=True ) else: deref_obj = raw_obj.get_object() if isinstance(deref_obj, NullObject): raise KeyError(key) else: return deref_obj def get_and_apply( self, key, function: Callable[[PdfObject], Any], *, raw=False, default=None, ): try: value = self.raw_get(key) if raw else self[key] except KeyError: return default return function(value) def get_value_as_reference(self, key, optional=False) -> Reference: def as_ref(obj): if isinstance(obj, IndirectObject): return obj.reference raise IndirectObjectExpected value = self.get_and_apply(key, as_ref, raw=True) if value is None and not optional: raise KeyError return value def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): stream.write(b"<<\n") for key, value in list(self.items()): key.write_to_stream(stream, handler, container_ref) stream.write(b" ") value.write_to_stream(stream, handler, container_ref) stream.write(b"\n") stream.write(b">>") @staticmethod def read_from_stream( stream, container_ref: 'Dereferenceable', as_metadata_stream: bool = False, ): tmp = stream.read(2) if tmp != b"<<": raise PdfReadError( "Dictionary read error at byte 0x%s: " "stream must begin with '<<'" % hex(stream.tell()) ) data = {} handler = container_ref.get_pdf_handler() while True: tok = read_non_whitespace(stream) if tok == b">": stream.read(1) break stream.seek(-1, os.SEEK_CUR) try: key = NameObject.read_from_stream(stream) except Exception as ex: raise PdfReadError( "Failed to read dictionary key at byte 0x%s; expected PDF name" % hex(stream.tell()) ) from ex read_non_whitespace(stream) stream.seek(-1, os.SEEK_CUR) value = read_object(stream, container_ref) if key not in data: data[key] = value else: err = ( "Multiple definitions in dictionary at byte " "%s for key %s" % (hex(stream.tell()), key) ) if handler.strict: raise PdfStrictReadError(err) else: logger.warning(err) pos = stream.tell() s = read_non_whitespace(stream, allow_eof=True) stream_data = None if s == b's' and stream.read(5) == b'tream': # odd PDF file output has spaces after 'stream' keyword # but before EOL. Original PyPDF2 patch provided by Danial Sandler, # modified by Matthias Valvekens skip_over_whitespace(stream, stop_after_eol=True) # this is a stream object, not a dictionary length = data[pdf_name("/Length")] if isinstance(length, IndirectObject): t = stream.tell() length = handler.get_object(length) stream.seek(t) stream_data = stream.read(length) e = read_non_whitespace(stream) ndstream = stream.read(8) if (e + ndstream) != b"endstream": # (sigh) - the odd PDF file has a length that is too long, so # we need to read backwards to find the "endstream" ending. # ReportLab (unknown version) generates files with this bug, # and Python users into PDF files tend to be our audience. # we need to do this to correct the streamdata and chop off # an extra character. orig_endstream_pos = stream.tell() stream.seek(-10, os.SEEK_CUR) end = stream.read(9) if end == b"endstream": # we found it by looking back one character further. stream_data = stream_data[:-1] else: raise PdfReadError( "Unable to find 'endstream' marker after " "stream at byte %s." % hex(orig_endstream_pos) ) else: stream.seek(pos) if stream_data is not None: # pass in everything as encoded data, the StreamObject class # will take care of decoding as necessary stm_cls = StreamObject if as_metadata_stream: try: # noinspection PyUnresolvedReferences from pyhanko.pdf_utils.metadata.xmp_xml import ( MetadataStream, ) stm_cls = MetadataStream except ImportError: # pragma: nocover pass return stm_cls(data, encoded_data=stream_data) else: return DictionaryObject(data) class StreamObject(DictionaryObject): """ PDF stream object. Essentially, a PDF stream is a dictionary object with a binary blob of data attached. This data can be encoded by various filters (not all of which are currently supported, see :mod:`.filters`). A stream object can be initialised with encoded or decoded data. The former is used by :class:`.reader.PdfFileReader` to provide on-demand decoding, with :class:`.writer.BasePdfFileWriter` and its subclasses working the other way around. .. note:: The :class:`.StreamObject` class manages some of its dictionary keys by itself. This is partly the case for the various ``/Filter`` and ``/DecodeParms`` entries, but also for the ``/Length`` entry. The latter will be overwritten as necessary. :param dict_data: The dictionary data for this stream object. :param stream_data: The (unencoded) stream data. :param encoded_data: The encoded stream data. .. warning:: Ordinarily, a stream can be initialised either from decoded and from encoded data. If both `stream_data` and `encoded_data` are provided, the caller is responsible for making sure that both are compatible given the currently relevant filter configuration. :param handler: A reference to the currently active :class:`.pyhanko.pdf_utils.crypt.SecurityHandler`. This is only necessary if the stream requires crypt filters. """ def __init__( self, dict_data: Optional[dict] = None, stream_data: Optional[bytes] = None, encoded_data: Optional[bytes] = None, handler: Optional['SecurityHandler'] = None, ): super().__init__(dict_data) self._data = stream_data self._encoded_data = encoded_data self._handler = handler def _implicit_decrypt_stream_content( self, handler, ref: Reference, decrypted_entries: dict ): """ Internal method to handle decrypting streams that are encrypted with the document's default encryption handler for streams and/or embedded files (i.e. not with any custom crypt filters). This routine is called deep in the object fetching stack, and you should never invoke it yourself. It's defined as a method in :class:`.StreamObject` because it needs to be able to preserve the type (subclass) of the stream object on which it is called, in order to properly feed into the logic surrounding metadata streams. """ if handler is not None: self._handler = handler # can't deal with crypt filters here if self._has_crypt_filter: # in this case, dealing with encryption is delegated # to the stream decoding process, so just pretend the data # is decrypted. # We pass a reference to the security handler below, # which is sufficient to take care of /Crypt filters # in the stream. decrypted_data = self.encoded_data else: if self.is_embedded_file_stream: cf = handler.get_embedded_file_filter() else: cf = handler.get_stream_filter() local_key = cf.derive_object_key(ref.idnum, ref.generation) decrypted_data = cf.decrypt(local_key, self.encoded_data) return self.__class__( decrypted_entries, encoded_data=decrypted_data, handler=handler ) @property def _has_crypt_filter(self) -> bool: return '/Crypt' in (name for name, _ in self._filters()) def add_crypt_filter( self, name=NameObject('/Identity'), params=None, handler: Optional['SecurityHandler'] = None, ): if handler is not None: self._handler = handler if self._handler is None: raise PdfStreamError("There is no security handler around") if name not in self._handler.crypt_filter_config: raise PdfStreamError( f"The crypt filter {name} is not known to the security handler." ) params = params or DictionaryObject() params['/Type'] = pdf_name('/CryptFilterDecodeParms') params['/Name'] = name self.apply_filter( pdf_name('/Crypt'), params=params, allow_duplicates=True ) def _filters(self) -> Iterator[Tuple[str, Optional[dict]]]: try: filter_arr = self[pdf_name('/Filter')] except KeyError: return if isinstance(filter_arr, NameObject): # we have a single filter instance filter_arr = (filter_arr,) elif not isinstance(filter_arr, ArrayObject): raise PdfStreamError( '/Filter should be a name object or an array of names.' ) try: decode_params = self[pdf_name('/DecodeParms')] if isinstance(decode_params, DictionaryObject): # one instance decode_params = [decode_params] if isinstance(decode_params, list): lendiff = len(filter_arr) - len(decode_params) # this should be zero, but let's be lenient if lendiff > 0: decode_params += [NullObject()] * lendiff except KeyError: decode_params = [NullObject()] * len(filter_arr) # make sure to deal with resolving decrypted object proxies by # calling get_object() yield from zip( filter_arr, (param_set.get_object() for param_set in decode_params) ) def _stream_decoders(self): from . import filters for filter_type, params in self._filters(): try: if params is None or isinstance(params, NullObject): params = {} if filter_type == '/Crypt': # crypt filters get special treatment # if we're dealing with the identity filter, just move on if params.get('/Name', '/Identity') == '/Identity': continue # if it's another one, we need a reference to the security # handler sh = self._handler if sh is None: raise PdfStreamError( "PDF streams require a security handler to use " "explicit /Crypt filters." ) decoder = filters.CryptFilterDecoder(sh) else: decoder = filters.get_generic_decoder(filter_type) yield decoder, params except KeyError: raise NotImplementedError( "Filters of type %s are not supported." % filter_type ) def strip_filters(self): """ Ensure the stream is decoded, and remove any filters. """ self._data = self._encoded_data = self.data self.pop(pdf_name('/Filter'), None) self.pop(pdf_name('/DecodeParms'), None) @property def data(self) -> bytes: """ Return the decoded stream data as bytes. If the stream hasn't been decoded yet, it will be decoded on-the-fly. :raises .misc.PdfStreamError: If the stream could not be decoded. """ if self._data is None: data = self._encoded_data if data is None: raise PdfStreamError("No data available.") for filter_cls, decode_params in self._stream_decoders(): data = filter_cls.decode(data, decode_params) if isinstance(data, memoryview): data = data.tobytes() self._data = data assert self._data is not None return self._data @property def encoded_data(self) -> bytes: """ Return the encoded stream data as bytes. If the stream hasn't been encoded yet, it will be encoded on-the-fly. :raises .misc.PdfStreamError: If the stream could not be encoded. """ if self._encoded_data is None: data = self._data if data is None: raise PdfStreamError("No data available.") decoders = tuple(self._stream_decoders()) for filter_cls, decode_params in reversed(decoders): data = filter_cls.encode(data, decode_params) self._encoded_data = data assert self._encoded_data is not None return self._encoded_data def apply_filter( self, filter_name, params=None, allow_duplicates: Optional[bool] = True ): """ Apply a new filter to this stream. This filter will be prepended to any existing filters. This means that is is placed *last* in the encoding order, but *first* in the decoding order. *Note:* Calling this method on an encoded stream will first cause the stream to be decoded using the filters already present. The cached value for the encoded stream data will be cleared. :param filter_name: Name of the filter (see :const:`~pyhanko.pdf_utils.filters.DECODERS`) :param params: Parameters to the filter (will be written to ``/DecodeParms`` if not ``None``) :param allow_duplicates: If ``None``, silently ignore duplicate filters. If ``False``, raise ValueError when attempting to add a duplicate filter. If ``True`` (default), duplicate filters are allowed. """ # If the stream already contains (encoded) data, we have to reencode it # later on, which requires a decoding operation. data = self._data if data is None and self._encoded_data is not None: data = self.data # ... and list all current filters with their parameters. cur_filters = list(self._filters()) # normalise the input parameters if not isinstance(filter_name, NameObject): filter_name = pdf_name(filter_name) if params is not None and not isinstance(params, DictionaryObject): params = DictionaryObject(params) if not cur_filters: # only one filter, so don't write arrays self[pdf_name('/Filter')] = filter_name if params: self[pdf_name('/DecodeParms')] = params else: # FIXME deal with shortened names for standard filters # split cur_filters back into two pieces filter_names, param_sets = zip(*cur_filters) if not allow_duplicates and filter_name in filter_names: if allow_duplicates is False: raise PdfWriteError( f'Filter {filter_name} has already been applied to ' f'this stream.' ) else: # Silently ignore return # prepend the new filter (order is important!) self[pdf_name('/Filter')] = ArrayObject( (filter_name,) + filter_names ) if params or any(param_sets): def _params(): yield params or NullObject() for param_set in param_sets: yield param_set or NullObject() self[pdf_name('/DecodeParms')] = ArrayObject(_params()) self._encoded_data = None self._data = data def compress(self): """ Convenience method to add a ``/FlateDecode`` filter with default settings, if one is not already present. *Note:* compression is not actually applied until the stream is written. """ self.apply_filter(pdf_name('/FlateDecode'), allow_duplicates=None) @property def is_embedded_file_stream(self): try: return self.raw_get('/Type') == '/EmbeddedFile' except KeyError: return False def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): data = self.encoded_data if ( handler is not None and container_ref is not None and not self._has_crypt_filter ): cf = handler.get_stream_filter() local_key = cf.derive_object_key( container_ref.idnum, container_ref.generation ) data = cf.encrypt(local_key, data) self[NameObject("/Length")] = NumberObject(len(data)) # write the dictionary super().write_to_stream(stream, handler, container_ref) del self["/Length"] stream.write(b"\nstream\n") stream.write(data) stream.write(b"\nendstream") def encode_pdfdocencoding(unicode_string): def _build(): for c in unicode_string: try: yield _pdfDocEncoding_rev[c] except KeyError: raise UnicodeEncodeError( "pdfdocencoding", c, -1, -1, "does not exist in translation table", ) return bytes(_build()) def decode_pdfdocencoding(byte_array): def _build(): for b in byte_array: c = _pdfDocEncoding[b] if c == '\u0000': raise UnicodeDecodeError( "pdfdocencoding", bytes((b,)), -1, -1, "does not exist in translation table", ) yield c return ''.join(_build()) _pdfDocEncoding = ( '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u02d8', '\u02c7', '\u02c6', '\u02d9', '\u02dd', '\u02db', '\u02da', '\u02dc', '\u0020', '\u0021', '\u0022', '\u0023', '\u0024', '\u0025', '\u0026', '\u0027', '\u0028', '\u0029', '\u002a', '\u002b', '\u002c', '\u002d', '\u002e', '\u002f', '\u0030', '\u0031', '\u0032', '\u0033', '\u0034', '\u0035', '\u0036', '\u0037', '\u0038', '\u0039', '\u003a', '\u003b', '\u003c', '\u003d', '\u003e', '\u003f', '\u0040', '\u0041', '\u0042', '\u0043', '\u0044', '\u0045', '\u0046', '\u0047', '\u0048', '\u0049', '\u004a', '\u004b', '\u004c', '\u004d', '\u004e', '\u004f', '\u0050', '\u0051', '\u0052', '\u0053', '\u0054', '\u0055', '\u0056', '\u0057', '\u0058', '\u0059', '\u005a', '\u005b', '\u005c', '\u005d', '\u005e', '\u005f', '\u0060', '\u0061', '\u0062', '\u0063', '\u0064', '\u0065', '\u0066', '\u0067', '\u0068', '\u0069', '\u006a', '\u006b', '\u006c', '\u006d', '\u006e', '\u006f', '\u0070', '\u0071', '\u0072', '\u0073', '\u0074', '\u0075', '\u0076', '\u0077', '\u0078', '\u0079', '\u007a', '\u007b', '\u007c', '\u007d', '\u007e', '\u0000', '\u2022', '\u2020', '\u2021', '\u2026', '\u2014', '\u2013', '\u0192', '\u2044', '\u2039', '\u203a', '\u2212', '\u2030', '\u201e', '\u201c', '\u201d', '\u2018', '\u2019', '\u201a', '\u2122', '\ufb01', '\ufb02', '\u0141', '\u0152', '\u0160', '\u0178', '\u017d', '\u0131', '\u0142', '\u0153', '\u0161', '\u017e', '\u0000', '\u20ac', '\u00a1', '\u00a2', '\u00a3', '\u00a4', '\u00a5', '\u00a6', '\u00a7', '\u00a8', '\u00a9', '\u00aa', '\u00ab', '\u00ac', '\u0000', '\u00ae', '\u00af', '\u00b0', '\u00b1', '\u00b2', '\u00b3', '\u00b4', '\u00b5', '\u00b6', '\u00b7', '\u00b8', '\u00b9', '\u00ba', '\u00bb', '\u00bc', '\u00bd', '\u00be', '\u00bf', '\u00c0', '\u00c1', '\u00c2', '\u00c3', '\u00c4', '\u00c5', '\u00c6', '\u00c7', '\u00c8', '\u00c9', '\u00ca', '\u00cb', '\u00cc', '\u00cd', '\u00ce', '\u00cf', '\u00d0', '\u00d1', '\u00d2', '\u00d3', '\u00d4', '\u00d5', '\u00d6', '\u00d7', '\u00d8', '\u00d9', '\u00da', '\u00db', '\u00dc', '\u00dd', '\u00de', '\u00df', '\u00e0', '\u00e1', '\u00e2', '\u00e3', '\u00e4', '\u00e5', '\u00e6', '\u00e7', '\u00e8', '\u00e9', '\u00ea', '\u00eb', '\u00ec', '\u00ed', '\u00ee', '\u00ef', '\u00f0', '\u00f1', '\u00f2', '\u00f3', '\u00f4', '\u00f5', '\u00f6', '\u00f7', '\u00f8', '\u00f9', '\u00fa', '\u00fb', '\u00fc', '\u00fd', '\u00fe', '\u00ff', ) assert len(_pdfDocEncoding) == 256 _pdfDocEncoding_rev = {char: ix for ix, char in enumerate(_pdfDocEncoding)} pdf_name = NameObject PROXYABLE = (TextStringObject, ByteStringObject, DictionaryObject, ArrayObject) def proxy_encrypted_obj(encrypted_obj, handler): if isinstance(encrypted_obj, PROXYABLE): return DecryptedObjectProxy(encrypted_obj, handler) else: return encrypted_obj class DecryptedObjectProxy(PdfObject): """ Internal proxy class that allows transparent on-demand encryption of objects. .. warning:: Most public-facing APIs won't leave you to deal with these *directly* (that's half the reason this class exists in the first place), and the API of this class is considered internal. However, for reasons related to the historical PyPDF2 codebase from which pyHanko's object handling code ultimately derives, there are some Python builtins that might cause these wrapper objects to inadvertently "leak". Please `tell us about such cases `_ so we can make those types of access more convenient and robust. .. danger:: The ``__eq__`` implementation on this class is not safe for general use, due to the fact that certain structures in PDF are exempt from encryption. Only compare proxy objects with ``==`` in areas of the document where these exemptions don't apply. :param raw_object: A raw object, typically as-parsed from a PDF file. :param handler: The security handler governing this object. """ raw_object: PdfObject """ The underlying raw object, in its encrypted state. """ def __init__(self, raw_object: PdfObject, handler): self.raw_object = raw_object self._decrypted: Optional[PdfObject] = None self.handler = handler @property def decrypted(self) -> PdfObject: """ The decrypted PDF object exposed as a property. If this object is a container object, its constituent parts will be wrapped in :class:`.DecryptedObjectProxy` as well, in order to defer further decryption until the values are requested through a getter method on the container. """ if self._decrypted is not None: return self._decrypted from .crypt import SecurityHandler decrypted: PdfObject obj = self.raw_object handler: SecurityHandler = self.handler container_ref = obj.container_ref if not isinstance(container_ref, Reference): raise ValueError( "Proxyable objects must have a container ref pointing to a " f"numbered object, not '{container_ref}'." ) # pragma: nocover if isinstance(obj, ByteStringObject) or isinstance( obj, TextStringObject ): cf = handler.get_string_filter() local_key = cf.derive_object_key( container_ref.idnum, container_ref.generation ) decrypted = pdf_string(cf.decrypt(local_key, obj.original_bytes)) elif isinstance(obj, DictionaryObject): decrypted_entries = { dictkey: proxy_encrypted_obj(value, handler) for dictkey, value in obj.items() } if isinstance(obj, StreamObject): decrypted = obj._implicit_decrypt_stream_content( handler, container_ref, decrypted_entries ) else: decrypted = DictionaryObject(decrypted_entries) elif isinstance(obj, ArrayObject): decrypted_map = map(lambda v: proxy_encrypted_obj(v, handler), obj) decrypted = ArrayObject(decrypted_map) else: # pragma: nocover raise TypeError(f'Object of type {type(obj)} is not proxyable.') decrypted.container_ref = obj.container_ref self._decrypted = decrypted return decrypted def write_to_stream( self, stream, handler: Optional['SecurityHandler'] = None, container_ref=None, ): # maybe the encryption key for this object changed (due to it being # included as part of a larger object or somesuch, without proper # dereferencing), so to avoid unexpected shenanigans, let's start from # scratch. self.decrypted.write_to_stream(stream, handler, container_ref) def get_object(self): return self.decrypted.get_object() @property def container_ref(self): return self.raw_object.container_ref def __eq__(self, other): # NOTE: this will fail if the dictionary contains "un-decryptable" # descendants! The diff_analysis module is aware of this restriction, # but you probably shouldn't use this __eq__ method to compare # arbitrary objects in a PDF file. return ( isinstance(other, DecryptedObjectProxy) and other.decrypted == self.decrypted ) ASN_DT_FORMAT = "D:%Y%m%d%H%M%S" def pdf_date(dt: datetime) -> TextStringObject: """ Convert a datetime object into a PDF string. This function supports both timezone-aware and naive datetime objects. :param dt: The datetime object to convert. :return: A :class:`TextStringObject` representing the datetime passed in. """ base_dt = dt.strftime(ASN_DT_FORMAT) utc_offset_string = '' utc_offset = dt.utcoffset() if utc_offset is not None: # compute UTC offset string tz_seconds = utc_offset.total_seconds() if not tz_seconds: utc_offset_string = 'Z' else: sign = '+' if tz_seconds < 0: sign = '-' tz_seconds = abs(tz_seconds) hrs, tz_seconds = divmod(tz_seconds, 3600) mins = tz_seconds // 60 # XXX the apostrophe after the minute part of the offset is NOT # what's in the spec, but Adobe Reader DC refuses to validate # signatures with a date string that doesn't contain it. # No idea why. utc_offset_string = sign + ("%02d'%02d'" % (hrs, mins)) return TextStringObject(base_dt + utc_offset_string) # The year field is the only mandatory one MIN_DATE_REGEX = re.compile(r'^D:(\d{4})') MIN_DATE_REGEX_LENIENT = re.compile(r'^(?:D:)?(\d{4})') TWO_DIGIT_START = re.compile(r'^(\d\d)') UTC_OFFSET = re.compile(r"(\d\d)(?:'(\d\d))?'?") def parse_pdf_date(date_str: str, strict: bool = True) -> datetime: m = (MIN_DATE_REGEX if strict else MIN_DATE_REGEX_LENIENT).match(date_str) if not m: raise PdfReadError(f"{date_str} does not appear to be a date string.") year = int(m.group(1)) # now, there are a number of 2-digit groups (anywhere from 0 to 5) date_remaining = date_str[m.end(0) :] lower_order = [1, 1, 0, 0, 0] for ix in range(5): m = TWO_DIGIT_START.match(date_remaining) if not m: break lower_order[ix] = int(m.group(1)) date_remaining = date_remaining[2:] # TODO range checks month, day, hour, minute, second = lower_order # finally, parse the timezone tz_info = None if date_remaining: sgn = date_remaining[0] if sgn == 'Z' and len(date_remaining) == 1: tz_offset = timedelta(0) elif sgn in ('+', '-'): tz_spec = date_remaining[1:] tz_match = UTC_OFFSET.fullmatch(tz_spec) if not tz_match: raise PdfReadError( f"Improper timezone specification in {date_str}: {tz_spec}" ) tz_hours = int(tz_match.group(1)) tz_minutes = int(tz_match.group(2) or 0) tz_offset = timedelta(hours=tz_hours, minutes=tz_minutes) if sgn == '-': tz_offset = -tz_offset else: raise PdfReadError(f"Improper trailing characters in {date_str}.") tz_info = timezone(tz_offset) try: return datetime( year=year, month=month, day=day, hour=hour, minute=minute, second=second, microsecond=0, tzinfo=tz_info, ) except ValueError as e: raise PdfReadError("Improper date value", e)