diff --git a/.travis.yml b/.travis.yml index f01ec62..f6824d5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,7 @@ language: python python: - - "3.5" - - "3.6" - - "3.7" - - "3.8" + - "3.9" + - "3.11" env: CFLAGS="-O0" cache: @@ -11,13 +9,10 @@ cache: - $HOME/.cache/pip install: - - if [[ $TRAVIS_PYTHON_VERSION < 3 ]]; then pip install -r requirements_py2.txt; fi - - if [[ $TRAVIS_PYTHON_VERSION > 3 ]]; then pip install -r requirements_py3.txt; fi - - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install unittest2; fi -script: - python setup.py test + - pip install -e . +script: python setup.py test after_success: - coveralls # See: http://docs.travis-ci.com/user/migrating-from-legacy/ -sudo: false \ No newline at end of file +sudo: false diff --git a/CHANGES.txt b/CHANGES.txt index dcea3e6..149c1e0 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,18 @@ +v0.5.0, 2021-05-03 -- +0.5.1 (unreleased) + + +- Make pdfquery compatible with Python 3.9 and 3.11 + + +0.5.0 (2021-05-04) + - #67 Fix range() page numbers for Python3 & prevent long cache file names + - Remove references to old version of PDFMiner + - Fixed an isort issue + - Update (un)supported Python versions + - Improve performance on large pdfs + - Remove reference to deprecated easy_install + - Fix two broken testcases v0.4.3, 2016-03-27 -- Add laparams parameter to __init__. v0.4.2, 2016-02-07 -- Annotations bugfix. v0.4.1, 2015-12-21 -- Annotations bugfix. diff --git a/README.rst b/README.rst index 29f2668..d73277b 100644 --- a/README.rst +++ b/README.rst @@ -18,10 +18,16 @@ PDFs with as little code as possible. .. contents:: **Table of Contents** -Installation -============ +Installation as a package +========================= + +``pip install pdfquery`` + + +Installation for development +============================ -``easy_install pdfquery`` or ``pip install pdfquery``. +``pip install -e ".[test,flake8,docs,release]"`` Quick Start =========== diff --git a/appveyor.yml b/appveyor.yml index 63daa9e..d88f778 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,13 +1,9 @@ environment: matrix: - # https://www.appveyor.com/docs/windows-images-software/#python - # currently lxml does not successfully install in 3.5 and 3.8 -# - PYTHON: "C:\\Python35" - - PYTHON: "C:\\Python36" - - PYTHON: "C:\\Python37" -# - PYTHON: "C:\\Python38" + - PYTHON: "C:\\Python39" + - PYTHON: "C:\\Python311" build: off test_script: - - "%PYTHON%\\python.exe setup.py test" \ No newline at end of file + - "%PYTHON%\\python.exe setup.py test" diff --git a/dev_requirements.txt b/dev_requirements.txt deleted file mode 100644 index e079f8a..0000000 --- a/dev_requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pytest diff --git a/dist/pdfquery-0.1.0.tar.gz b/dist/pdfquery-0.1.0.tar.gz deleted file mode 100644 index 16e5075..0000000 Binary files a/dist/pdfquery-0.1.0.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.1.1.tar.gz b/dist/pdfquery-0.1.1.tar.gz deleted file mode 100644 index 64c833d..0000000 Binary files a/dist/pdfquery-0.1.1.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.1.2.tar.gz b/dist/pdfquery-0.1.2.tar.gz deleted file mode 100644 index bac4714..0000000 Binary files a/dist/pdfquery-0.1.2.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.1.3.tar.gz b/dist/pdfquery-0.1.3.tar.gz deleted file mode 100644 index 69419c9..0000000 Binary files a/dist/pdfquery-0.1.3.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.2.1.tar.gz b/dist/pdfquery-0.2.1.tar.gz deleted file mode 100644 index c700f4e..0000000 Binary files a/dist/pdfquery-0.2.1.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.2.2.tar.gz b/dist/pdfquery-0.2.2.tar.gz deleted file mode 100644 index 07aed7b..0000000 Binary files a/dist/pdfquery-0.2.2.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.2.3.tar.gz b/dist/pdfquery-0.2.3.tar.gz deleted file mode 100644 index e2de59d..0000000 Binary files a/dist/pdfquery-0.2.3.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.2.4.tar.gz b/dist/pdfquery-0.2.4.tar.gz deleted file mode 100644 index 60e7448..0000000 Binary files a/dist/pdfquery-0.2.4.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.2.5.tar.gz b/dist/pdfquery-0.2.5.tar.gz deleted file mode 100644 index 791ed22..0000000 Binary files a/dist/pdfquery-0.2.5.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.2.6.tar.gz b/dist/pdfquery-0.2.6.tar.gz deleted file mode 100644 index 3ef455b..0000000 Binary files a/dist/pdfquery-0.2.6.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.2.7.tar.gz b/dist/pdfquery-0.2.7.tar.gz deleted file mode 100644 index 8c93ba8..0000000 Binary files a/dist/pdfquery-0.2.7.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.2.tar.gz b/dist/pdfquery-0.2.tar.gz deleted file mode 100644 index e74541d..0000000 Binary files a/dist/pdfquery-0.2.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.3.0.tar.gz b/dist/pdfquery-0.3.0.tar.gz deleted file mode 100644 index d754eb9..0000000 Binary files a/dist/pdfquery-0.3.0.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.3.1.tar.gz b/dist/pdfquery-0.3.1.tar.gz deleted file mode 100644 index 34e4cab..0000000 Binary files a/dist/pdfquery-0.3.1.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.4.0.tar.gz b/dist/pdfquery-0.4.0.tar.gz deleted file mode 100644 index 461d6bd..0000000 Binary files a/dist/pdfquery-0.4.0.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.4.1.tar.gz b/dist/pdfquery-0.4.1.tar.gz deleted file mode 100644 index 68a5904..0000000 Binary files a/dist/pdfquery-0.4.1.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.4.2.tar.gz b/dist/pdfquery-0.4.2.tar.gz deleted file mode 100644 index d739388..0000000 Binary files a/dist/pdfquery-0.4.2.tar.gz and /dev/null differ diff --git a/dist/pdfquery-0.4.3.tar.gz b/dist/pdfquery-0.4.3.tar.gz deleted file mode 100644 index 99ee702..0000000 Binary files a/dist/pdfquery-0.4.3.tar.gz and /dev/null differ diff --git a/pdfquery/__init__.py b/pdfquery/__init__.py index 8f1dd85..de574bd 100644 --- a/pdfquery/__init__.py +++ b/pdfquery/__init__.py @@ -1 +1 @@ -from .pdfquery import PDFQuery \ No newline at end of file +from .pdfquery import PDFQuery diff --git a/pdfquery/cache.py b/pdfquery/cache.py index 1386ccc..517abcb 100644 --- a/pdfquery/cache.py +++ b/pdfquery/cache.py @@ -1,9 +1,10 @@ import hashlib import zipfile + from lxml import etree -class BaseCache(object): +class BaseCache(object): def __init__(self): self.hash_key = None @@ -32,30 +33,34 @@ class DummyCache(BaseCache): class FileCache(BaseCache): - - def __init__(self, directory='/tmp/'): + def __init__(self, directory="/tmp/"): self.directory = directory super(FileCache, self).__init__() def get_cache_filename(self, page_range_key): return "pdfquery_{hash_key}{page_range_key}.xml".format( - hash_key=self.hash_key, - page_range_key=page_range_key + hash_key=self.hash_key, page_range_key=page_range_key ) def get_cache_file(self, page_range_key, mode): try: - return zipfile.ZipFile(self.directory+self.get_cache_filename(page_range_key)+".zip", mode) + return zipfile.ZipFile( + self.directory + self.get_cache_filename(page_range_key) + ".zip", mode + ) except IOError: return None def set(self, page_range_key, tree): - xml = etree.tostring(tree, encoding='utf-8', pretty_print=False, xml_declaration=True) - cache_file = self.get_cache_file(page_range_key, 'w') + xml = etree.tostring( + tree, encoding="utf-8", pretty_print=False, xml_declaration=True + ) + cache_file = self.get_cache_file(page_range_key, "w") cache_file.writestr(self.get_cache_filename(page_range_key), xml) cache_file.close() def get(self, page_range_key): - cache_file = self.get_cache_file(page_range_key, 'r') + cache_file = self.get_cache_file(page_range_key, "r") if cache_file: - return etree.fromstring(cache_file.read(self.get_cache_filename(page_range_key))) \ No newline at end of file + return etree.fromstring( + cache_file.read(self.get_cache_filename(page_range_key)) + ) diff --git a/pdfquery/pdfquery.py b/pdfquery/pdfquery.py index 1c622e3..cedf597 100644 --- a/pdfquery/pdfquery.py +++ b/pdfquery/pdfquery.py @@ -1,49 +1,33 @@ -from __future__ import print_function -# -*- coding: utf-8 -*- - -# builtins import codecs +import hashlib import json import numbers import re -import chardet -try: - from collections import OrderedDict -except ImportError: - OrderedDict = dict # sorry py2.6! Ordering isn't that important for our purposes anyway. +from collections import OrderedDict -# pdfminer -from pdfminer.psparser import PSLiteral -from pdfminer.pdfparser import PDFParser -try: - # pdfminer < 20131022 - from pdfminer.pdfparser import PDFDocument, PDFPage -except ImportError: - # pdfminer >= 20131022 - from pdfminer.pdfdocument import PDFDocument - from pdfminer.pdfpage import PDFPage -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter -from pdfminer.layout import LAParams, LTChar, LTImage, LTPage +import chardet +import cssselect +import six +from lxml import etree from pdfminer.converter import PDFPageAggregator +from pdfminer.layout import LAParams, LTChar, LTImage, LTPage +from pdfminer.pdfdocument import PDFDocument +from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager +from pdfminer.pdfpage import PDFPage +from pdfminer.pdfparser import PDFParser from pdfminer.pdftypes import resolve1 - -# other dependencies +from pdfminer.psparser import PSLiteral from pyquery import PyQuery -from lxml import etree -import cssselect -import six -from six.moves import map from six.moves import zip -# local imports -from .pdftranslator import PDFQueryTranslator from .cache import DummyCache +from .pdftranslator import PDFQueryTranslator # Re-sort the PDFMiner Layout tree so elements that fit inside other elements # will be children of them def _append_sorted(root, el, comparator): - """ Add el as a child of root, or as a child of one of root's children. + """Add el as a child of root, or as a child of one of root's children. Comparator is a function(a, b) returning > 0 if a is a child of b, < 0 if b is a child of a, 0 if neither. """ @@ -61,21 +45,24 @@ def _append_sorted(root, el, comparator): def _box_in_box(el, child): - """ Return True if child is contained within el. """ - return all([ - float(el.get('x0')) <= float(child.get('x0')), - float(el.get('x1')) >= float(child.get('x1')), - float(el.get('y0')) <= float(child.get('y0')), - float(el.get('y1')) >= float(child.get('y1')), - ]) + """Return True if child is contained within el.""" + return ( + float(el.get("x0")) <= float(child.get("x0")) + and float(el.get("x1")) >= float(child.get("x1")) + and float(el.get("y0")) <= float(child.get("y0")) + and float(el.get("y1")) >= float(child.get("y1")) + ) + + +_comp_bbox_keys_required = set(["x0", "x1", "y0", "y1"]) -_comp_bbox_keys_required = set(['x0', 'x1', 'y0', 'y1']) def _comp_bbox(el, el2): - """ Return 1 if el in el2, -1 if el2 in el, else 0""" + """Return 1 if el in el2, -1 if el2 in el, else 0""" # only compare if both elements have x/y coordinates - if _comp_bbox_keys_required <= set(el.keys()) and \ - _comp_bbox_keys_required <= set(el2.keys()): + if _comp_bbox_keys_required <= set(el.keys()) and _comp_bbox_keys_required <= set( + el2.keys() + ): if _box_in_box(el2, el): return 1 if _box_in_box(el, el2): @@ -84,47 +71,50 @@ def _comp_bbox(el, el2): # assorted helpers -def _flatten(l, ltypes=(list, tuple)): +def _flatten(l): # via http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html ltype = type(l) l = list(l) i = 0 while i < len(l): - while isinstance(l[i], ltypes): + while isinstance(l[i], (list, tuple, range)): if not l[i]: l.pop(i) i -= 1 break else: - l[i:i + 1] = l[i] + l[i : i + 1] = l[i] i += 1 return ltype(l) + # these might have to be removed from the start of a decoded string after # conversion -bom_headers = set([ - six.text_type(codecs.BOM_UTF8, 'utf8'), - six.text_type(codecs.BOM_UTF16_LE, 'utf-16LE'), - six.text_type(codecs.BOM_UTF16_BE, 'utf-16BE'), - six.text_type(codecs.BOM_UTF32_LE, 'utf-32LE'), - six.text_type(codecs.BOM_UTF32_BE, 'utf-32BE'), -]) +bom_headers = set( + [ + six.text_type(codecs.BOM_UTF8, "utf8"), + six.text_type(codecs.BOM_UTF16_LE, "utf-16LE"), + six.text_type(codecs.BOM_UTF16_BE, "utf-16BE"), + six.text_type(codecs.BOM_UTF32_LE, "utf-32LE"), + six.text_type(codecs.BOM_UTF32_BE, "utf-32BE"), + ] +) def smart_unicode_decode(encoded_string): """ - Given an encoded string of unknown format, detect the format with - chardet and return the unicode version. - Example input from bug #11: - ('\xfe\xff\x00I\x00n\x00s\x00p\x00e\x00c\x00t\x00i\x00o\x00n\x00' - '\x00R\x00e\x00p\x00o\x00r\x00t\x00 \x00v\x002\x00.\x002') + Given an encoded string of unknown format, detect the format with + chardet and return the unicode version. + Example input from bug #11: + ('\xfe\xff\x00I\x00n\x00s\x00p\x00e\x00c\x00t\x00i\x00o\x00n\x00' + '\x00R\x00e\x00p\x00o\x00r\x00t\x00 \x00v\x002\x00.\x002') """ if not encoded_string: - return u'' + return "" # optimization -- first try ascii try: - return encoded_string.decode('ascii') + return encoded_string.decode("ascii") except UnicodeDecodeError: pass @@ -132,11 +122,13 @@ def smart_unicode_decode(encoded_string): detected_encoding = chardet.detect(encoded_string) # bug 54 -- depending on chardet version, if encoding is not guessed, # either detected_encoding will be None or detected_encoding['encoding'] will be None - detected_encoding = detected_encoding['encoding'] if detected_encoding and detected_encoding.get('encoding') else 'utf8' + detected_encoding = ( + detected_encoding["encoding"] + if detected_encoding and detected_encoding.get("encoding") + else "utf8" + ) decoded_string = six.text_type( - encoded_string, - encoding=detected_encoding, - errors='replace' + encoded_string, encoding=detected_encoding, errors="replace" ) # unicode string may still have useless BOM character at the beginning @@ -145,6 +137,7 @@ def smart_unicode_decode(encoded_string): return decoded_string + def prepare_for_json_encoding(obj): """ Convert an arbitrary object into just JSON data types (list, dict, unicode str, int, bool, null). @@ -155,18 +148,24 @@ def prepare_for_json_encoding(obj): if obj_type == dict: # alphabetizing keys lets us compare attributes for equality across runs return OrderedDict( - (prepare_for_json_encoding(k), - prepare_for_json_encoding(obj[k])) for k in sorted(obj.keys()) + (prepare_for_json_encoding(k), prepare_for_json_encoding(obj[k])) + for k in sorted(obj.keys()) ) if obj_type == six.binary_type: return smart_unicode_decode(obj) - if obj_type == bool or obj is None or obj_type == six.text_type or isinstance(obj, numbers.Number): + if ( + obj_type == bool + or obj is None + or obj_type == six.text_type + or isinstance(obj, numbers.Number) + ): return obj if obj_type == PSLiteral: # special case because pdfminer.six currently adds extra quotes to PSLiteral.__repr__ - return u"/%s" % obj.name + return "/%s" % obj.name return six.text_type(obj) + def obj_to_string(obj, top=True): """ Turn an arbitrary object into a unicode string. If complex (dict/list/tuple), will be json-encoded. @@ -178,9 +177,13 @@ def obj_to_string(obj, top=True): # via http://stackoverflow.com/a/25920392/307769 -invalid_xml_chars_re = re.compile(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+') +invalid_xml_chars_re = re.compile( + "[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\u10000-\u10FFFF]+" +) + + def strip_invalid_xml_chars(s): - return invalid_xml_chars_re.sub(r'', s) + return invalid_xml_chars_re.sub(r"", s) # custom PDFDocument class @@ -215,12 +218,13 @@ def get_page_number(self, index): """ # get and cache page ranges - if not hasattr(self, 'page_range_pairs'): + if not hasattr(self, "page_range_pairs"): try: - page_ranges = resolve1(self.catalog['PageLabels'])['Nums'] + page_ranges = resolve1(self.catalog["PageLabels"])["Nums"] assert len(page_ranges) > 1 and len(page_ranges) % 2 == 0 self.page_range_pairs = list( - reversed(list(zip(page_ranges[::2], page_ranges[1::2])))) + reversed(list(zip(page_ranges[::2], page_ranges[1::2]))) + ) except: self.page_range_pairs = [] @@ -236,32 +240,32 @@ def get_page_number(self, index): page_label = "" # handle numeric part of label - if 'S' in label_format: - + if "S" in label_format: # first find number for this page ... page_label = index - starting_index - if 'St' in label_format: # alternate start value - page_label += label_format['St'] + if "St" in label_format: # alternate start value + page_label += label_format["St"] else: page_label += 1 # ... then convert to correct format - num_type = label_format['S'].name + num_type = label_format["S"].name # roman (upper or lower) - if num_type.lower() == 'r': + if num_type.lower() == "r": import roman + page_label = roman.toRoman(page_label) - if num_type == 'r': + if num_type == "r": page_label = page_label.lower() # letters - elif num_type.lower() == 'a': + elif num_type.lower() == "a": # a to z for the first 26 pages, aa to zz for the next 26, and # so on letter = chr(page_label % 26 + 65) letter *= page_label / 26 + 1 - if num_type == 'a': + if num_type == "a": letter = letter.lower() page_label = letter @@ -270,24 +274,27 @@ def get_page_number(self, index): page_label = obj_to_string(page_label) # handle string prefix - if 'P' in label_format: - page_label = smart_unicode_decode(label_format['P']) + page_label + if "P" in label_format: + page_label = smart_unicode_decode(label_format["P"]) + page_label return page_label # create etree parser using custom Element class + class LayoutElement(etree.ElementBase): @property def layout(self): - if not hasattr(self, '_layout'): + if not hasattr(self, "_layout"): self._layout = None return self._layout @layout.setter def layout(self, value): self._layout = value + + parser_lookup = etree.ElementDefaultClassLookup(element=LayoutElement) parser = etree.XMLParser() parser.set_element_class_lookup(parser_lookup) @@ -296,17 +303,17 @@ def layout(self, value): # main class class PDFQuery(object): def __init__( - self, - file, - merge_tags=('LTChar', 'LTAnno'), - round_floats=True, - round_digits=3, - input_text_formatter=None, - normalize_spaces=True, - resort=True, - parse_tree_cacher=None, - laparams={'all_texts':True, 'detect_vertical':True}, - password='' + self, + file, + merge_tags=("LTChar", "LTAnno"), + round_floats=True, + round_digits=3, + input_text_formatter=None, + normalize_spaces=True, + resort=True, + parse_tree_cacher=None, + laparams={"all_texts": True, "detect_vertical": True}, + password="", ): # store input self.merge_tags = merge_tags @@ -318,32 +325,21 @@ def __init__( if input_text_formatter: self.input_text_formatter = input_text_formatter elif normalize_spaces: - r = re.compile(r'\s+') - self.input_text_formatter = lambda s: re.sub(r, ' ', s) + r = re.compile(r"\s+") + self.input_text_formatter = lambda s: re.sub(r, " ", s) else: self.input_text_formatter = None # open doc - if not hasattr(file, 'read'): + if not hasattr(file, "read"): try: - file = open(file, 'rb') + file = open(file, "rb") except TypeError: raise TypeError("File must be file object or filepath string.") parser = PDFParser(file) - if hasattr(QPDFDocument, 'set_parser'): - # pdfminer < 20131022 - doc = QPDFDocument() - parser.set_document(doc) - doc.set_parser(parser) - else: - # pdfminer >= 20131022 - doc = QPDFDocument(parser, password) - parser.set_document(doc) - if hasattr(doc, 'initialize'): - # as of pdfminer==20140328, "PDFDocument.initialize() method is - # removed and no longer needed." - doc.initialize() + doc = QPDFDocument(parser, password) + parser.set_document(doc) self.doc = doc self.parser = parser self.tree = None @@ -391,11 +387,11 @@ def load(self, *page_numbers): def extract(self, searches, tree=None, as_dict=True): """ - >>> foo = pdf.extract([['pages', 'LTPage']]) - >>> foo - {'pages': [, ]} - >>> pdf.extract([['bar', ':in_bbox("100,100,400,400")']], foo['pages'][0]) - {'bar': [, ,... + >>> foo = pdf.extract([['pages', 'LTPage']]) + >>> foo + {'pages': [, ]} + >>> pdf.extract([['bar', ':in_bbox("100,100,400,400")']], foo['pages'][0]) + {'bar': [, ,... """ if self.tree is None or self.pq is None: self.load() @@ -410,25 +406,31 @@ def extract(self, searches, tree=None, as_dict=True): if len(search) < 3: search = list(search) + [formatter] key, search, tmp_formatter = search - if key == 'with_formatter': + if key == "with_formatter": if isinstance(search, six.string_types): # is a pyquery method name, e.g. 'text' formatter = lambda o, search=search: getattr(o, search)() - elif hasattr(search, '__call__') or not search: + elif hasattr(search, "__call__") or not search: # is a method, or None to end formatting formatter = search else: - raise TypeError("Formatter should be either a pyquery " - "method name or a callable function.") - elif key == 'with_parent': + raise TypeError( + "Formatter should be either a pyquery " + "method name or a callable function." + ) + elif key == "with_parent": parent = pq(search) if search else pq else: try: - result = parent("*").filter(search) if \ - hasattr(search, '__call__') else parent(search) + result = ( + parent("*").filter(search) + if hasattr(search, "__call__") + else parent(search) + ) except cssselect.SelectorSyntaxError as e: raise cssselect.SelectorSyntaxError( - "Error applying selector '%s': %s" % (search, e)) + "Error applying selector '%s': %s" % (search, e) + ) if tmp_formatter: result = tmp_formatter(result) results += result if type(result) == tuple else [[key, result]] @@ -439,9 +441,9 @@ def extract(self, searches, tree=None, as_dict=True): # tree building stuff def get_pyquery(self, tree=None, page_numbers=None): """ - Wrap given tree in pyquery and return. - If no tree supplied, will generate one from given page_numbers, or - all page numbers. + Wrap given tree in pyquery and return. + If no tree supplied, will generate one from given page_numbers, or + all page numbers. """ if not page_numbers: page_numbers = [] @@ -450,16 +452,18 @@ def get_pyquery(self, tree=None, page_numbers=None): tree = self.tree else: tree = self.get_tree(page_numbers) - if hasattr(tree, 'getroot'): + if hasattr(tree, "getroot"): tree = tree.getroot() return PyQuery(tree, css_translator=PDFQueryTranslator()) def get_tree(self, *page_numbers): """ - Return lxml.etree.ElementTree for entire document, or page numbers - given if any. + Return lxml.etree.ElementTree for entire document, or page numbers + given if any. """ - cache_key = "_".join(map(str, _flatten(page_numbers))) + hasher = hashlib.md5() + hasher.update(str(page_numbers).encode("UTF-8")) + cache_key = "_{}".format(hasher.hexdigest()) tree = self._parse_tree_cacher.get(cache_key) if tree is None: # set up root @@ -476,24 +480,26 @@ def get_tree(self, *page_numbers): # If that happens we just replace non-word characters # with '_'. if "Invalid attribute name" in e.args[0]: - k = re.sub(r'\W', '_', k) + k = re.sub(r"\W", "_", k) root.set(k, v) # Parse pages and append to root. # If nothing was passed in for page_numbers, we do this for all # pages, but if None was explicitly passed in, we skip it. - if not(len(page_numbers) == 1 and page_numbers[0] is None): + if not (len(page_numbers) == 1 and page_numbers[0] is None): if page_numbers: - pages = [[n, self.get_layout(self.get_page(n))] for n in - _flatten(page_numbers)] + pages = [ + [n, self.get_layout(self.get_page(n))] + for n in _flatten(page_numbers) + ] else: pages = enumerate(self.get_layouts()) for n, page in pages: page = self._xmlize(page) if self.resort: self._sort(page) - page.set('page_index', obj_to_string(n)) - page.set('page_label', self.doc.get_page_number(n)) + page.set("page_index", obj_to_string(n)) + page.set("page_label", self.doc.get_page_number(n)) root.append(page) self._clean_text(root) @@ -505,8 +511,8 @@ def get_tree(self, *page_numbers): def _clean_text(self, branch): """ - Remove text from node if same text exists in its children. - Apply string formatter if set. + Remove text from node if same text exists in its children. + Apply string formatter if set. """ if branch.text and self.input_text_formatter: branch.text = self.input_text_formatter(branch.text) @@ -514,7 +520,7 @@ def _clean_text(self, branch): for child in branch: self._clean_text(child) if branch.text and branch.text.find(child.text) >= 0: - branch.text = branch.text.replace(child.text, '', 1) + branch.text = branch.text.replace(child.text, "", 1) except TypeError: # not an iterable node pass @@ -525,20 +531,39 @@ def _xmlize(self, node, root=None): else: # collect attributes of current node tags = self._getattrs( - node, 'y0', 'y1', 'x0', 'x1', 'width', 'height', 'bbox', - 'linewidth', 'pts', 'index', 'name', 'matrix', 'word_margin' + node, + "y0", + "y1", + "x0", + "x1", + "width", + "height", + "bbox", + "linewidth", + "pts", + "index", + "name", + "matrix", + "word_margin", ) if type(node) == LTImage: - tags.update(self._getattrs( - node, 'colorspace', 'bits', 'imagemask', 'srcsize', - 'stream', 'name', 'pts', 'linewidth') + tags.update( + self._getattrs( + node, + "colorspace", + "bits", + "imagemask", + "srcsize", + "stream", + "name", + "pts", + "linewidth", + ) ) elif type(node) == LTChar: - tags.update(self._getattrs( - node, 'fontname', 'adv', 'upright', 'size') - ) + tags.update(self._getattrs(node, "fontname", "adv", "upright", "size")) elif type(node) == LTPage: - tags.update(self._getattrs(node, 'pageid', 'rotate')) + tags.update(self._getattrs(node, "pageid", "rotate")) # create node branch = parser.makeelement(node.__class__.__name__, tags) @@ -549,11 +574,11 @@ def _xmlize(self, node, root=None): root = branch # add text - if hasattr(node, 'get_text'): + if hasattr(node, "get_text"): branch.text = strip_invalid_xml_chars(node.get_text()) # add children if node is an iterable - if hasattr(node, '__iter__'): + if hasattr(node, "__iter__"): last = None for child in node: child = self._xmlize(child, root) @@ -563,8 +588,8 @@ def _xmlize(self, node, root=None): elif last is not None and last.tag in self.merge_tags: last.text += child.text last.set( - '_obj_id', - last.get('_obj_id','') + "," + child.get('_obj_id','') + "_obj_id", + last.get("_obj_id", "") + "," + child.get("_obj_id", ""), ) continue # sort children by bounding boxes @@ -576,15 +601,18 @@ def _xmlize(self, node, root=None): return branch def _sort(self, tree): - """ Sort same-level elements top to bottom and left to right. """ + """Sort same-level elements top to bottom and left to right.""" children = list(tree) if children: - tree[:] = sorted(children, key=lambda child: (-float(child.get('y1')), float(child.get('x0')))) + tree[:] = sorted( + children, + key=lambda child: (-float(child.get("y1")), float(child.get("x0"))), + ) for child in children: self._sort(child) def _getattrs(self, obj, *attrs): - """ Return dictionary of given attrs on given object, if they exist, + """Return dictionary of given attrs on given object, if they exist, processing through _filter_value(). """ filtered_attrs = {} @@ -599,17 +627,17 @@ def _filter_value(self, val): if self.round_floats: if type(val) == float: val = round(val, self.round_digits) - elif hasattr(val, '__iter__') and not isinstance(val, six.string_types): + elif hasattr(val, "__iter__") and not isinstance(val, six.string_types): val = [self._filter_value(item) for item in val] return val # page access stuff def get_page(self, page_number): - """ Get PDFPage object -- 0-indexed.""" + """Get PDFPage object -- 0-indexed.""" return self._cached_pages(target_page=page_number) def get_layout(self, page): - """ Get PDFMiner Layout object for given page object or page number. """ + """Get PDFMiner Layout object for given page object or page number.""" if type(page) == int: page = self.get_page(page) self.interpreter.process_page(page) @@ -618,7 +646,7 @@ def get_layout(self, page): return layout def get_layouts(self): - """ Get list of PDFMiner Layout objects for each page. """ + """Get list of PDFMiner Layout objects for each page.""" return (self.get_layout(page) for page in self._cached_pages()) def _cached_pages(self, target_page=-1): @@ -628,13 +656,7 @@ def _cached_pages(self, target_page=-1): so we won't know how many there are until we parse the whole document, which we don't want to do until we need to. """ - try: - # pdfminer < 20131022 - self._pages_iter = self._pages_iter or self.doc.get_pages() - except AttributeError: - # pdfminer >= 20131022 - self._pages_iter = self._pages_iter or \ - PDFPage.create_pages(self.doc) + self._pages_iter = self._pages_iter or PDFPage.create_pages(self.doc) if target_page >= 0: while len(self._pages) <= target_page: @@ -651,22 +673,21 @@ def _cached_pages(self, target_page=-1): return self._pages def _add_annots(self, layout, annots): - """Adds annotations to the layout object - """ + """Adds annotations to the layout object""" if annots: for annot in resolve1(annots): annot = resolve1(annot) - if annot.get('Rect') is not None: - annot['bbox'] = annot.pop('Rect') # Rename key + if annot.get("Rect") is not None: + annot["bbox"] = annot.pop("Rect") # Rename key annot = self._set_hwxy_attrs(annot) try: - annot['URI'] = resolve1(annot['A'])['URI'] - except KeyError: + annot["URI"] = resolve1(annot["A"])["URI"] + except: # noqa pass for k, v in six.iteritems(annot): if not isinstance(v, six.string_types): annot[k] = obj_to_string(v) - elem = parser.makeelement('Annot', annot) + elem = parser.makeelement("Annot", annot) layout.add(elem) return layout @@ -675,17 +696,18 @@ def _set_hwxy_attrs(attr): """Using the bbox attribute, set the h, w, x0, x1, y0, and y1 attributes. """ - bbox = attr['bbox'] - attr['x0'] = bbox[0] - attr['x1'] = bbox[2] - attr['y0'] = bbox[1] - attr['y1'] = bbox[3] - attr['height'] = attr['y1'] - attr['y0'] - attr['width'] = attr['x1'] - attr['x0'] + bbox = attr["bbox"] + attr["x0"] = bbox[0] + attr["x1"] = bbox[2] + attr["y0"] = bbox[1] + attr["y1"] = bbox[3] + attr["height"] = attr["y1"] - attr["y0"] + attr["width"] = attr["x1"] - attr["x0"] return attr if __name__ == "__main__": import doctest + pdf = PDFQuery("../examples/sample.pdf") - doctest.testmod(extraglobs={'pdf': pdf}, optionflags=doctest.ELLIPSIS) + doctest.testmod(extraglobs={"pdf": pdf}, optionflags=doctest.ELLIPSIS) diff --git a/pdfquery/pdftranslator.py b/pdfquery/pdftranslator.py index 0f8429a..14a18aa 100644 --- a/pdfquery/pdftranslator.py +++ b/pdfquery/pdftranslator.py @@ -1,5 +1,3 @@ -#-*- coding:utf-8 -*- -# # Copyright (C) 2008 - Olivier Lauzanne # # Distributed under the BSD license, see LICENSE.txt @@ -7,12 +5,11 @@ class PDFQueryTranslator(cssselect_xpath.GenericTranslator): - def xpath_in_bbox_function(self, xpath, fn): if len(fn.arguments) > 1: - x0,y0,x1,y1 = [float(t.value) for t in fn.arguments] + x0, y0, x1, y1 = [float(t.value) for t in fn.arguments] else: - x0,y0,x1,y1 = map(float, fn.arguments[0].value.split(",")) + x0, y0, x1, y1 = map(float, fn.arguments[0].value.split(",")) # TODO: seems to be doing < rather than <= ??? xpath.add_condition("@x0 >= %s" % x0) xpath.add_condition("@y0 >= %s" % y0) @@ -22,12 +19,12 @@ def xpath_in_bbox_function(self, xpath, fn): def xpath_overlaps_bbox_function(self, xpath, fn): if len(fn.arguments) > 1: - x0,y0,x1,y1 = [float(t.value) for t in fn.arguments] + x0, y0, x1, y1 = [float(t.value) for t in fn.arguments] else: - x0,y0,x1,y1 = map(float, fn.arguments[0].value.split(",")) + x0, y0, x1, y1 = map(float, fn.arguments[0].value.split(",")) # TODO: seems to be doing < rather than <= ??? xpath.add_condition("@x0 <= %s" % x1) xpath.add_condition("@y0 <= %s" % y1) xpath.add_condition("@x1 >= %s" % x0) xpath.add_condition("@y1 >= %s" % y0) - return xpath \ No newline at end of file + return xpath diff --git a/requirements_py3.txt b/requirements.txt similarity index 100% rename from requirements_py3.txt rename to requirements.txt diff --git a/requirements_py2.txt b/requirements_py2.txt deleted file mode 100644 index 277c8d3..0000000 --- a/requirements_py2.txt +++ /dev/null @@ -1,7 +0,0 @@ -cssselect>=0.7.1 -chardet -lxml>=3.0 -pdfminer>=20110515 -six -pyquery>=1.2.2 -roman>=1.4.0 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..bf1acc3 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,74 @@ + +# setuptools config +# see http://setuptools.readthedocs.io/en/latest/setuptools.html#configuring-setup-using-setup-cfg-files + +[metadata] +name = pdfquery +version = 0.5.1.dev0 +description = Concise and friendly PDF scraper using JQuery or XPath selectors +long_description = file: README.rst +license = MIT +author = Jack Cushman +author_email = jcushman@gmail.com +url = https://github.com/jcushman/pdfquery +classifiers = + Development Status :: 5 - Production/Stable + Intended Audience :: Developers + Operating System :: Unix + Operating System :: MacOS + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.11 + Topic :: Software Development :: Libraries :: Python Modules + +[options] +zip_safe = False +include_package_data = True +packages = find: +install_requires = + cssselect + chardet + lxml + pdfminer.six + pyquery + roman +tests_require = + pytest + tox + pytest-remove-stale-bytecode + +[options.extras_require] +dev = + pre-commit + pdbpp + black + isort + flake8 +test = + pytest + pytest-remove-stale-bytecode + tox +coverage = pytest-cov +docs = + sphinx + doc8 +release = zest.releaser +pycodestyle = flake8 + +[aliases] +test=pytest + +[bdist_wheel] +universal = 1 + +[tool:pytest] +testpaths = tests + +[flake8] +max-line-length = 120 +exclude = env,.tox,doc + +[zest.releaser] +create-wheel = yes + +[distutils] +index-servers = isp diff --git a/setup.py b/setup.py index f255a41..6068493 100644 --- a/setup.py +++ b/setup.py @@ -1,48 +1,3 @@ -# -*- coding: utf-8 -*- -import sys -from setuptools import setup, find_packages +from setuptools import setup -# set up tests -if sys.version_info[:2] < (2, 7): - test_suite = 'unittest2.collector' -else: - test_suite = 'tests' - -# Work around a traceback on Python < 2.7.4 and < 3.3.1 -# http://bugs.python.org/issue15881#msg170215 -try: - import multiprocessing # noqa: unused -except ImportError: - pass - -setup( - name='pdfquery', - version='0.4.3', - author=u'Jack Cushman', - author_email='jcushman@gmail.com', - packages=find_packages(), - url='https://github.com/jcushman/pdfquery', - license='MIT', - description='Concise and friendly PDF scraper using JQuery or XPath selectors.', - keywords='', - long_description=open('README.rst').read(), - install_requires = open('requirements_py3.txt').read() if sys.version_info >= (3, 0) else open('requirements_py2.txt').read(), - classifiers=[ - "Development Status :: 4 - Beta", - "Topic :: Text Processing", - "Topic :: Utilities", - "License :: OSI Approved :: MIT License", - "Intended Audience :: Developers", - "Operating System :: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - ], - - test_suite=test_suite, -) +setup() diff --git a/tests/samples/bug67.pdf b/tests/samples/bug67.pdf new file mode 100644 index 0000000..42e0717 Binary files /dev/null and b/tests/samples/bug67.pdf differ diff --git a/tests/saved_output/IRS_1040A_output.xml b/tests/saved_output/IRS_1040A_output.xml index 8a80090..d7045a3 100644 --- a/tests/saved_output/IRS_1040A_output.xml +++ b/tests/saved_output/IRS_1040A_output.xml @@ -1,122 +1,91 @@ - 2 TO BE REMOVED BEFORE PRINTING - TLS, have you transmitted all R text files for this cycle update? + 2 TO BE REMOVED BEFORE PRINTING + TLS, have you transmitted all R text files for this cycle update? - I.R.S. SPECIFICATIONS INSTRUCTIONS TO PRINTERS FORM 1040A, PAGE 1 of 2 MARGINS: TOP 13 mm (1⁄2 "), CENTER SIDES. PRINTS: HEAD TO HEAD PAPER: WHITE WRITING, SUB. 20. INK: BLACK FLAT SIZE: 203 mm (8") x 279 mm (11") PERFORATE: (NONE) DO NOT PRINT — DO NOT PRINT — DO NOT PRINT — DO NOT PRINT - Action - Date - Signature + I.R.S. SPECIFICATIONS INSTRUCTIONS TO PRINTERS FORM 1040A, PAGE 1 of 2 MARGINS: TOP 13 mm (1⁄2 "), CENTER SIDES. PRINTS: HEAD TO HEAD PAPER: WHITE WRITING, SUB. 20. INK: BLACK FLAT SIZE: 203 mm (8") x 279 mm (11") PERFORATE: (NONE) DO NOT PRINT — DO NOT PRINT — DO NOT PRINT — DO NOT PRINT + Action + Date + Signature - - O.K. to print + + O.K. to print - Revised proofs requested - Date + Revised proofs requested + Date - Separation 1 of 2: Black - Separation 2 of 2: PMS 185 (RED) - - + Separation 1 of 2: Black + Separation 2 of 2: PMS 185 (RED) + + - - Write the words “Stimulus Payment” across the top of the form you file. + + Write the words “Stimulus Payment” across the top of the form you file. - Form - Department of the Treasury—Internal Revenue Service U.S. Individual Income Tax Return - 1040A Label (See page 15.) - 2007 - IRS Use Only—Do not write or staple in this space. - (99) + Form 1040A Label (See page 15.) + Department of the Treasury—Internal Revenue Service U.S. Individual Income Tax Return Your first name and initial Last name + 2007 + IRS Use Only—Do not write or staple in this space. + (99) - + - Your first name and initial - Last name - LABEL HERE - John E. - Michaels + LABEL HERE + John E. + Michaels - If a joint return, spouse’s first name and initial - Last name + If a joint return, spouse’s first name and initial + Last name + Susan R. + Michaels + + + Home address (number and street). If you have a P.O. box, see page 15. + Apt. no. + 1040 Main Street + + + City, town or post office, state, and ZIP code. If you have a foreign address, see page 15. + + - - OMB No. 1545-0074 Your social security number + + OMB No. 1545-0074 + Your social security number - 011 00 2222 + 011 00 2222 Spouse’s social security number - Spouse’s social security number - Otherwise, please print or type. Presidential Election Campaign (cid:2) Check here if you, or your spouse if filing jointly, want $3 to go to this fund (see page 15) (cid:2) Filing status Check only one box. Exemptions Use the Susan R. Michaels IRS label. Home address (number and street). If you have a P.O. box, see page 15. -Home address (number and street). If you have a P.O. box, see page 15. -Apt. no. -Apt. no. -Otherwise, -1040 Main Street -1040 Main Street -please print -or type. -City, town or post office, state, and ZIP code. If you have a foreign address, see page 15. -City, town or post office, state, and ZIP code. If you have a foreign address, see page 15. -Hometown, TX 77099 -Hometown, TX 77099 -Presidential -Y -Y -Election Campaign (cid:2) Check here if you, or your spouse if filing jointly, want $3 to go to this fund (see page 15) (cid:2) -L -1 -1 -4 5 -4 5 -Single -Married filing jointly (even if only one had income) -Married filing separately. Enter spouse’s SSN above and -full name here. (cid:2) -Single -Married filing jointly (even if only one had income) -Married filing separately. Enter spouse’s SSN above and -full name here. (cid:2) -Filing -N -23 -23 -status -Check only -one box. -6a -6a -Yourself. -Yourself. -If someone can claim you as a dependent, do not check -Exemptions - + Use the IRS label. Otherwise, please print or type. - 011 00 1111 You must enter your SSN(s) above. + 011 00 1111 You must enter your SSN(s) above. - (cid:4) - (cid:4) + (cid:4) + (cid:4) - Checking a box below will not change your tax or refund. + Checking a box below will not change your tax or refund. + + Hometown, TX 77099 + Presidential Election Campaign (cid:2) Check here if you, or your spouse if filing jointly, want $3 to go to this fund (see page 15) (cid:2) @@ -124,64 +93,93 @@ full name here. (cid:2) - Spouse - You + Spouse + You - L N E O T F IL + Y - Head of household (with qualifying person). (See page 16.) If the qualifying person is a child but not your dependent, enter this child’s name here. (cid:2) Qualifying widow(er) with dependent child (see page 17) - E + L + + + + + + + 1 + 4 + Yourself. Single Married filing jointly (even if only one had income) Married filing separately. Enter spouse’s SSN above and full name here. (cid:2) Yourself. + + Head of household (with qualifying person). (See page 16.) If the qualifying person is a child but not your dependent, enter this child’s name here. (cid:2) Qualifying widow(er) with dependent child (see page 17) + Filing status Check only one box. + + + N E O T F I L + + + + 2 3 + + + + - (cid:2) + + + + + 5 + E + (cid:2) + - If someone can claim you as a dependent, do not check box 6a. - Boxes checked on 6a and 6b No. of children on 6c who: ● lived with you - L + + + 6a + If someone can claim you as a dependent, do not check box 6a. + Boxes checked on 6a and 6b + Exemptions - M O N P M -O N - - b c - Spouse + L + + A M P b +c +b +c +O N D - - - + Spouse Dependents: + No. of children on 6c who: ● lived with you - Dependents: - (4) - if qualifying - (3) Dependent’s relationship to + (4) + if qualifying + (3) Dependent’s relationship to you - (2) Dependent’s social - child for child tax credit (see - O - A - security number - (1) First name - Last name - you - X + (2) Dependent’s social security number + child for child tax credit (see page 18) + O + (1) First name + Last name - page 18) - If more than six dependents, see page 18. - ● did not live with you due to divorce or separation (see page 19) + X + ● did not live with you due to divorce or separation (see page 19) + If more than six dependents, see page 18. + - E + E @@ -192,7 +190,6 @@ full name here. (cid:2) - D @@ -208,7 +205,7 @@ full name here. (cid:2) - Dependents on 6c not entered above + Dependents on 6c not entered above @@ -228,339 +225,313 @@ full name here. (cid:2) - - + + - If you were self-employed or a partner, include the amount you would enter on Schedule SE, line 3. + If you were self-employed or a partner, include the amount you would enter on Schedule SE, line 3. - Add numbers on lines above (cid:2) - d - Total number of exemptions claimed. + Add numbers on lines above (cid:2) + d + Total number of exemptions claimed. - Income Attach Form(s) W-2 here. Also attach Form(s) 1099-R if tax was withheld. - - (cid:2) - 7 - Wages, salaries, tips, etc. Attach Form(s) W-2. - 7 + Income + + (cid:2) + 7 + Wages, salaries, tips, etc. Attach Form(s) W-2. + 7 + Attach Form(s) W-2 here. Also attach Form(s) 1099-R if tax was withheld. - 8a b 9a b Taxable interest. Attach Schedule 1 if required. Tax-exempt interest. Do not include on line 8a. Ordinary dividends. Attach Schedule 1 if required. Qualified dividends (see page 22). 10 11a Capital gain distributions (see page 22). IRA distributions. 11a 12a Pensions and - 8a - 8b + 8a b 9a b Taxable interest. Attach Schedule 1 if required. Tax-exempt interest. Do not include on line 8a. Ordinary dividends. Attach Schedule 1 if required. Qualified dividends (see page 22). 10 11a Capital gain distributions (see page 22). IRA distributions. 11a 12a Pensions and annuities. + 8a - 9a - - 9b + 8b + + 9a - - 10 + 9b + + 10 - 11b - Taxable amount (see page 22). Taxable amount (see page 23). - 11b + 11b + Taxable amount (see page 22). Taxable amount (see page 23). + 11b - 12b - annuities. - 12a - 12b + 12b + 12a + 12b - 13 + 13 - If you did not get a W-2, see page 21. - Enclose, but do not attach, any payment. - 13 Unemployment compensation and Alaska Permanent Fund dividends. 14a 14b - - - Social security, tier 1 railroad retirement, and veterans disability and death benefits + If you did not get a W-2, see page 21. + Enclose, but do not attach, any payment. + 13 Unemployment compensation and Alaska Permanent Fund dividends. 14a 14b + + + Social security, tier 1 railroad retirement, and veterans disability and death benefits - Taxable amount (see page 25). - 14b - + Taxable amount (see page 25). + 14b + - Social security benefits. - (cid:2) - 14a - - + Social security benefits. + (cid:2) + 14a + + - 15 - Add lines 7 through 14b (far right column). This is your total income. - (cid:2) + (cid:2) - 15 + 15 + Add lines 7 through 14b (far right column). This is your total income. + 15 - 16 17 18 - 16 17 18 - Educator expenses (see page 25). IRA deduction (see page 27). Student loan interest deduction (see page 29). - - - - 19 - 19 20 - Tuition and fees deduction. Attach Form 8917. Add lines 16 through 19. These are your total adjustments. - + 16 17 18 + 16 17 18 + Educator expenses (see page 25). IRA deduction (see page 27). Student loan interest deduction (see page 29). + + + + 19 + 19 20 + Tuition and fees deduction. Attach Form 8917. Add lines 16 through 19. These are your total adjustments. + - 20 - - - 21 - Subtract line 20 from line 15. This is your adjusted gross income. (cid:2) - 21 + 20 + + + Subtract line 20 from line 15. This is your adjusted gross income. (cid:2) + 21 + 21 - Adjusted gross income + Adjusted gross income - For Disclosure, Privacy Act, and Paperwork Reduction Act Notice, see page 74. - Form 1040A (2007) - Cat. No. 11327A + For Disclosure, Privacy Act, and Paperwork Reduction Act Notice, see page 74. + Form 1040A (2007) + Cat. No. 11327A - 2 TO BE REMOVED BEFORE PRINTING + 2 TO BE REMOVED BEFORE PRINTING - I.R.S. SPECIFICATIONS INSTRUCTIONS TO PRINTERS FORM 1040A, PAGE 2 of 2 MARGINS: TOP 13 mm (1⁄2 ") (TO BLACK IMAGE), CENTER SIDES. PRINTS: HEAD to HEAD PAPER: WHITE WRITING, SUB. 20. INK: BLACK FLAT SIZE: 203 mm (8") x 279 mm (11") PERFORATE: (NONE) DO NOT PRINT — DO NOT PRINT — DO NOT PRINT — DO NOT PRINT + I.R.S. SPECIFICATIONS INSTRUCTIONS TO PRINTERS FORM 1040A, PAGE 2 of 2 MARGINS: TOP 13 mm (1⁄2 ") (TO BLACK IMAGE), CENTER SIDES. PRINTS: HEAD to HEAD PAPER: WHITE WRITING, SUB. 20. INK: BLACK FLAT SIZE: 203 mm (8") x 279 mm (11") PERFORATE: (NONE) DO NOT PRINT — DO NOT PRINT — DO NOT PRINT — DO NOT PRINT - - - Separation 1 of 2: Black - Separation 2 of 2: PMS 185 (RED) + + + Separation 1 of 2: Black + Separation 2 of 2: PMS 185 (RED) - Page 2 - for— ● People who checked any box on line 23a or 23b or who can be claimed as a dependent, see page 30. ● All others: Single or Married filing separately, $5,350 Married filing jointly or Qualifying widow(er), $10,700 Head of household, $7,850 Form 1040A (2007) 22 Tax, credits, 23a and payments b Standard Deduction 24 25 26 for— -● People who -checked any -box on line -23a or 23b or -who can be -27 -27 -claimed as a -dependent, -28 -29 -28 -29 -see page 30. -● All others: -Single or -Married filing -30 -30 -separately, -$5,350 -31 -32 -31 -32 -Married filing -jointly or -Qualifying -widow(er), -$10,700 -33 -33 -Head of -household, -$7,850 - + Page 2 + Form 1040A (2007) - Enter the amount from line 21 (adjusted gross income). - 22 + 22 + Enter the amount from line 21 (adjusted gross income). + 22 + Tax, credits, and payments Standard Deduction for— ● People who checked any box on line 23a or 23b or who can be claimed as a dependent, see page 30. ● All others: - (cid:2) - (cid:3) + (cid:2) + (cid:3) - You were born before January 2, 1943, Spouse was born before January 2, 1943, - if: checked (cid:2) If you are married filing separately and your spouse itemizes deductions, see page 30 and check here (cid:2) Enter your standard deduction (see left margin). Subtract line 24 from line 22. If line 24 is more than line 22, enter -0-. If line 22 is $117,300 or less, multiply $3,400 by the total number of exemptions claimed on line 6d. If line 22 is over $117,300, see the worksheet on page 32. Subtract line 26 from line 25. If line 26 is more than line 25, enter -0-. This is your taxable income. Tax, including any alternative minimum tax (see page 30). Credit for child and dependent care expenses. Attach Schedule 2. Credit for the elderly or the disabled. Attach Schedule 3. Education credits. Attach Form 8863. Child tax credit (see page 35). Attach Form 8901 if required. Retirement savings contributions credit. Attach Form 8880. Add lines 29 through 33. These are your total credits. Subtract line 34 from line 28. If line 34 is more than line 28, enter -0-. Advance earned income credit payments from Form(s) W-2, box 9. Add lines 35 and 36. This is your total tax. Federal income tax withheld from Forms W-2 and 1099. 2007 estimated tax payments and amount applied from 2006 return. Earned income credit (EIC). Total boxes Blind Blind Check if: -checked (cid:2) -23a -23a -If you are married filing separately and your spouse itemizes -deductions, see page 30 and check here -23b -23b -(cid:2) -Enter your standard deduction (see left margin). -Subtract line 24 from line 22. If line 24 is more than line 22, enter -0-. -If line 22 is $117,300 or less, multiply $3,400 by the total number of exemptions -claimed on line 6d. If line 22 is over $117,300, see the worksheet on page 32. -Y -Y -Subtract line 26 from line 25. If line 26 is more than line 25, enter -0-. -L -N + 23a + You were born before January 2, 1943, Spouse was born before January 2, 1943, + Blind Blind + checked (cid:2) If you are married filing separately and your spouse itemizes (cid:2) deductions, see page 30 and check here Enter your standard deduction (see left margin). Subtract line 24 from line 22. If line 24 is more than line 22, enter -0-. If line 22 is $117,300 or less, multiply $3,400 by the total number of exemptions claimed on line 6d. If line 22 is over $117,300, see the worksheet on page 32. Subtract line 26 from line 25. If line 26 is more than line 25, enter -0-. This is your taxable income. Tax, including any alternative minimum tax (see page 30). Credit for child and dependent care expenses. Attach Schedule 2. Credit for the elderly or the disabled. Attach Schedule 3. Education credits. Attach Form 8863. Child tax credit (see page 35). Attach Form 8901 if required. Retirement savings contributions credit. Attach Form 8880. Add lines 29 through 33. These are your total credits. Subtract line 34 from line 28. If line 34 is more than line 28, enter -0-. Advance earned income credit payments from Form(s) W-2, box 9. Add lines 35 and 36. This is your total tax. Federal income tax withheld from Forms W-2 and 1099. 2007 estimated tax payments and amount applied from 2006 return. Earned income credit (EIC). b Nontaxable combat pay election. 40b Total boxes Check +if: +Check +if: +checked (cid:2) +23a +23a +b +b +If you are married filing separately and your spouse itemizes +(cid:2) +deductions, see page 30 and check here +23b +23b +Enter your standard deduction (see left margin). +Subtract line 24 from line 22. If line 24 is more than line 22, enter -0-. +If line 22 is $117,300 or less, multiply $3,400 by the total number of exemptions +claimed on line 6d. If line 22 is over $117,300, see the worksheet on page 32. +Y +Y +Subtract line 26 from line 25. If line 26 is more than line 25, enter -0-. +This is your taxable income. +L +L +N E O -T F IL -L -N -E O -T F IL -This is your taxable income. -Tax, including any alternative minimum tax (see page 30). -Credit for child and dependent care expenses. -E -E -Attach Schedule 2. -29 -29 -Credit for the elderly or the disabled. Attach -L -L -Schedule 3. -30 +T F I L +N +E O +T F I L +Tax, including any alternative minimum tax (see page 30). +Credit for child and dependent care expenses. +Attach Schedule 2. +29 +29 +E +E +Credit for the elderly or the disabled. Attach +L +L +Schedule 3. +30 31 -30 -31 -P -M +30 +31 +A M P O N -P -M -O N -Education credits. Attach Form 8863. -Child tax credit (see page 35). Attach -O -O -Form 8901 if required. -32 -32 -Retirement savings contributions credit. Attach -Form 8880. -33 -33 -Add lines 29 through 33. These are your total credits. -Subtract line 34 from line 28. If line 34 is more than line 28, enter -0-. -Advance earned income credit payments from Form(s) W-2, box 9. -Add lines 35 and 36. This is your total tax. -Federal income tax withheld from Forms W-2 and 1099. -38 -38 -2007 estimated tax payments and amount -applied from 2006 return. -39 -Earned income credit (EIC). +D +A M P +Form 8880. +O N +D +Education credits. Attach Form 8863. +Child tax credit (see page 35). Attach +Form 8901 if required. +32 +32 +O +O +Retirement savings contributions credit. Attach +33 +33 +Add lines 29 through 33. These are your total credits. +Subtract line 34 from line 28. If line 34 is more than line 28, enter -0-. +Advance earned income credit payments from Form(s) W-2, box 9. +Add lines 35 and 36. This is your total tax. +Federal income tax withheld from Forms W-2 and 1099. +38 +38 +2007 estimated tax payments and amount +applied from 2006 return. +39 +40a +39 +40a +Earned income credit (EIC). +b Nontaxable combat pay election. 40b - + + Single or Married filing separately, $5,350 + Married filing jointly or Qualifying widow(er), $10,700 + Head of household, $7,850 + - 24 25 + 24 25 26 + + 24 25 + + + - 26 + 26 - (cid:2) - 27 28 + 27 + (cid:2) + 27 28 + 28 29 + 30 + 31 32 - A - X - E - 34 35 36 37 38 39 + 33 + X + E - 34 35 36 37 - D + 34 35 36 37 38 39 + 34 35 36 37 - (cid:2) + (cid:2) - 39 40a - - If you have a qualifying child, attach Schedule EIC. + + If you have a qualifying child, attach Schedule EIC. - 40a + 40a - b Nontaxable combat pay election. 40b - 41 42 43 - 41 - (cid:2) 42 + 41 + 41 42 43 + (cid:2) 42 - Refund + Refund - number Account number Amount of line 43 you want applied to your 2008 estimated tax. Amount you owe. Subtract line 42 from line 37. For details on how to pay, see page 53. Estimated tax penalty (see page 53). Additional child tax credit. Attach Form 8812. Add lines 38, 39, 40a, and 41. These are your total payments. If line 42 is more than line 37, subtract line 37 from line 42. This is the amount you overpaid. Amount of line 43 you want refunded to you. If Form 8888 is attached, check here (cid:2) Routing c -c -Type: -Type: -Checking -Checking -Savings -Savings -(cid:2) -(cid:2) -number -Account -number -Amount of line 43 you want applied to your -2008 estimated tax. -45 -45 -Amount you owe. Subtract line 42 from line 37. For details on how -to pay, see page 53. -Estimated tax penalty (see page 53). -47 -47 - + number Additional child tax credit. Attach Form 8812. Add lines 38, 39, 40a, and 41. These are your total payments. If line 42 is more than line 37, subtract line 37 from line 42. This is the amount you overpaid. Amount of line 43 you want refunded to you. If Form 8888 is attached, check here (cid:2) Routing (cid:2) +(cid:2) +c +c +Type: +Type: +Checking +Checking +Savings +Savings +number + - 43 44a - 44a (cid:2) b - Direct deposit? See page 52 and fill in 44b, 44c, and 44d or Form 8888. + 43 44a @@ -568,49 +539,63 @@ O N - (cid:2) d + 44a (cid:2) b + Direct deposit? See page 52 and fill in 44b, 44c, and 44d or Form 8888. + (cid:2) d + number Amount of line 43 you want applied to your 2008 estimated tax. Amount you owe. Subtract line 42 from line 37. For details on how to pay, see page 53. Estimated tax penalty (see page 53). Account number +Amount of line 43 you want applied to your +2008 estimated tax. +45 +45 +Amount you owe. Subtract line 42 from line 37. For details on how +to pay, see page 53. +Estimated tax penalty (see page 53). +47 +47 + - 45 + + 45 - 46 - Amount you owe - 46 - (cid:2) - 47 + 46 + Amount you owe + (cid:2) + 46 + 47 - Yes. Complete the following. - No - Do you want to allow another person to discuss this return with the IRS (see page 54)? - Third party designee Sign here Joint return? See page 15. Keep a copy for your records. Paid preparer’s use only + Yes. Complete the following. + No + Do you want to allow another person to discuss this return with the IRS (see page 54)? + Third party designee - name (cid:2) Under penalties of perjury, I declare that I have examined this return and accompanying schedules and statements, and to the best of my knowledge and belief, they are true, correct, and accurately list all amounts and sources of income I received during the tax year. Declaration of preparer (other than the taxpayer) is based on all information of which the preparer has any knowledge. Your signature Designee’s Phone no. Personal identification number (PIN) (cid:2) ( -(cid:2) ( -) -) -name -(cid:2) -(cid:2) -(cid:2) -Under penalties of perjury, I declare that I have examined this return and accompanying schedules and statements, and to the best of my -knowledge and belief, they are true, correct, and accurately list all amounts and sources of income I received during the tax year. Declaration -of preparer (other than the taxpayer) is based on all information of which the preparer has any knowledge. -Your occupation -Your occupation -Your signature -Date -Date + (cid:2) name Under penalties of perjury, I declare that I have examined this return and accompanying schedules and statements, and to the best of my knowledge and belief, they are true, correct, and accurately list all amounts and sources of income I received during the tax year. Declaration of preparer (other than the taxpayer) is based on all information of which the preparer has any knowledge. Your signature Designee’s Phone no. Personal identification number (PIN) (cid:2) +(cid:2) ( +(cid:2) ( +(cid:2) +(cid:2) +) +) +name +Under penalties of perjury, I declare that I have examined this return and accompanying schedules and statements, and to the best of my +knowledge and belief, they are true, correct, and accurately list all amounts and sources of income I received during the tax year. Declaration +of preparer (other than the taxpayer) is based on all information of which the preparer has any knowledge. +Your occupation +Your occupation +Your signature +Date +Date @@ -618,52 +603,55 @@ O N - (cid:2) - Daytime phone number + Sign here Joint return? See page 15. Keep a copy for your records. + (cid:2) - Spouse’s occupation + Spouse’s occupation + Daytime phone number - ( - ) + ( + ) - Spouse’s signature. If a joint return, both must sign. - Date - (cid:2) + Spouse’s signature. If a joint return, both must sign. + Date + (cid:2) - Date - Preparer’s SSN or PTIN - Preparer’s signature - Check if self-employed + Date + Preparer’s SSN or PTIN + Paid preparer’s use only + Preparer’s signature + Check if self-employed - (cid:2) + (cid:2) - Firm’s name (or yours if self-employed), address, and ZIP code - EIN + Firm’s name (or yours if self-employed), address, and ZIP code + EIN - ( - ) Form 1040A (2007) - Phone no. + ( + ) + Phone no. - - - - - - - + Form 1040A (2007) + + + + + + + - Printed on recycled paper + Printed on recycled paper diff --git a/tests/saved_output/bug28_output.xml b/tests/saved_output/bug28_output.xml index 2d91ca1..75d920f 100644 --- a/tests/saved_output/bug28_output.xml +++ b/tests/saved_output/bug28_output.xml @@ -1,33 +1,31 @@ - - - PUBLISHED OPINIONS - KENTUCKY SUPREME COURT - MAY 2015 - - I. CRIMINAL LAW: - - A. Jeremy Caraway v. Commonwealth of Kentucky - - 2013-SC-000610-MR - - - - - May 14, 2015 - - Opinion of the Court by Justice Noble Affirming. All sitting; all concur. Caraway was convicted of various sex offenses and was sentenced to 20 years’ imprisonment. In affirming his convictions and sentence, the Court held that Caraway had accepted a juror’s qualifications during voir dire, thereby waiving any objection to the alleged partiality of the juror, and was thus barred from seeking appellate review on those grounds; that his direct appeal ineffective assistance of counsel claim was premature; that the trial court’s refusal to hear additional testimony at the sentencing hearing after the penalty phase of trial was not error and did not deny Caraway of meaningful judicial sentencing; and that, in light of the 2011 amendments to KRS 532.120(3), the trial court was not required or authorized to order credit for time served in custody before sentencing. - - B. Jose Lopez v. Commonwealth of Kentucky - - 2013-SC-000795-MR - - - - - May 14, 2015 - - Opinion of the Court by Justice Keller. All sitting; all concur. . Lopez was convicted of rape, incest, sexual abuse, and unlawful transaction with a minor. His convictions arose from a sexual relationship he admitted to having with his under 16-year-old stepdaughter. On appeal, Lopez primarily argued that he did not receive pre-trial due process or a fair trial because he was not provided a qualified translator. The Court noted that Lopez raised a number of issues regarding what constitutes a qualified translator. However, because Lopez had not properly preserved those issues and had not shown how he was harmed by any error related to the translations, the Court did not substantively address them. Lopez also argued that his statement, which was taken in the presence of a translator and contained the translator's translation, should have been excluded as hearsay. The Court held that Lopez's translated statement was an admissible statement against interest and the fact that a translator was involved did not alter the nature of the statement. During the penalty phase, the jurors indicated that they could not agree regarding the length of certain sentences; however, they had agreed that any sentences should run concurrently. The trial court declared a deadlock and imposed sentences that ran consecutively rather than concurrently. Because Lopez had not preserved the issue, the Court looked for palpable error, which it could not find. Finally, Lopez argued that testimony by his stepdaughter about uncharged sexual activity amounted to impermissible KRE 404(b) evidence. The Court held, as it did in Noel v. Commonwealth, 76 S.W.3d 923 (Ky. 2002), that evidence regarding similar acts perpetrated against the same victim are almost always admissible to prove intent, plan, or absence of mistake. Therefore, the complained of testimony was properly admitted. - 1 - + + + PUBLISHED OPINIONS KENTUCKY SUPREME COURT MAY 2015 + + I. CRIMINAL LAW: + + A. Jeremy Caraway v. Commonwealth of Kentucky + + 2013-SC-000610-MR + + + + + May 14, 2015 + + Opinion of the Court by Justice Noble Affirming. All sitting; all concur. Caraway was convicted of various sex offenses and was sentenced to 20 years’ imprisonment. In affirming his convictions and sentence, the Court held that Caraway had accepted a juror’s qualifications during voir dire, thereby waiving any objection to the alleged partiality of the juror, and was thus barred from seeking appellate review on those grounds; that his direct appeal ineffective assistance of counsel claim was premature; that the trial court’s refusal to hear additional testimony at the sentencing hearing after the penalty phase of trial was not error and did not deny Caraway of meaningful judicial sentencing; and that, in light of the 2011 amendments to KRS 532.120(3), the trial court was not required or authorized to order credit for time served in custody before sentencing. + + B. Jose Lopez v. Commonwealth of Kentucky + + 2013-SC-000795-MR + + + + + May 14, 2015 + + Opinion of the Court by Justice Keller. All sitting; all concur. . Lopez was convicted of rape, incest, sexual abuse, and unlawful transaction with a minor. His convictions arose from a sexual relationship he admitted to having with his under 16-year-old stepdaughter. On appeal, Lopez primarily argued that he did not receive pre-trial due process or a fair trial because he was not provided a qualified translator. The Court noted that Lopez raised a number of issues regarding what constitutes a qualified translator. However, because Lopez had not properly preserved those issues and had not shown how he was harmed by any error related to the translations, the Court did not substantively address them. Lopez also argued that his statement, which was taken in the presence of a translator and contained the translator's translation, should have been excluded as hearsay. The Court held that Lopez's translated statement was an admissible statement against interest and the fact that a translator was involved did not alter the nature of the statement. During the penalty phase, the jurors indicated that they could not agree regarding the length of certain sentences; however, they had agreed that any sentences should run concurrently. The trial court declared a deadlock and imposed sentences that ran consecutively rather than concurrently. Because Lopez had not preserved the issue, the Court looked for palpable error, which it could not find. Finally, Lopez argued that testimony by his stepdaughter about uncharged sexual activity amounted to impermissible KRE 404(b) evidence. The Court held, as it did in Noel v. Commonwealth, 76 S.W.3d 923 (Ky. 2002), that evidence regarding similar acts perpetrated against the same victim are almost always admissible to prove intent, plan, or absence of mistake. Therefore, the complained of testimony was properly admitted. + 1 + diff --git a/tests/test_main.py b/tests/test_main.py index 044c3e7..1f4007e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,24 +1,15 @@ -# to run: -# python setup.py test -# -# to debug: -# pip install nose -# nosetests --pdb - import sys +import tempfile + import pdfquery from pdfquery.cache import FileCache from .utils import BaseTestCase -### helpers ### - - - class TestPDFQuery(BaseTestCase): """ - Various tests based on the IRS_1040A sample doc. + Various tests based on the IRS_1040A sample doc. """ @classmethod @@ -28,122 +19,145 @@ def setUpClass(cls): def test_xml_conversion(self): """ - Test that converted XML hasn't changed from saved version. + Test that converted XML hasn't changed from saved version. """ self.assertValidOutput(self.pdf, "IRS_1040A_output") def test_selectors(self): """ - Test the :contains and :in_bbox selectors. + Test the :contains and :in_bbox selectors. """ - label = self.pdf.pq('LTTextLineHorizontal:contains("Your first name ' - 'and initial")') + label = self.pdf.pq( + 'LTTextLineHorizontal:contains("Your first name ' 'and initial")' + ) self.assertEqual(len(label), 1) - left_corner = float(label.attr('x0')) + left_corner = float(label.attr("x0")) self.assertEqual(left_corner, 143.651) - bottom_corner = float(label.attr('y0')) + bottom_corner = float(label.attr("y0")) self.assertEqual(bottom_corner, 714.694) - name = self.pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % - (left_corner, - bottom_corner - 30, - left_corner + 150, - bottom_corner) - ).text() + name = self.pdf.pq( + 'LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' + % (left_corner, bottom_corner - 30, left_corner + 150, bottom_corner) + ).text() self.assertEqual(name, "John E.") def test_extract(self): """ - Test the extract() function. + Test the extract() function. """ - values = self.pdf.extract([ - ('with_parent', 'LTPage[pageid="1"]'), - ('with_formatter', 'text'), - - ('last_name', 'LTTextLineHorizontal:in_bbox("315,680,395,700")'), - ('spouse', 'LTTextLineHorizontal:in_bbox("170,650,220,680")'), - - ('with_parent', 'LTPage[pageid="2"]'), - - ('oath', 'LTTextLineHorizontal:contains("perjury")', - lambda match: match.text()[:30] + "..."), - ('year', 'LTTextLineHorizontal:contains("Form 1040A (")', - lambda match: int(match.text()[-5:-1])) - ]) + values = self.pdf.extract( + [ + ("with_parent", 'LTPage[pageid="1"]'), + ("with_formatter", "text"), + ("last_name", 'LTTextLineHorizontal:in_bbox("315,680,395,700")'), + ("spouse", 'LTTextLineHorizontal:in_bbox("170,650,220,680")'), + ("with_parent", 'LTPage[pageid="2"]'), + ( + "oath", + 'LTTextLineHorizontal:contains("perjury")', + lambda match: match.text()[:30] + "...", + ), + ( + "year", + 'LTTextLineHorizontal:contains("Form 1040A (")', + lambda match: int(match.text()[-5:-1]), + ), + ] + ) - self.assertDictEqual(values, { - 'last_name': 'Michaels', - 'spouse': 'Susan R.', - 'oath': u'Under penalties of perjury, I ...', - 'year': 2007 - }) + self.assertDictEqual( + values, + { + "last_name": "Michaels", + "spouse": "Susan R.", + "oath": "Under penalties of perjury, I ...", + "year": 2007, + }, + ) def test_page_numbers(self): - self.assertEqual(self.pdf.tree.getroot()[0].get('page_label'), '1') + self.assertEqual(self.pdf.tree.getroot()[0].get("page_label"), "1") class TestDocInfo(BaseTestCase): - def test_docinfo(self): - doc_info_results = [ - ["tests/samples/bug11.pdf", - {'Producer': 'Mac OS X 10.9.3 Quartz PDFContext', - 'Title': u'\u262d\U0001f61c\U0001f4a9Unicode is fun!', - 'Author': 'Russkel', 'Creator': 'Firefox', - 'ModDate': "D:20140528141914+08'00'", - 'CreationDate': 'D:20140528061106Z', 'Subject': ''}], - ["tests/samples/bug15.pdf", - {'Producer': 'Mac OS X 10.9.3 Quartz PDFContext', - 'Author': 'Brepols Publishers', - 'Creator': 'PDFsharp 1.2.1269-g (www.pdfsharp.com)', - 'AAPL_Keywords': '["Brepols", "Publishers", "CTLO"]', - 'Title': 'Exporter', - 'ModDate': "D:20140614192741Z00'00'", - 'Keywords': 'Brepols, Publishers, CTLO', - 'CreationDate': "D:20140614192741Z00'00'", - 'Subject': 'Extrait de la Library of Latin Texts - Series A'}], - ["tests/samples/bug17.pdf", - {'CreationDate': 'D:20140328164512Z', - 'Creator': 'Adobe InDesign CC (Macintosh)', - 'ModDate': 'D:20140328164513Z', - 'Producer': 'Adobe PDF Library 10.0.1', 'Trapped': '/False'}] + [ + "tests/samples/bug11.pdf", + { + "Producer": "Mac OS X 10.9.3 Quartz PDFContext", + "Title": "\u262d\U0001f61c\U0001f4a9Unicode is fun!", + "Author": "Russkel", + "Creator": "Firefox", + "ModDate": "D:20140528141914+08'00'", + "CreationDate": "D:20140528061106Z", + "Subject": "", + }, + ], + [ + "tests/samples/bug15.pdf", + { + "Producer": "Mac OS X 10.9.3 Quartz PDFContext", + "Author": "Brepols Publishers", + "Creator": "PDFsharp 1.2.1269-g (www.pdfsharp.com)", + "AAPL_Keywords": '["Brepols", "Publishers", "CTLO"]', + "Title": "Exporter", + "ModDate": "D:20140614192741Z00'00'", + "Keywords": "Brepols, Publishers, CTLO", + "CreationDate": "D:20140614192741Z00'00'", + "Subject": "Extrait de la Library of Latin Texts - Series A", + }, + ], + [ + "tests/samples/bug17.pdf", + { + "CreationDate": "D:20140328164512Z", + "Creator": "Adobe InDesign CC (Macintosh)", + "ModDate": "D:20140328164513Z", + "Producer": "Adobe PDF Library 10.0.1", + "Trapped": "/False", + }, + ], ] for file_path, expected_results in doc_info_results: pdf = pdfquery.PDFQuery(file_path) pdf.load(None) docinfo = dict(pdf.tree.getroot().attrib) - self.assertDictEqual(docinfo,expected_results) + self.assertDictEqual(docinfo, expected_results) class TestUnicode(BaseTestCase): - def test_unicode_text(self): pdf = pdfquery.PDFQuery("tests/samples/bug18.pdf") pdf.load() self.assertEqual( pdf.pq('LTTextLineHorizontal:contains("Hop Hing Oils")').text(), - (u'5 Hop Hing Oils and Fats (Hong Kong) Ltd \uf06c ' - u'\u7279\u5bf6\u7cbe\u88fd\u8c6c\u6cb9') + ( + "5 Hop Hing Oils and Fats (Hong Kong) Ltd \uf06c " + "\u7279\u5bf6\u7cbe\u88fd\u8c6c\u6cb9" + ), ) def test_invalid_xml_characters(self): pdf = pdfquery.PDFQuery("tests/samples/bug39.pdf") - pdf.load(2) # throws error if we fail to strip ascii control characters -- see issue #39 + pdf.load( + 2 + ) # throws error if we fail to strip ascii control characters -- see issue #39 class TestAnnotations(BaseTestCase): """ - Ensure that annotations such as links are getting added to the PDFs - properly, as discussed in issue #28. + Ensure that annotations such as links are getting added to the PDFs + properly, as discussed in issue #28. """ def test_xml_conversion(self): """ - Test that converted XML hasn't changed from saved version. + Test that converted XML hasn't changed from saved version. """ pdf = pdfquery.PDFQuery("tests/samples/bug28.pdf") pdf.load() @@ -151,9 +165,42 @@ def test_xml_conversion(self): def test_annot_dereferencing(self): """ - See issues #37, #42. + See issues #37, #42. """ pdf = pdfquery.PDFQuery("tests/samples/bug37.pdf") pdf.load() pdf = pdfquery.PDFQuery("tests/samples/bug42.pdf") pdf.load() + + +class TestPageRange(BaseTestCase): + """ + Test various page number parameters + """ + + @classmethod + def setUpClass(cls): + cache_dir = "{}/".format(tempfile.gettempdir()) + cls.pdf = pdfquery.PDFQuery( + "tests/samples/bug67.pdf", parse_tree_cacher=FileCache(cache_dir) + ) + + def test_page_int(self): + self.pdf.load(3) + self.assertEqual(len(self.pdf.pq("LTPage")), 1) + self.pdf.load(0, 10, 25, 49) + self.assertEqual(len(self.pdf.pq("LTPage")), 4) + + def test_page_array(self): + self.pdf.load([0, 7, 11]) + self.assertEqual(len(self.pdf.pq("LTPage")), 3) + self.pdf.load([10], [0, 12], [30, 40]) + self.assertEqual(len(self.pdf.pq("LTPage")), 5) + + def test_page_mixed(self): + self.pdf.load([0, 7, 11], [0, 44], 1) + self.assertEqual(len(self.pdf.pq("LTPage")), 6) + + def test_page_range(self): + self.pdf.load(range(0, 150)) + self.assertEqual(len(self.pdf.pq("LTPage")), 150) diff --git a/tests/utils.py b/tests/utils.py index c4ea4c6..c959017 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,27 +1,17 @@ -from lxml import etree +import unittest +from lxml import etree from six import BytesIO -import sys - -if sys.version_info[:2] < (2, 7): - import unittest2 as unittest -else: - import unittest # ignore index= attribute in xml comparison, as it is not stable between python versions -IGNORE_ATTRIBS = {'index'} +IGNORE_ATTRIBS = {"index"} -class BaseTestCase(unittest.TestCase): +class BaseTestCase(unittest.TestCase): def assertValidOutput(self, pdf, output_name): """ - Test that converted XML hasn't changed from saved version. + Test that converted XML hasn't changed from saved version. """ - # Just skip this test if we're on python 2.6 -- float handling makes element sort ordering unpredictable, - # causing intermittent test failures. - if sys.version_info[:2] < (2, 7): - return - # get current XML for sample file tree_string = BytesIO() pdf.tree.write(tree_string, pretty_print=True, encoding="utf-8") @@ -31,7 +21,7 @@ def assertValidOutput(self, pdf, output_name): # this varies by Python version, because the float handling isn't quite # the same comparison_file = "tests/saved_output/%s.xml" % (output_name,) - with open(comparison_file, 'rb') as f: + with open(comparison_file, "rb") as f: saved_string = f.read() # compare current to previous @@ -43,19 +33,30 @@ def assertValidOutput(self, pdf, output_name): out.write(tree_string) # for debugging: run `pytest --lf --pdb` and then use etree.dump(e1), etree.dump(e2) e1, e2 = e.args[1:3] - raise self.failureException("XML conversion of sample pdf has changed! Compare %s to %s" % (comparison_file, output_path)) from e + raise self.failureException( + "XML conversion of sample pdf has changed! Compare %s to %s" + % (comparison_file, output_path) + ) from e def xml_strings_equal(self, s1, s2, ignore_attribs=IGNORE_ATTRIBS): """ - Return true if two xml strings are semantically equivalent (ignoring attribute ordering and whitespace). + Return true if two xml strings are semantically equivalent (ignoring attribute ordering and whitespace). """ + # via http://stackoverflow.com/a/24349916/307769 def elements_equal(e1, e2): - if e1.tag != e2.tag: raise self.failureException("Mismatched tags", e1, e2) - if e1.text != e2.text: raise self.failureException("Mismatched text", e1, e2) - if e1.tail != e2.tail: raise self.failureException("Mismatched tail", e1, e2) - if set(e1.attrib) - ignore_attribs != set(e2.attrib) - ignore_attribs: raise self.failureException("Mismatched attributes %s and %s" % (e1.attrib, e2.attrib), e1, e2) - if len(e1) != len(e2): raise self.failureException("Mismatched children", e1, e2) + if e1.tag != e2.tag: + raise self.failureException("Mismatched tags", e1, e2) + if e1.text != e2.text: + raise self.failureException("Mismatched text", e1, e2) + if e1.tail != e2.tail: + raise self.failureException("Mismatched tail", e1, e2) + if set(e1.attrib) - ignore_attribs != set(e2.attrib) - ignore_attribs: + raise self.failureException( + "Mismatched attributes %s and %s" % (e1.attrib, e2.attrib), e1, e2 + ) + if len(e1) != len(e2): + raise self.failureException("Mismatched children", e1, e2) for c1, c2 in zip(e1, e2): elements_equal(c1, c2)