Skip to content

Commit b096dfb

Browse files
committed
fix dependencies from pillow branch and fix processing of pdf files
1 parent 9cf07dc commit b096dfb

7 files changed

Lines changed: 41 additions & 43 deletions

File tree

libpdf/catalog.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from libpdf.progress import bar_format_lvl2, tqdm
99
from libpdf.utils import decode_title, to_pdfplumber_bbox
1010

11-
from pdfminer.pdftypes import PDFObjRef
11+
from pdfminer.pdftypes import PDFObjRef, resolve1
1212
from pdfminer.psparser import PSLiteral
1313

1414

@@ -48,7 +48,9 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches
4848
if isinstance(pdf_catalog['Names'], PDFObjRef) and 'Dests' in pdf_catalog['Names'].resolve():
4949
name_tree = pdf_catalog['Names'].resolve()['Dests'].resolve()
5050
elif isinstance(pdf_catalog['Names'], dict) and 'Dests' in pdf_catalog['Names']:
51-
name_tree = pdf_catalog['Names']['Dests'].resolve()
51+
name_tree = resolve1(pdf_catalog['Names']['Dests'])
52+
#name_tree = pdf_catalog['Names']['Dests'].resolve()
53+
#LOG.debug(f"{name_tree}")
5254
# check if name tree not empty
5355
if name_tree:
5456
# map page id to page number
@@ -405,7 +407,7 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): #
405407
page.height,
406408
)
407409
page_crop = page.within_bbox(ann_bbox)
408-
ann_text = page_crop.extract_text(x_tolerance=1, y_tolerance=4)
410+
ann_text = page_crop.extract_text(x_tolerance=float(1), y_tolerance=float(4))
409411

410412
if 'A' in ann_resolved:
411413
# make sure ann_resolved['A'] is resolved

libpdf/extract.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ def extract_figures(
536536
lt_page = page._layout # pylint: disable=protected-access # easiest way to obtain LTPage
537537

538538
# check and filter figures
539-
figures = check_and_filter_figures(page_crop.figures)
539+
figures = check_and_filter_figures(page_crop.objects['figure']) if 'figure' in page_crop.objects else []
540540

541541
if len(figures) != 0:
542542
for idx_figure, figure in enumerate(figures):

libpdf/process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,9 @@ def remove_page_header_footer(single_page):
4040
page_crop = single_page.within_bbox(
4141
(
4242
0,
43-
decimal.Decimal(parameters.PAGE_CROP_MARGINS['top']),
43+
parameters.PAGE_CROP_MARGINS['top'],
4444
single_page.width,
45-
single_page.height - decimal.Decimal(parameters.PAGE_CROP_MARGINS['bottom']),
45+
single_page.height - parameters.PAGE_CROP_MARGINS['bottom'],
4646
),
4747
)
4848

libpdf/tables.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,15 @@
1616
pdfminer's layout is used.
1717
"""
1818
import logging
19-
from decimal import Decimal
2019
from typing import List, Union
2120

22-
from libpdf import textbox
23-
from libpdf import utils
21+
from libpdf import textbox, utils
2422
from libpdf.catalog import catalog
2523
from libpdf.log import logging_needed
2624
from libpdf.models.figure import Figure
2725
from libpdf.models.page import Page
2826
from libpdf.models.position import Position
29-
from libpdf.models.table import Cell
30-
from libpdf.models.table import Table
27+
from libpdf.models.table import Cell, Table
3128
from libpdf.parameters import LA_PARAMS
3229
from libpdf.progress import bar_format_lvl2, tqdm
3330
from libpdf.utils import from_pdfplumber_bbox, lt_to_libpdf_hbox_converter
@@ -65,17 +62,21 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]):
6562
'explicit_vertical_lines': [],
6663
'explicit_horizontal_lines': [],
6764
'snap_tolerance': 3,
65+
"snap_x_tolerance": 3,
66+
"snap_y_tolerance": 3,
6867
'join_tolerance': 3,
68+
"join_x_tolerance": 3,
69+
"join_y_tolerance": 3,
6970
'edge_min_length': 3,
7071
'min_words_vertical': 3,
7172
'min_words_horizontal': 1,
72-
'keep_blank_chars': False,
73+
#'keep_blank_chars': False,
7374
'text_tolerance': 3,
7475
'text_x_tolerance': 2,
7576
'text_y_tolerance': 2,
7677
'intersection_tolerance': 3,
77-
'intersection_x_tolerance': None,
78-
'intersection_y_tolerance': None,
78+
'intersection_x_tolerance': 3,
79+
'intersection_y_tolerance': 3,
7980
}
8081

8182
table_dict = {'page': {}}
@@ -157,7 +158,7 @@ def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page
157158
row_cell[1],
158159
row_cell[2],
159160
row_cell[3],
160-
Decimal(lt_page.height),
161+
lt_page.height,
161162
)
162163
pos_cell = Position(pos_cell_bbox[0], pos_cell_bbox[1], pos_cell_bbox[2], pos_cell_bbox[3], page)
163164
# extract cell text

libpdf/textbox.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@
2727
pdfminer sees y0 and y1 from the bottom of the page, so y0 is smaller than y1.
2828
All coordinates are given in points where 72 points are 1 inch.
2929
"""
30+
from difflib import SequenceMatcher
3031
import logging
3132
import re
32-
from difflib import SequenceMatcher
3333
from typing import Dict, List, Tuple, Union
3434

3535
from libpdf import parameters
@@ -51,14 +51,7 @@
5151
)
5252
from libpdf.progress import bar_format_lvl2, tqdm
5353
from libpdf.utils import lt_page_crop, lt_to_libpdf_hbox_converter, textbox_crop
54-
55-
from pdfminer.layout import (
56-
LTAnno,
57-
LTChar,
58-
LTText,
59-
LTTextBox,
60-
LTTextLineHorizontal,
61-
)
54+
from pdfminer.layout import LTAnno, LTChar, LTText, LTTextBox, LTTextLineHorizontal
6255

6356

6457
LOG = logging.getLogger(__name__)
@@ -877,9 +870,9 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]:
877870
if logging_needed(idx_page, len(pdf.pages)):
878871
LOG.debug('Extracting layout page %s of %s', idx_page + 1, len(pdf.pages))
879872

880-
pdf.interpreter.process_page(page.page_obj)
881-
result = pdf.device.get_result()
882-
lt_textboxes = [obj for obj in result if isinstance(obj, LTTextBox)]
873+
# pdf.interpreter.process_page(page.page_obj)
874+
layout_objects = page.layout._objs
875+
lt_textboxes = [obj for obj in layout_objects if isinstance(obj, LTTextBox)]
883876
# remove detected header and footer lt_textboxes based on given page crop margin parameter
884877
filter_lt_textboxes = list(
885878
filter(

libpdf/utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -164,10 +164,10 @@ def to_pdfplumber_bbox(x0, y0, x1, y1, page_height):
164164
:return: [x0, top, x1, bottom]
165165
"""
166166
# pylint: disable=invalid-name # short is better here
167-
ret_x0 = Decimal(x0)
168-
ret_y0 = Decimal(Decimal(page_height) - Decimal(y1))
169-
ret_x1 = Decimal(x1)
170-
ret_y1 = Decimal(Decimal(page_height) - Decimal(y0))
167+
ret_x0 = x0
168+
ret_y0 = page_height - y1
169+
ret_x1 = x1
170+
ret_y1 = page_height - y0
171171
return [ret_x0, ret_y0, ret_x1, ret_y1]
172172

173173

pyproject.toml

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,24 +38,26 @@ include = [
3838

3939
[tool.poetry.dependencies]
4040
python = "^3.7"
41-
chardet = "^4"
42-
click = "^8"
41+
chardet = "*"
42+
click = "*"
4343
importlib-metadata = { version = "^1.6.0", python = "~3.7" }
44-
PyYAML = "^6"
45-
"ruamel.yaml" = "^0.17"
44+
PyYAML = "*"
45+
"ruamel.yaml" = "*"
46+
pdfplumber = "*"
47+
"pdfminer.six" = "*"
4648

4749
# optional deps for progress bars
48-
tqdm = { version = "^4.50.0", optional = true }
49-
colorama = { version = "^0.4.4", optional = true }
50+
tqdm = { version = "*", optional = true }
51+
colorama = { version = "*", optional = true }
5052

5153
# dependencies needed by pdfminer.six and pdfplumber which are delivered as wheels in the deps folder
5254
# the libs were patched and no upstream PR has been raised yet
5355
# see [tool.poetry.dev-dependencies] for the forked libraries
54-
pycryptodome = "^3.9.9"
56+
pycryptodome = "*"
5557
sortedcontainers = "^2.3.0"
56-
pillow = "~9.0.1" # cannot go higher because of this bug https://github.com/jsvine/pdfplumber/discussions/637
57-
unicodecsv = "^0.14.1"
58-
wand = "^0.6.5"
58+
pillow = "*"
59+
unicodecsv = "*"
60+
wand = "*"
5961

6062
# optional deps for docs, needed to make RTD work with pyproject.toml
6163
# see https://github.com/readthedocs/readthedocs.org/issues/4912#issuecomment-664002569
@@ -73,8 +75,8 @@ docs = ["sphinx", "sphinx_rtd_theme", "sphinxcontrib-plantuml"]
7375
# they must be dev deps to make them invisible to PyPI and the egg/wheel requires section
7476
# 'poetry install' installs those by default, so the Git deps are actually used instead of the wheels in the deps
7577
# folder; see also libpdf/_import_forks.py
76-
pdfplumber = { git = "https://github.com/useblocks/pdfplumber.git" }
77-
"pdfminer.six" = {git = "https://github.com/useblocks/pdfminer.six", rev = "develop"}
78+
# pdfplumber = { git = "https://github.com/useblocks/pdfplumber.git" }
79+
# "pdfminer.six" = {git = "https://github.com/useblocks/pdfminer.six", rev = "develop"}
7880

7981
# testing
8082
pytest = "^7"

0 commit comments

Comments
 (0)