|
16 | 16 | pdfminer's layout is used. |
17 | 17 | """ |
18 | 18 | import logging |
19 | | -from decimal import Decimal |
20 | 19 | from typing import List, Union |
21 | 20 |
|
22 | | -from libpdf import textbox |
23 | | -from libpdf import utils |
| 21 | +from libpdf import textbox, utils |
24 | 22 | from libpdf.catalog import catalog |
25 | 23 | from libpdf.log import logging_needed |
26 | 24 | from libpdf.models.figure import Figure |
27 | 25 | from libpdf.models.page import Page |
28 | 26 | from libpdf.models.position import Position |
29 | | -from libpdf.models.table import Cell |
30 | | -from libpdf.models.table import Table |
| 27 | +from libpdf.models.table import Cell, Table |
31 | 28 | from libpdf.parameters import LA_PARAMS |
32 | 29 | from libpdf.progress import bar_format_lvl2, tqdm |
33 | 30 | from libpdf.utils import from_pdfplumber_bbox, lt_to_libpdf_hbox_converter |
@@ -65,17 +62,21 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]): |
65 | 62 | 'explicit_vertical_lines': [], |
66 | 63 | 'explicit_horizontal_lines': [], |
67 | 64 | 'snap_tolerance': 3, |
| 65 | + "snap_x_tolerance": 3, |
| 66 | + "snap_y_tolerance": 3, |
68 | 67 | 'join_tolerance': 3, |
| 68 | + "join_x_tolerance": 3, |
| 69 | + "join_y_tolerance": 3, |
69 | 70 | 'edge_min_length': 3, |
70 | 71 | 'min_words_vertical': 3, |
71 | 72 | 'min_words_horizontal': 1, |
72 | | - 'keep_blank_chars': False, |
| 73 | + #'keep_blank_chars': False, |
73 | 74 | 'text_tolerance': 3, |
74 | 75 | 'text_x_tolerance': 2, |
75 | 76 | 'text_y_tolerance': 2, |
76 | 77 | 'intersection_tolerance': 3, |
77 | | - 'intersection_x_tolerance': None, |
78 | | - 'intersection_y_tolerance': None, |
| 78 | + 'intersection_x_tolerance': 3, |
| 79 | + 'intersection_y_tolerance': 3, |
79 | 80 | } |
80 | 81 |
|
81 | 82 | table_dict = {'page': {}} |
@@ -157,7 +158,7 @@ def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page |
157 | 158 | row_cell[1], |
158 | 159 | row_cell[2], |
159 | 160 | row_cell[3], |
160 | | - Decimal(lt_page.height), |
| 161 | + lt_page.height, |
161 | 162 | ) |
162 | 163 | pos_cell = Position(pos_cell_bbox[0], pos_cell_bbox[1], pos_cell_bbox[2], pos_cell_bbox[3], page) |
163 | 164 | # extract cell text |
|
0 commit comments