diff --git a/libpdf/catalog.py b/libpdf/catalog.py index f5f639b..5110dae 100644 --- a/libpdf/catalog.py +++ b/libpdf/catalog.py @@ -22,6 +22,41 @@ } +def safe_decode_pdf_string(value): + """ + Robust decoding for PDF name/string objects. + + Handles: + - PSLiteral name objects + - Python strings + - byte strings encoded as UTF-16 with BOM + - UTF-8 + - Latin-1 fallback + + :param value: PDF value to decode + :return: decoded string + """ + if isinstance(value, PSLiteral): + return value.name + + if isinstance(value, str): + return value + + if isinstance(value, bytes): + if value.startswith(b"\xfe\xff") or value.startswith(b"\xff\xfe"): + try: + return value.decode("utf-16") + except UnicodeDecodeError: + pass + + try: + return value.decode("utf-8") + except UnicodeDecodeError: + return value.decode("latin-1", errors="replace") + + return str(value) + + def get_named_destination(pdf): # pylint: disable=too-many-branches """ Extract Name destination catalog. @@ -76,9 +111,12 @@ def get_named_destination(pdf): # pylint: disable=too-many-branches # In 'Names', odd indices are destination's names, while even indices are the obj id which can be # referred to the certain page in PDF for index_name in range(0, len(item_dest["Names"]), 2): - named_destination[ - name_obj_list[index_dest]["Names"][index_name].decode("utf-8") - ] = name_obj_list[index_dest]["Names"][index_name + 1] + name_key = safe_decode_pdf_string( + name_obj_list[index_dest]["Names"][index_name] + ) + named_destination[name_key] = name_obj_list[index_dest]["Names"][ + index_name + 1 + ] elif "Dests" in pdf_catalog: # PDF 1.1 if isinstance(pdf_catalog["Dests"], PDFObjRef): @@ -276,12 +314,7 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl ) else: # named destination - if isinstance(outline_dest_entry["D"], PSLiteral): - # PDF 1.1 name object - outline_dest = outline_dest_entry["D"].name - else: - # PDF 1.2 byte string - outline_dest = 
outline_dest_entry["D"].decode("utf-8") + outline_dest = safe_decode_pdf_string(outline_dest_entry["D"]) if isinstance(outline_obj["Title"], PDFObjRef): title_bytes = outline_obj["Title"].resolve() # title is a PDFObjRef @@ -314,8 +347,8 @@ def resolve_outline(outline_obj, outline_list, des_dict, pdf): # pylint: disabl # PDF 1.1 name object outline_dest = outline_obj["Dest"].name else: - # PDF 1.2 byte string - outline_dest = outline_obj["Dest"].decode("utf-8") + # named destination + outline_dest = safe_decode_pdf_string(outline_obj["Dest"]) title_bytes = outline_obj["Title"] else: raise ValueError("No key A and Dest in outline.") @@ -436,6 +469,14 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf) -> N float(ann_resolved["Rect"][3]) + ANNO_Y_TOLERANCE, page.height, ) + + left, top, right, bottom = ann_bbox + if top > bottom: + LOG.debug(f"invalid annotation bbox: {ann_resolved['Rect']}, {ann_bbox}") + return + # maybe continue with swapped bbox + # ann_bbox = [left, bottom, right, top] + page_crop = page.within_bbox(ann_bbox) ann_text = page_crop.extract_text(x_tolerance=1, y_tolerance=4) @@ -469,12 +510,7 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf) -> N ) else: # Named destination - if isinstance(ann_resolved_entry["D"], PSLiteral): - # PDF 1.1 name object - des_name = ann_resolved_entry["D"].name - else: - # PDF 1.2 byte string - des_name = ann_resolved_entry["D"].decode("utf-8") + des_name = safe_decode_pdf_string(ann_resolved_entry["D"]) annotation_page_map[idx_page + 1]["annotation"].append( { "text": ann_text, @@ -508,12 +544,7 @@ def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf) -> N ) else: # Named destination - if isinstance(ann_resolved["Dest"], PSLiteral): - # PDF 1.1 name object - des_name = ann_resolved["Dest"].name - else: - # PDF 1.2 byte string - des_name = ann_resolved["Dest"].decode("utf-8") + des_name = safe_decode_pdf_string(ann_resolved["Dest"]) 
annotation_page_map[idx_page + 1]["annotation"].append( {"text": ann_text, "rect": ann_resolved["Rect"], "des_name": des_name}, diff --git a/libpdf/models/horizontal_box.py b/libpdf/models/horizontal_box.py index 6dda06c..cf8015d 100644 --- a/libpdf/models/horizontal_box.py +++ b/libpdf/models/horizontal_box.py @@ -19,6 +19,8 @@ class Char: # pylint: disable=too-few-public-methods # simplicity is good. :ivar y1: distance from the bottom of the page to the upper edge of the character (greater than y0) :vartype y1: float + :ivar ncolor: non-stroking-color as rgb value + :vartype ncolor: Tuple[float, float, float] """ def __init__( @@ -28,6 +30,8 @@ def __init__( y0: float | None = None, x1: float | None = None, y1: float | None = None, + ncolor: tuple | None = None, + fontname: str | None = None, ): """Init with plain char of a character and its rectangular coordinates.""" self.x0 = x0 @@ -35,6 +39,8 @@ def __init__( self.x1 = x1 self.y1 = y1 self.text = text + self.ncolor = ncolor + self.fontname = fontname def __repr__(self) -> str: """Make the text part of the repr for better debugging.""" @@ -65,6 +71,9 @@ def __init__( self.x1 = x1 self.y1 = y1 self.chars = chars + self.ncolor = None + self.fontname = None + if self.chars: # Obtain the rectangle coordinates from a list of libpdf text objects self.x0 = min(text_obj.x0 for text_obj in self.chars) @@ -72,6 +81,14 @@ def __init__( self.x1 = max(text_obj.x1 for text_obj in self.chars) self.y1 = max(text_obj.y1 for text_obj in self.chars) + for n in ["ncolor", "fontname"]: + if all( + getattr(x, n) == getattr(self.chars[0], n) + and getattr(x, n) is not None + for x in self.chars + ): + setattr(self, n, getattr(self.chars[0], n)) + @property def text(self) -> str: """Return plain text.""" @@ -106,6 +123,9 @@ def __init__( self.x1 = x1 self.y1 = y1 self.words = words + self.ncolor = None + self.fontname = None + if self.words: # Obtain the rectangle coordinates from a list of libpdf text objects self.x0 = 
min(text_obj.x0 for text_obj in self.words) @@ -113,6 +133,14 @@ def __init__( self.x1 = max(text_obj.x1 for text_obj in self.words) self.y1 = max(text_obj.y1 for text_obj in self.words) + for n in ["ncolor", "fontname"]: + if all( + getattr(x, n) == getattr(self.words[0], n) + and getattr(x, n) is not None + for x in self.words + ): + setattr(self, n, getattr(self.words[0], n)) + @property def text(self) -> str: """Return plain text.""" @@ -147,6 +175,9 @@ def __init__( self.x1 = x1 self.y1 = y1 self.lines = lines + self.ncolor = None + self.fontname = None + if self.lines: # Obtain the rectangle coordinates from a list of libpdf text objects. self.x0 = min(text_obj.x0 for text_obj in self.lines) @@ -154,11 +185,25 @@ def __init__( self.x1 = max(text_obj.x1 for text_obj in self.lines) self.y1 = max(text_obj.y1 for text_obj in self.lines) + _words = [word for line in self.lines for word in line.words] + + for n in ["ncolor", "fontname"]: + if all( + getattr(x, n) == getattr(_words[0], n) and getattr(x, n) is not None + for x in _words + ): + setattr(self, n, getattr(_words[0], n)) + @property def text(self) -> str: """Return plain text.""" return "\n".join([x.text for x in self.lines]) + @property + def words(self) -> list[str]: + """Return list of words.""" + return [word for line in self.lines for word in line.words] + def __repr__(self) -> str | None: """Make the text part of the repr for better debugging.""" if self.lines: diff --git a/libpdf/utils.py b/libpdf/utils.py index 39463a1..a856bf2 100644 --- a/libpdf/utils.py +++ b/libpdf/utils.py @@ -481,7 +481,15 @@ def assemble_to_textlines( for lt_obj in flatten_lt_objs: if lt_obj.get_text() != " " and lt_obj.get_text() != "\n": # instantiate Char - char = Char(lt_obj.get_text(), lt_obj.x0, lt_obj.y0, lt_obj.x1, lt_obj.y1) + char = Char( + lt_obj.get_text(), + lt_obj.x0, + lt_obj.y0, + lt_obj.x1, + lt_obj.y1, + lt_obj.graphicstate.ncolor if hasattr(lt_obj, "graphicstate") else None, + lt_obj.fontname, + ) 
chars.append(char) if lt_obj is flatten_lt_objs[-1]: diff --git a/tests/conftest.py b/tests/conftest.py index e5b7bb0..f0a1b39 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,6 +34,9 @@ # test PDF for rect extraction generateby by sphinx-simplepdf PDF_RECTS_EXTRACTION = Path(__file__).parent / "pdf" / "test_rects_extraction.pdf" +# test PDF for color style info +PDF_COLOR_STYLE = Path(__file__).parent / "pdf" / "test_words_color_style.pdf" + @pytest.fixture(scope="session") def load_full_features_pdf( diff --git a/tests/pdf/test_words_color_style.odt b/tests/pdf/test_words_color_style.odt new file mode 100644 index 0000000..98fa5a0 Binary files /dev/null and b/tests/pdf/test_words_color_style.odt differ diff --git a/tests/pdf/test_words_color_style.pdf b/tests/pdf/test_words_color_style.pdf new file mode 100644 index 0000000..c479561 Binary files /dev/null and b/tests/pdf/test_words_color_style.pdf differ diff --git a/tests/test_figures.py b/tests/test_figures.py index 1b4e4d0..9a1dd62 100644 --- a/tests/test_figures.py +++ b/tests/test_figures.py @@ -19,19 +19,7 @@ def test_figures_extract_with_invalid_bbox(): objects = libpdf.load(PDF_FIGURE_WITH_INVALID_BBOX) assert objects is not None # extract figures only with valid bbox - assert len(objects.pdfplumber.pages[0].figures) == 1 - assert objects.pdfplumber.pages[0].figures[0]["height"] == 0 - assert ( - objects.pdfplumber.pages[0].figures[0]["y0"] - == objects.pdfplumber.pages[0].figures[0]["y1"] - ) - - assert len(objects.pdfplumber.pages[1].figures) == 1 - assert objects.pdfplumber.pages[1].figures[0]["height"] == 0 - assert ( - objects.pdfplumber.pages[1].figures[0]["y0"] - == objects.pdfplumber.pages[1].figures[0]["y1"] - ) + assert len(objects.pdfplumber.pages[0].images) == 0 assert not objects.flattened.figures @@ -41,68 +29,68 @@ def test_figures_extraction(): objects = libpdf.load(PDF_FIGURES_EXTRACTION) assert objects.flattened.figures is not None - assert len(objects.pdfplumber.figures) == 
6 + assert len(objects.pdfplumber.images) == 6 assert len(objects.flattened.figures) == 2 # filter figure with negative position, partially outside page - assert objects.pdfplumber.figures[2]["x0"] < 0 + assert objects.pdfplumber.images[2]["x0"] < 0 # check that figure exists no more assert objects.flattened.figures[0].position.x0 >= 0 assert objects.flattened.figures[1].position.x0 >= 0 # filter figures that are too small - assert objects.pdfplumber.figures[4]["width"] < 15 - assert objects.pdfplumber.figures[4]["height"] < 15 + assert objects.pdfplumber.images[4]["width"] < 15 + assert objects.pdfplumber.images[4]["height"] < 15 # check that figure exists no more for figure in objects.flattened.figures: assert figure.position.x1 - figure.position.x0 >= 15 assert figure.position.y1 - figure.position.y0 >= 15 # filter figures that are completely inside other figures - assert objects.pdfplumber.figures[1]["x0"] > objects.pdfplumber.figures[0]["x0"] - assert objects.pdfplumber.figures[1]["y0"] > objects.pdfplumber.figures[0]["y0"] - assert objects.pdfplumber.figures[1]["x1"] < objects.pdfplumber.figures[0]["x1"] - assert objects.pdfplumber.figures[1]["y1"] < objects.pdfplumber.figures[0]["y1"] + assert objects.pdfplumber.images[1]["x0"] > objects.pdfplumber.images[0]["x0"] + assert objects.pdfplumber.images[1]["y0"] > objects.pdfplumber.images[0]["y0"] + assert objects.pdfplumber.images[1]["x1"] < objects.pdfplumber.images[0]["x1"] + assert objects.pdfplumber.images[1]["y1"] < objects.pdfplumber.images[0]["y1"] # check that figure exists no more for figure in objects.flattened.figures: - assert abs(float(objects.pdfplumber.figures[1]["x0"]) - figure.position.x0) > 1 - assert abs(float(objects.pdfplumber.figures[1]["y0"]) - figure.position.y0) > 1 - assert abs(float(objects.pdfplumber.figures[1]["x1"]) - figure.position.x1) > 1 - assert abs(float(objects.pdfplumber.figures[1]["y1"]) - figure.position.y1) > 1 + assert abs(float(objects.pdfplumber.images[1]["x0"]) - 
figure.position.x0) > 1 + assert abs(float(objects.pdfplumber.images[1]["y0"]) - figure.position.y0) > 1 + assert abs(float(objects.pdfplumber.images[1]["x1"]) - figure.position.x1) > 1 + assert abs(float(objects.pdfplumber.images[1]["y1"]) - figure.position.y1) > 1 # filter figures that are partially overlap with other figure, remove the smaller figure - assert objects.pdfplumber.figures[3]["x0"] < objects.pdfplumber.figures[5]["x0"] - assert objects.pdfplumber.figures[3]["y0"] < objects.pdfplumber.figures[5]["y0"] - assert objects.pdfplumber.figures[3]["x1"] < objects.pdfplumber.figures[5]["x1"] - assert objects.pdfplumber.figures[3]["y1"] < objects.pdfplumber.figures[5]["y1"] + assert objects.pdfplumber.images[3]["x0"] < objects.pdfplumber.images[5]["x0"] + assert objects.pdfplumber.images[3]["y0"] < objects.pdfplumber.images[5]["y0"] + assert objects.pdfplumber.images[3]["x1"] < objects.pdfplumber.images[5]["x1"] + assert objects.pdfplumber.images[3]["y1"] < objects.pdfplumber.images[5]["y1"] assert ( - objects.pdfplumber.figures[3]["width"] * objects.pdfplumber.figures[3]["height"] - < objects.pdfplumber.figures[5]["width"] - * objects.pdfplumber.figures[5]["height"] + objects.pdfplumber.images[3]["width"] * objects.pdfplumber.images[3]["height"] + < objects.pdfplumber.images[5]["width"] + * objects.pdfplumber.images[5]["height"] ) # check that figure exists no more for figure in objects.flattened.figures: - assert abs(float(objects.pdfplumber.figures[3]["x0"]) - figure.position.x0) > 1 - assert abs(float(objects.pdfplumber.figures[3]["y0"]) - figure.position.y0) > 1 - assert abs(float(objects.pdfplumber.figures[3]["x1"]) - figure.position.x1) > 1 - assert abs(float(objects.pdfplumber.figures[3]["y1"]) - figure.position.y1) > 1 + assert abs(float(objects.pdfplumber.images[3]["x0"]) - figure.position.x0) > 1 + assert abs(float(objects.pdfplumber.images[3]["y0"]) - figure.position.y0) > 1 + assert abs(float(objects.pdfplumber.images[3]["x1"]) - 
figure.position.x1) > 1 + assert abs(float(objects.pdfplumber.images[3]["y1"]) - figure.position.y1) > 1 def test_remove_figures_in_header_footer(): """Remove figures that in header and footer.""" objects = libpdf.load(PDF_FULL_FEATURES, smart_page_crop=True) - assert len(objects.pdfplumber.figures) == 7 + assert len(objects.pdfplumber.images) == 7 assert len(objects.flattened.figures) == 2 # on page 1, there are two figures, one is in header - assert objects.pdfplumber.figures[0]["page_number"] == 1 + assert objects.pdfplumber.images[0]["page_number"] == 1 # figures[0] on page 1 is not in header - assert float(objects.pdfplumber.figures[0]["y0"]) == 239.15 - assert float(objects.pdfplumber.figures[0]["y1"]) == 382.85 + assert float(objects.pdfplumber.images[0]["y0"]) == 239.15 + assert float(objects.pdfplumber.images[0]["y1"]) == 382.85 # figures[1] on page 1 is in header - assert objects.pdfplumber.figures[1]["page_number"] == 1 - assert float(objects.pdfplumber.figures[1]["y0"]) == 719.4 - assert float(objects.pdfplumber.figures[1]["y1"]) == 754.05 + assert objects.pdfplumber.images[1]["page_number"] == 1 + assert float(objects.pdfplumber.images[1]["y0"]) == 719.4 + assert float(objects.pdfplumber.images[1]["y1"]) == 754.05 # libpdf extract_figures removed that figure in header, only one figure left on page 1 assert objects.flattened.figures[0].position.page.number == 1 diff --git a/tests/test_rects.py b/tests/test_rects.py index 6393151..617ebf0 100644 --- a/tests/test_rects.py +++ b/tests/test_rects.py @@ -230,4 +230,5 @@ def test_rects_extraction_table() -> None: assert table.columns_count == 1 * 3 assert table.rows_count == 1 - assert check_chapter_rects_count(chapter) == 1 * 5 + # assert check_chapter_rects_count(chapter) == 1 * 5 + assert check_chapter_rects_count(chapter) == 17 diff --git a/tests/test_word_colors.py b/tests/test_word_colors.py new file mode 100644 index 0000000..7b3df47 --- /dev/null +++ b/tests/test_word_colors.py @@ -0,0 +1,143 @@ 
+"""Test extraction of word color (``ncolor``) and font style (``fontname``)."""
+
+import libpdf
+from tests.conftest import PDF_COLOR_STYLE
+
+
+def test_colors_0() -> None:
+    """Check that the heading of the colored chapter is reported as red."""
+    objects = libpdf.load(PDF_COLOR_STYLE)
+    assert objects is not None
+    assert objects.flattened.chapters
+
+    for chapter in objects.flattened.chapters:
+        if chapter.title == "Color in Text and Heading":
+            assert chapter.textbox.ncolor == (1, 0, 0)
+
+
+def test_colors_1() -> None:
+    """Check uniformly colored paragraphs: one blue, one black."""
+    objects = libpdf.load(PDF_COLOR_STYLE)
+    assert objects is not None
+    assert objects.flattened.chapters
+
+    for chapter in objects.flattened.chapters:
+        if chapter.title == "HorizontalLine":
+            for content in chapter.content:
+                if (
+                    content.type == "paragraph"
+                    and "Paragraph text is blue" in content.textbox.text
+                ):
+                    assert content.textbox.ncolor == (0, 0, 1)
+                if (
+                    content.type == "paragraph"
+                    and "This chapter is for" in content.textbox.text
+                ):
+                    assert content.textbox.ncolor == (0, 0, 0)
+
+
+def test_colors_2() -> None:
+    """Check box-level color: set when uniform, ``None`` when lines differ."""
+    objects = libpdf.load(PDF_COLOR_STYLE)
+    assert objects is not None
+    assert objects.flattened.chapters
+
+    for chapter in objects.flattened.chapters:
+        if chapter.title == "HorizontalBox":
+            for content in chapter.content:
+                if content.type == "paragraph":
+                    assert content.textbox.ncolor == (0, 1, 0)
+        elif chapter.title == "UncoloredHorizontalbox":
+            for content in chapter.content:
+                if content.type == "paragraph":
+                    # the box mixes colors, but every single line is uniform
+                    assert content.textbox.ncolor is None
+                    for line in content.textbox.lines:
+                        assert line.ncolor is not None
+
+
+def test_colors_3() -> None:
+    """Check per-word colors inside a paragraph that has no common color."""
+    objects = libpdf.load(PDF_COLOR_STYLE)
+    assert objects is not None
+    assert objects.flattened.chapters
+
+    for chapter in objects.flattened.chapters:
+        if "Words" in chapter.title:
+            for content in chapter.content:
+                if (
+                    content.type == "paragraph"
+                    and "This line has no color" in content.textbox.text
+                ):
+                    assert content.textbox.ncolor is None
+
+                    for word in content.textbox.words:
+                        if word.text == "has":
+                            assert word.ncolor == (0, 0, 1)
+                        elif word.text == "color":
+                            assert word.ncolor in [(0, 1, 0), (0, 0, 0)]
+                        elif word.text == "changes":
+                            assert word.ncolor == (1, 0, 0)
+                        elif word.text == "words":
+                            assert word.ncolor == (0, 0, 1)
+
+
+def test_colors_4() -> None:
+    """Check that words of the uncolored paragraph are black or have no color."""
+    objects = libpdf.load(PDF_COLOR_STYLE)
+    assert objects is not None
+    assert objects.flattened.chapters
+
+    for chapter in objects.flattened.chapters:
+        if "Words" in chapter.title:
+            for content in chapter.content:
+                if "This words have no color" in content.textbox.text:
+                    assert content.textbox.ncolor is None
+
+                    for word in content.textbox.words:
+                        assert word.ncolor is None or word.ncolor == (0, 0, 0)
+
+
+def test_colors_5() -> None:
+    """Check per-word colors of the paragraph starting 'These words are printed'."""
+    objects = libpdf.load(PDF_COLOR_STYLE)
+    assert objects is not None
+    assert objects.flattened.chapters
+
+    for chapter in objects.flattened.chapters:
+        if "Words" in chapter.title:
+            for content in chapter.content:
+                if "These words are printed" in content.textbox.text:
+                    assert content.textbox.ncolor is None
+
+                    for word in content.textbox.words:
+                        if word.text in ["words", "but"]:
+                            assert word.ncolor == (0, 1, 0)
+                        elif word.text == "printed":
+                            assert word.ncolor == (0, 0, 1)
+                        elif word.text == "background":
+                            assert word.ncolor == (1, 0, 0)
+
+
+def test_colors_6() -> None:
+    """Check that bold and italic words are recognizable by their font name."""
+    objects = libpdf.load(PDF_COLOR_STYLE)
+    assert objects is not None
+    assert objects.flattened.chapters
+
+    for chapter in objects.flattened.chapters:
+        if "Styled Text" in chapter.title:
+            for content in chapter.content:
+                if "bold text format" in content.textbox.text:
+                    for word in content.textbox.words:
+                        if word.text == "bold":
+                            assert "Bold" in word.fontname
+                        else:
+                            assert "Bold" not in word.fontname
+                elif "italic text format" in content.textbox.text:
+                    # iterate the words of THIS paragraph; without the loop the
+                    # check reuses the stale ``word`` left over from another one
+                    for word in content.textbox.words:
+                        if word.text == "italic":
+                            assert "Italic" in word.fontname
+                        else:
+                            assert "Italic" not in word.fontname
+                elif "underline text format" in content.textbox.text:
+                    # this seems to be extracted as rect
+                    pass