Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions src/Smalot/PdfParser/PDFObject.php
Original file line number Diff line number Diff line change
Expand Up @@ -788,16 +788,26 @@ public function getTextArray(?Page $page = null): array
break;
}

// If the PDFObject is an Image or a Form, do nothing as
// neither of these XObject types is text.
if ($xobject instanceof Image || $xobject instanceof Form) {
// If the PDFObject is an Image, do nothing as images
// aren't text.
if ($xobject instanceof Image) {
break;
}

// Check this is not a circular reference.
if (!\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
$text[] = $xobject->getText($page);
if (\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
break;
}

$objectText = $xobject->getText($page);

// If the PDFObject is a Form and doesn't have any text,
// skip it.
if ($xobject instanceof Form && $objectText === ' ') {
break;
}

$text[] = $objectText;
break;

// Marked content point with (DP) & without (MP) property list
Expand Down
31 changes: 25 additions & 6 deletions tests/PHPUnit/Unit/PDFObjectTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
namespace PHPUnitTests\Unit;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Config;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Element;
use Smalot\PdfParser\Element\ElementArray;
Expand Down Expand Up @@ -33,8 +34,10 @@ public function testTextArrayObjects(): void
$document = new Document();
$document->init();

$config = new Config();
$image = new Image($document);
$form = new Form($document);
$formNoText = new Form($document);
$formWithText = new Form($document, null, 'BT /F1 12 Tf 10 10 Td (Form text) Tj ET', $config);
$xObject = new PDFObject($document);

$header1 = new Header([
Expand All @@ -50,30 +53,46 @@ public function testTextArrayObjects(): void
$header2 = new Header([
'Resources' => new Header([
'XObject' => new Header([
'Fr0' => $form,
'Fr0' => $formNoText,
])
]),
'Contents' => new ElementArray([new Element('/Fr0 Do', $document)], $document),
]);
$page2 = new Page($document, $header2);

$header3 = new Header([
'Resources' => new Header([
'XObject' => new Header([
'Fr0' => $formWithText,
])
]),
'Contents' => new ElementArray([new Element('/Fr0 Do', $document)], $document),
]);
$page3 = new Page($document, $header3);

$header4 = new Header([
'Resources' => new Header([
'XObject' => new Header([
'Ps0' => $xObject,
])
]),
'Contents' => new ElementArray([new Element('/Ps0 Do', $document)], $document),
]);
$page3 = new Page($document, $header3);
$page4 = new Page($document, $header4);

// Page 1 contains an image, which should not appear in the text array.
self::assertSame([], $page1->getTextArray());

// Page 2 contains a form, which should not appear in the text array.
// Page 2 contains a form that contains no text, which should not appear
// in the text array.
self::assertSame([], $page2->getTextArray());

// Page 3 contains a non-image object, which should appear in the text array.
self::assertSame([' '], $page3->getTextArray());
// Page 3 contains a form that contains text, which should appear in the
// text array.
self::assertSame(['Form text '], $page3->getTextArray());

// Page 4 contains a non-image object, which should appear in the text
// array.
self::assertSame([' '], $page4->getTextArray());
}
}