From 3fb806830e24060e061f0236af54861cf3f609fa Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 28 Mar 2025 09:19:08 +0100 Subject: [PATCH 1/2] NUTCH-3110 Upgrade to Tika 3.1.0 Upgrade to shaded Tika packages 3.1.0.0 provided by Tim Allison. The shaded packages are required to avoid version conflicts when running in distributed mode caused by incompatible versions of the commons-io jar shipped with Hadoop and required by Tika, cf. NUTCH-2959. --- ivy/ivy.xml | 2 +- src/plugin/language-identifier/ivy.xml | 2 +- src/plugin/language-identifier/plugin.xml | 2 +- src/plugin/parse-tika/ivy.xml | 2 +- src/plugin/parse-tika/plugin.xml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index f402d8365c..70614f8fd6 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -74,7 +74,7 @@ - + diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml index 17aba47e92..149d6c1479 100644 --- a/src/plugin/language-identifier/ivy.xml +++ b/src/plugin/language-identifier/ivy.xml @@ -37,7 +37,7 @@ - + diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml index 94929bdbfa..a4625d0603 100644 --- a/src/plugin/language-identifier/plugin.xml +++ b/src/plugin/language-identifier/plugin.xml @@ -26,7 +26,7 @@ - + diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index 6d96ed3cd9..f4aa049a9a 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -37,7 +37,7 @@ - + diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index 04afb9faca..8761748932 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -25,7 +25,7 @@ - + From 76ced9b18f9af04764a23ffa9ac811a70517da0f Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 28 Mar 2025 09:28:10 +0100 Subject: [PATCH 2/2] NUTCH-3110 Upgrade to Tika 3.1.0 Add "text/javascript" as MIME type supported by "parse-js". 
Note: this fixes the parse-js unit tests. Tika 3.1.0 identifies the JavaScript test document as "text/javascript" instead of "application/javascript". --- src/plugin/parse-js/plugin.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugin/parse-js/plugin.xml b/src/plugin/parse-js/plugin.xml index e55195a460..ac044ce7ca 100644 --- a/src/plugin/parse-js/plugin.xml +++ b/src/plugin/parse-js/plugin.xml @@ -36,7 +36,7 @@ point="org.apache.nutch.parse.Parser"> - + @@ -45,7 +45,7 @@ point="org.apache.nutch.parse.HtmlParseFilter"> - +