From 27bc0c58fb644a89cf014a76c40d6b5676ed2050 Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke@apache.org>
Date: Tue, 5 May 2026 07:31:25 +0900
Subject: [PATCH 1/6] fix(extractor): defend against archive bomb / recursion /
 Zip Slip

Adds defensive limits to ZipExtractor, TarExtractor and LhaExtractor so
that a malicious content stream can no longer cause unbounded resource
consumption or write outside its conceptual extraction root.

Threat model assumption: the InputStream is untrusted; the params Map is
admin-configured / trusted.

Protections:
- AbstractExtractor: extractorDepth param + checkDepth/getCurrentDepth/
  incrementDepth helpers, plus configurable maxArchiveDepth (default 10).
  Recursive archive-in-archive expansions now propagate an incremented
  depth and abort with MaxLengthExceededException at the threshold.
- ZipExtractor: per-archive caps (maxBytes 2 GiB, maxEntries 100k,
  maxCompressionRatio 100:1 for entries > 1 MiB), Zip Slip path-traversal
  rejection via Path.normalize(), early ZIP signature validation, and
  configurable filenameEncoding (UTF-8 / CP932 / MS932) for non-UTF8
  archive names. Compression ratio uses the central directory size when
  available and falls back to bytes consumed from a CountingInputStream.
- TarExtractor: maxBytes / maxEntries caps, depth check, path-traversal
  rejection, and explicit skipping of symbolic-link and hard-link entries
  to avoid leaking content from outside the archive sandbox.
- LhaExtractor: maxBytes / maxEntries caps, depth check, path-traversal
  rejection.

Tests:
- AbstractExtractorTest: new coverage for getCurrentDepth /
  incrementDepth (returns NEW map, leaves input untouched) / checkDepth
  pass and fail paths.
- ArchiveExtractorSecurityTest: 15 synthetic-archive tests covering each
  threat (zip byte / entry / ratio bombs, Zip Slip, absolute paths,
  recursion-depth, CP932 filenames, tar symlink/hardlink/path-traversal,
  tar byte/entry caps and depth, lha depth).

All previously-passing extractor tests continue to pass; failures in the
suite are limited to environment-dependent tests (Docker for SMB / S3 /
GCS / Storage clients, LibreOffice for JodExtractor) which are unchanged
by this commit.
---
 .../extractor/impl/AbstractExtractor.java     |  87 ++++
 .../crawler/extractor/impl/LhaExtractor.java  |  88 +++-
 .../crawler/extractor/impl/TarExtractor.java  | 164 ++++++-
 .../crawler/extractor/impl/ZipExtractor.java  | 267 +++++++++-
 .../extractor/impl/AbstractExtractorTest.java |  96 ++++
 .../impl/ArchiveExtractorSecurityTest.java    | 457 ++++++++++++++++++
 6 files changed, 1119 insertions(+), 40 deletions(-)
 create mode 100644 fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java

diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
index 14759e30..bb30c235 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
@@ -18,10 +18,13 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import org.codelibs.fess.crawler.container.CrawlerContainer;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
+import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
 import org.codelibs.fess.crawler.extractor.Extractor;
 import org.codelibs.fess.crawler.extractor.ExtractorFactory;
 import org.codelibs.fess.crawler.helper.MimeTypeHelper;
@@ -48,6 +51,14 @@
  */
 public abstract class AbstractExtractor implements Extractor {
 
+    /**
+     * Parameter key used to track the recursion depth across nested archive
+     * extraction. Callers/recursive extractor invocations may set this to
+     * limit how deeply nested archives are unpacked. The value is parsed as
+     * an integer; missing or unparseable values are treated as depth 0.
+     */
+    public static final String EXTRACTOR_DEPTH_KEY = "extractorDepth";
+
     /** The crawler container. */
     @Resource
     protected CrawlerContainer crawlerContainer;
@@ -55,6 +66,14 @@ public abstract class AbstractExtractor implements Extractor {
     /** The weight of this extractor. */
     protected int weight = 1;
 
+    /**
+     * Maximum allowed depth for recursive archive extraction. When the depth
+     * value parsed from {@link #EXTRACTOR_DEPTH_KEY} reaches this threshold,
+     * {@link #checkDepth(Map, int)} aborts further recursion to defend
+     * against recursion-bomb archives.
+     */
+    protected int maxArchiveDepth = 10;
+
     /**
      * Constructs a new AbstractExtractor.
      */
@@ -62,6 +81,74 @@ public AbstractExtractor() {
         // NOP
     }
 
+    /**
+     * Sets the maximum allowed recursion depth for nested archive extraction.
+     * @param maxArchiveDepth the new maximum depth (non-negative)
+     */
+    public void setMaxArchiveDepth(final int maxArchiveDepth) {
+        this.maxArchiveDepth = maxArchiveDepth;
+    }
+
+    /**
+     * Returns the current recursion depth recorded in the extractor params.
+     * Missing, blank, or unparseable values are treated as {@code 0}.
+     *
+     * @param params the extractor parameters (may be {@code null})
+     * @return the parsed depth, or {@code 0} if not set
+     */
+    protected int getCurrentDepth(final Map<String, String> params) {
+        if (params == null) {
+            return 0;
+        }
+        final String value = params.get(EXTRACTOR_DEPTH_KEY);
+        if (value == null || value.isBlank()) {
+            return 0;
+        }
+        try {
+            final int depth = Integer.parseInt(value.trim());
+            return depth < 0 ? 0 : depth;
+        } catch (final NumberFormatException e) {
+            return 0;
+        }
+    }
+
+    /**
+     * Returns a NEW parameter map (the original is not mutated) with the
+     * recursion depth incremented by one. Useful when an archive extractor
+     * recursively delegates to another extractor for a nested archive entry.
+     *
+     * @param params the current extractor parameters (may be {@code null})
+     * @return a new map containing all original entries plus an incremented
+     *         depth
+     */
+    protected Map<String, String> incrementDepth(final Map<String, String> params) {
+        final Map<String, String> next = new HashMap<>();
+        if (params != null) {
+            next.putAll(params);
+        }
+        next.put(EXTRACTOR_DEPTH_KEY, Integer.toString(getCurrentDepth(params) + 1));
+        return next;
+    }
+
+    /**
+     * Validates that the recursion depth recorded in {@code params} does not
+     * meet or exceed {@code maxDepth}. Throws {@link MaxLengthExceededException}
+     * (a {@link org.codelibs.fess.crawler.exception.CrawlingAccessException
+     * CrawlingAccessException}) when the threshold is reached so that the
+     * surrounding crawler treats it as a data-driven access failure rather
+     * than a system error.
+     *
+     * @param params the extractor parameters (may be {@code null})
+     * @param maxDepth the (exclusive) maximum allowed depth
+     * @throws MaxLengthExceededException when {@code currentDepth >= maxDepth}
+     */
+    protected void checkDepth(final Map<String, String> params, final int maxDepth) {
+        final int current = getCurrentDepth(params);
+        if (current >= maxDepth) {
+            throw new MaxLengthExceededException("Archive recursion depth exceeded: depth=" + current + " max=" + maxDepth);
+        }
+    }
+
     @Override
     public int getWeight() {
         return weight;
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
index 7a5ff29a..70e2cced 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
@@ -19,8 +19,10 @@
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.InvalidPathException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Enumeration;
-import java.util.HashMap;
 import java.util.Map;
 
 import org.apache.logging.log4j.LogManager;
@@ -45,6 +47,11 @@
  * This extractor can extract text content from files within LHA archives
  * by using appropriate extractors for each contained file type.
  *
+ * <p>
+ * Defends against decompression / many-entry / recursion bombs and Zip Slip
+ * style path traversal in entry names.
+ * </p>
+ *
  * @author shinsuke
  */
 public class LhaExtractor extends AbstractExtractor {
@@ -54,6 +61,18 @@ public class LhaExtractor extends AbstractExtractor {
     /** Maximum content size for extraction. -1 means no limit. */
     protected long maxContentSize = -1;
 
+    /**
+     * Maximum total uncompressed bytes that may be read from all entries
+     * combined. Defaults to 2 GiB. Set to {@code -1} to disable.
+     */
+    protected long maxBytes = 1L << 31;
+
+    /**
+     * Maximum allowed number of entries to iterate. Defaults to 100,000.
+     * Set to {@code -1} to disable.
+     */
+    protected int maxEntries = 100_000;
+
     /**
      * Creates a new LhaExtractor instance.
      */
@@ -76,6 +95,7 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
         if (in == null) {
             throw new CrawlerSystemException("LHA archive input stream is null. Cannot extract text from null input.");
         }
+        checkDepth(params, maxArchiveDepth);
 
         final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
         final ExtractorFactory extractorFactory = getExtractorFactory();
@@ -93,13 +113,25 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
             @SuppressWarnings("unchecked")
             final Enumeration<LhaHeader> entries = lhaFile.entries();
             long contentSize = 0;
+            int entryCount = 0;
             while (entries.hasMoreElements()) {
                 final LhaHeader head = entries.nextElement();
+                entryCount++;
+                if (maxEntries > 0 && entryCount > maxEntries) {
+                    throw new MaxLengthExceededException("lha entry count exceeded: count=" + entryCount + " max=" + maxEntries);
+                }
                 contentSize += head.getOriginalSize();
+                if (maxBytes > 0 && contentSize > maxBytes) {
+                    throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + contentSize + " max=" + maxBytes);
+                }
                 if (maxContentSize != -1 && contentSize > maxContentSize) {
                     throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
                 }
                 final String filename = head.getPath();
+                if (isPathTraversal(filename)) {
+                    logger.warn("lha entry rejected: name={} reason=path-traversal", filename);
+                    continue;
+                }
                 final String mimeType = mimeTypeHelper.getContentType(null, filename);
                 if (mimeType != null) {
                     final Extractor extractor = extractorFactory.getExtractor(mimeType);
@@ -107,10 +139,12 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                         InputStream is = null;
                         try {
                             is = lhaFile.getInputStream(head);
-                            final Map<String, String> map = new HashMap<>();
+                            final Map<String, String> map = incrementDepth(params);
                             map.put(ExtractData.RESOURCE_NAME_KEY, filename);
                             buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent());
                             buf.append('\n');
+                        } catch (final MaxLengthExceededException e) {
+                            throw e;
                         } catch (final Exception e) {
                             if (logger.isDebugEnabled()) {
                                 logger.debug("Exception in an internal extractor.", e);
@@ -139,6 +173,40 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
         return new ExtractData(buf.toString().trim());
     }
 
+    /**
+     * Returns true when the supplied entry name escapes the conceptual
+     * extraction root via path-traversal segments.
+     *
+     * @param name the entry name as reported by the archive
+     * @return {@code true} if the name should be rejected
+     */
+    protected boolean isPathTraversal(final String name) {
+        if (name == null || name.isEmpty()) {
+            return true;
+        }
+        if (name.startsWith("/") || name.startsWith("\\")) {
+            return true;
+        }
+        if (name.length() >= 2 && name.charAt(1) == ':') {
+            return true;
+        }
+        try {
+            final Path normalised = Paths.get(name).normalize();
+            final String normStr = normalised.toString().replace('\\', '/');
+            if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
+                return true;
+            }
+            for (final Path part : normalised) {
+                if ("..".equals(part.toString())) {
+                    return true;
+                }
+            }
+        } catch (final InvalidPathException ipe) {
+            return true;
+        }
+        return false;
+    }
+
     /**
      * Sets the maximum content size for extraction.
      *
@@ -147,4 +215,20 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
     public void setMaxContentSize(final long maxContentSize) {
         this.maxContentSize = maxContentSize;
     }
+
+    /**
+     * Sets the cap on total uncompressed bytes read from all entries.
+     * @param maxBytes the maximum total bytes (use {@code -1} to disable)
+     */
+    public void setMaxBytes(final long maxBytes) {
+        this.maxBytes = maxBytes;
+    }
+
+    /**
+     * Sets the maximum number of entries that may be iterated.
+     * @param maxEntries the maximum entry count (use {@code -1} to disable)
+     */
+    public void setMaxEntries(final int maxEntries) {
+        this.maxEntries = maxEntries;
+    }
 }
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
index b5bb0238..35f4a8f0 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
@@ -15,13 +15,19 @@
  */
 package org.codelibs.fess.crawler.extractor.impl;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
 import java.io.InputStream;
-import java.util.HashMap;
+import java.nio.file.InvalidPathException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Map;
 
 import org.apache.commons.compress.archivers.ArchiveInputStream;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.io.IOUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.codelibs.fess.crawler.entity.ExtractData;
@@ -30,12 +36,17 @@
 import org.codelibs.fess.crawler.extractor.Extractor;
 import org.codelibs.fess.crawler.extractor.ExtractorFactory;
 import org.codelibs.fess.crawler.helper.MimeTypeHelper;
-import org.codelibs.fess.crawler.util.IgnoreCloseInputStream;
 
 import jakarta.annotation.Resource;
 
 /**
  * Extracts text content from TAR archives.
+ *
+ * <p>
+ * Defends against decompression / many-entry / recursion bombs and Zip Slip
+ * style path traversal. Symbolic and hard link entries are skipped because
+ * they can reference files outside the archive sandbox.
+ * </p>
  */
 public class TarExtractor extends AbstractExtractor {
     private static final Logger logger = LogManager.getLogger(TarExtractor.class);
@@ -47,10 +58,23 @@ public class TarExtractor extends AbstractExtractor {
     protected ArchiveStreamFactory archiveStreamFactory;
 
     /**
-     * Maximum content size.
+     * Maximum content size (legacy field). When set, the actual bytes
+     * read from each entry stream are summed and compared against this limit.
      */
     protected long maxContentSize = -1;
 
+    /**
+     * Maximum total uncompressed bytes that may be read from all entries
+     * combined. Defaults to 2 GiB. Set to {@code -1} to disable.
+     */
+    protected long maxBytes = 1L << 31;
+
+    /**
+     * Maximum allowed number of entries to iterate. Defaults to 100,000.
+     * Set to {@code -1} to disable.
+     */
+    protected int maxEntries = 100_000;
+
     /**
      * Creates a new TarExtractor instance.
      */
@@ -61,10 +85,11 @@ public TarExtractor() {
     @Override
     public ExtractData getText(final InputStream in, final Map<String, String> params) {
         validateInputStream(in);
+        checkDepth(params, maxArchiveDepth);
 
         final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
         final ExtractorFactory extractorFactory = getExtractorFactory();
-        return new ExtractData(getTextInternal(in, mimeTypeHelper, extractorFactory));
+        return new ExtractData(getTextInternal(in, mimeTypeHelper, extractorFactory, params));
     }
 
     /**
@@ -73,36 +98,83 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
      * @param in The input stream.
      * @param mimeTypeHelper The mime type helper.
      * @param extractorFactory The extractor factory.
+     * @param params Extractor parameters used to track recursion depth.
      * @return A text.
      */
-    protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) {
+    protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory,
+            final Map<String, String> params) {
         final StringBuilder buf = new StringBuilder(1000);
         int processedEntries = 0;
         int failedEntries = 0;
 
         try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream("tar", in)) {
-            TarArchiveEntry entry = null;
-            long contentSize = 0;
+            TarArchiveEntry entry;
+            long totalBytes = 0;
+            int entryCount = 0;
             while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
-                contentSize += entry.getSize();
-                if (maxContentSize != -1 && contentSize > maxContentSize) {
-                    throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
+                entryCount++;
+                if (maxEntries > 0 && entryCount > maxEntries) {
+                    throw new MaxLengthExceededException("tar entry count exceeded: count=" + entryCount + " max=" + maxEntries);
                 }
                 final String filename = entry.getName();
+                if (entry.isDirectory()) {
+                    continue;
+                }
+                if (entry.isSymbolicLink() || entry.isLink()) {
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("tar entry skipped: name={} reason=link link={}", filename, entry.getLinkName());
+                    }
+                    continue;
+                }
+                if (isPathTraversal(filename)) {
+                    logger.warn("tar entry rejected: name={} reason=path-traversal", filename);
+                    continue;
+                }
+
+                final long actualBytes;
+                final byte[] entryBytes;
+                try {
+                    final long readLimit;
+                    if (maxBytes > 0) {
+                        readLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
+                    } else {
+                        readLimit = Long.MAX_VALUE;
+                    }
+                    final ByteArrayOutputStream out = new ByteArrayOutputStream();
+                    actualBytes = copyBounded(ais, out, readLimit);
+                    entryBytes = out.toByteArray();
+                } catch (final IOException ioe) {
+                    failedEntries++;
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("Failed to read tar entry: name={}", filename, ioe);
+                    }
+                    continue;
+                }
+
+                totalBytes += actualBytes;
+                if (maxBytes > 0 && totalBytes > maxBytes) {
+                    throw new MaxLengthExceededException("tar uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes);
+                }
+                if (maxContentSize != -1 && totalBytes > maxContentSize) {
+                    throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize);
+                }
+
                 final String mimeType = mimeTypeHelper.getContentType(null, filename);
                 if (mimeType != null) {
                     final Extractor extractor = extractorFactory.getExtractor(mimeType);
                     if (extractor != null) {
                         try {
-                            final Map<String, String> map = new HashMap<>();
+                            final Map<String, String> map = incrementDepth(params);
                             map.put(ExtractData.RESOURCE_NAME_KEY, filename);
-                            buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
+                            buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
                             buf.append('\n');
                             processedEntries++;
+                        } catch (final MaxLengthExceededException e) {
+                            throw e;
                         } catch (final Exception e) {
                             failedEntries++;
                             if (logger.isDebugEnabled()) {
-                                logger.debug("Failed to extract content from archive entry: {}", filename, e);
+                                logger.debug("Failed to extract content from archive entry: name={}", filename, e);
                             }
                         }
                     }
@@ -115,13 +187,61 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                 throw new ExtractException("Failed to extract content from TAR archive. No entries could be processed.", e);
             }
             if (logger.isWarnEnabled()) {
-                logger.warn("Partial extraction from TAR archive. Processed: {}, Failed: {}", processedEntries, failedEntries, e);
+                logger.warn("Partial extraction from TAR archive. processed={} failed={}", processedEntries, failedEntries, e);
             }
         }
 
         return buf.toString().trim();
     }
 
+    /**
+     * Returns true when the supplied entry name escapes the conceptual
+     * extraction root via path-traversal segments.
+     *
+     * @param name the entry name as reported by the archive
+     * @return {@code true} if the name should be rejected
+     */
+    protected boolean isPathTraversal(final String name) {
+        if (name == null || name.isEmpty()) {
+            return true;
+        }
+        if (name.startsWith("/") || name.startsWith("\\")) {
+            return true;
+        }
+        if (name.length() >= 2 && name.charAt(1) == ':') {
+            return true;
+        }
+        try {
+            final Path normalised = Paths.get(name).normalize();
+            final String normStr = normalised.toString().replace('\\', '/');
+            if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
+                return true;
+            }
+            for (final Path part : normalised) {
+                if ("..".equals(part.toString())) {
+                    return true;
+                }
+            }
+        } catch (final InvalidPathException ipe) {
+            return true;
+        }
+        return false;
+    }
+
+    private long copyBounded(final InputStream in, final ByteArrayOutputStream out, final long limit) throws IOException {
+        if (limit <= 0) {
+            return 0;
+        }
+        final byte[] buffer = new byte[8192];
+        long total = 0;
+        int read;
+        while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) {
+            out.write(buffer, 0, read);
+            total += read;
+        }
+        return total;
+    }
+
     /**
      * Sets the maximum content size.
      * @param maxContentSize The maximum content size to set.
@@ -129,4 +249,20 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
     public void setMaxContentSize(final long maxContentSize) {
         this.maxContentSize = maxContentSize;
     }
+
+    /**
+     * Sets the cap on total uncompressed bytes read from all entries.
+     * @param maxBytes the maximum total bytes (use {@code -1} to disable)
+     */
+    public void setMaxBytes(final long maxBytes) {
+        this.maxBytes = maxBytes;
+    }
+
+    /**
+     * Sets the maximum number of entries that may be iterated.
+     * @param maxEntries the maximum entry count (use {@code -1} to disable)
+     */
+    public void setMaxEntries(final int maxEntries) {
+        this.maxEntries = maxEntries;
+    }
 }
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
index a543b3a9..ee4a9370 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
@@ -16,13 +16,19 @@
 package org.codelibs.fess.crawler.extractor.impl;
 
 import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
 import java.io.InputStream;
-import java.util.HashMap;
+import java.nio.file.InvalidPathException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Map;
 
-import org.apache.commons.compress.archivers.ArchiveInputStream;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CountingInputStream;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.codelibs.fess.crawler.entity.ExtractData;
@@ -31,26 +37,64 @@
 import org.codelibs.fess.crawler.extractor.Extractor;
 import org.codelibs.fess.crawler.extractor.ExtractorFactory;
 import org.codelibs.fess.crawler.helper.MimeTypeHelper;
-import org.codelibs.fess.crawler.util.IgnoreCloseInputStream;
-
-import jakarta.annotation.Resource;
 
 /**
  * Extracts text content from ZIP archives.
+ *
+ * <p>
+ * The extractor defends against several content-driven attack vectors. The
+ * input stream itself is treated as untrusted, while the {@code params} map is
+ * assumed to be admin-configured / trusted. Protections include:
+ * </p>
+ * <ul>
+ *   <li>Total uncompressed-size cap ({@link #setMaxBytes(long)})</li>
+ *   <li>Maximum number of entries ({@link #setMaxEntries(int)})</li>
+ *   <li>Per-entry compression-ratio threshold
+ *       ({@link #setMaxCompressionRatio(long)}) to detect zip bombs</li>
+ *   <li>Recursion-depth check (via {@link AbstractExtractor#checkDepth})</li>
+ *   <li>Zip Slip path-traversal detection (entry names normalised and
+ *       rejected when they escape the conceptual extraction root)</li>
+ *   <li>Configurable filename encoding (e.g. {@code "CP932"} /
+ *       {@code "MS932"} for Japanese filenames)</li>
+ * </ul>
  */
 public class ZipExtractor extends AbstractExtractor {
     private static final Logger logger = LogManager.getLogger(ZipExtractor.class);
 
+    /** Threshold below which compression-ratio checks are skipped (bytes). */
+    private static final long COMPRESSION_RATIO_MIN_BYTES = 1L << 20; // 1 MiB
+
+    /**
+     * The maximum content size (legacy field). When set, the actual bytes
+     * read from each entry stream are summed and compared against this limit.
+     */
+    protected long maxContentSize = -1;
+
     /**
-     * The archive stream factory.
+     * Maximum total uncompressed bytes that may be read from all entries
+     * combined. Defaults to 2 GiB. Set to {@code -1} to disable.
      */
-    @Resource
-    protected ArchiveStreamFactory archiveStreamFactory;
+    protected long maxBytes = 1L << 31;
 
     /**
-     * The maximum content size.
+     * Maximum allowed compression ratio (uncompressed / compressed). Entries
+     * exceeding this ratio AND larger than 1 MiB are rejected as suspected
+     * zip bombs. Set to {@code -1} to disable.
      */
-    protected long maxContentSize = -1;
+    protected long maxCompressionRatio = 100L;
+
+    /**
+     * Maximum allowed number of entries to iterate. Defaults to 100,000.
+     * Set to {@code -1} to disable.
+     */
+    protected int maxEntries = 100_000;
+
+    /**
+     * Filename encoding used to decode entry names that lack the UTF-8 flag.
+     * Defaults to {@code "UTF-8"}; set to {@code "CP932"} or {@code "MS932"}
+     * for archives created on Japanese Windows systems.
+     */
+    protected String filenameEncoding = "UTF-8";
 
     /**
      * Creates a new ZipExtractor instance.
@@ -62,6 +106,7 @@ public ZipExtractor() {
     @Override
     public ExtractData getText(final InputStream in, final Map<String, String> params) {
         validateInputStream(in);
+        checkDepth(params, maxArchiveDepth);
 
         final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
         final ExtractorFactory extractorFactory = getExtractorFactory();
@@ -69,30 +114,114 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
         int processedEntries = 0;
         int failedEntries = 0;
 
-        try (final ArchiveInputStream ais =
-                archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) {
-            ZipArchiveEntry entry = null;
-            long contentSize = 0;
-            while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
-                contentSize += entry.getSize();
-                if (maxContentSize != -1 && contentSize > maxContentSize) {
-                    throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
+        final InputStream wrapped = in.markSupported() ? in : new BufferedInputStream(in);
+        // Early-validate the ZIP magic so a clearly non-zip blob is reported
+        // as ExtractException rather than silently returning empty text.
+        wrapped.mark(4);
+        try {
+            final byte[] sig = new byte[4];
+            int read = 0;
+            while (read < 4) {
+                final int n = wrapped.read(sig, read, 4 - read);
+                if (n < 0) {
+                    break;
+                }
+                read += n;
+            }
+            wrapped.reset();
+            if (read != 4 || sig[0] != 'P' || sig[1] != 'K' || (sig[2] != 0x03 && sig[2] != 0x05 && sig[2] != 0x07)) {
+                throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature.");
+            }
+        } catch (final IOException ioe) {
+            throw new ExtractException("Failed to extract content from ZIP archive. No entries could be processed.", ioe);
+        }
+        // CountingInputStream lets us measure the compressed bytes consumed
+        // from the underlying stream per entry, which is the only reliable
+        // signal in streaming mode (ZipArchiveEntry#getCompressedSize() is
+        // often -1 when entries use a data descriptor).
+        final CountingInputStream counter = new CountingInputStream(wrapped);
+        try (final ZipArchiveInputStream ais = new ZipArchiveInputStream(counter, filenameEncoding, true, true)) {
+            ZipArchiveEntry entry;
+            long totalBytes = 0;
+            long lastCompressedBytes = counter.getByteCount();
+            int entryCount = 0;
+            while ((entry = ais.getNextEntry()) != null) {
+                entryCount++;
+                if (maxEntries > 0 && entryCount > maxEntries) {
+                    throw new MaxLengthExceededException("zip entry count exceeded: count=" + entryCount + " max=" + maxEntries);
                 }
                 final String filename = entry.getName();
+                if (entry.isDirectory()) {
+                    continue;
+                }
+                if (isPathTraversal(filename)) {
+                    logger.warn("zip entry rejected: name={} reason=path-traversal", filename);
+                    continue;
+                }
+
+                // Read entry into bounded buffer while counting actual bytes.
+                final long actualBytes;
+                final byte[] entryBytes;
+                try {
+                    final long readLimit;
+                    if (maxBytes > 0) {
+                        readLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
+                    } else {
+                        readLimit = Long.MAX_VALUE;
+                    }
+                    final ByteArrayOutputStream out = new ByteArrayOutputStream();
+                    actualBytes = copyBounded(ais, out, readLimit);
+                    entryBytes = out.toByteArray();
+                } catch (final IOException ioe) {
+                    failedEntries++;
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("Failed to read zip entry: name={}", filename, ioe);
+                    }
+                    continue;
+                }
+
+                totalBytes += actualBytes;
+                if (maxBytes > 0 && totalBytes > maxBytes) {
+                    throw new MaxLengthExceededException("zip uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes);
+                }
+                if (maxContentSize != -1 && totalBytes > maxContentSize) {
+                    throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize);
+                }
+
+                // Compression-ratio check (only meaningful for non-tiny entries).
+                // Prefer the entry header's compressed size when present;
+                // otherwise fall back to the bytes actually consumed from the
+                // underlying stream during this entry's read.
+                long compressed = entry.getCompressedSize();
+                if (compressed <= 0) {
+                    final long now = counter.getByteCount();
+                    compressed = Math.max(0L, now - lastCompressedBytes);
+                    lastCompressedBytes = now;
+                } else {
+                    lastCompressedBytes = counter.getByteCount();
+                }
+                if (maxCompressionRatio > 0 && compressed > 0 && actualBytes > COMPRESSION_RATIO_MIN_BYTES
+                        && actualBytes / compressed > maxCompressionRatio) {
+                    throw new MaxLengthExceededException("zip compression ratio exceeded: name=" + filename + " ratio="
+                            + (actualBytes / compressed) + " max=" + maxCompressionRatio);
+                }
+
                 final String mimeType = mimeTypeHelper.getContentType(null, filename);
                 if (mimeType != null) {
                     final Extractor extractor = extractorFactory.getExtractor(mimeType);
                     if (extractor != null) {
                         try {
-                            final Map<String, String> map = new HashMap<>();
+                            final Map<String, String> map = incrementDepth(params);
                             map.put(ExtractData.RESOURCE_NAME_KEY, filename);
-                            buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
+                            buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
                             buf.append('\n');
                             processedEntries++;
+                        } catch (final MaxLengthExceededException e) {
+                            throw e;
                         } catch (final Exception e) {
                             failedEntries++;
                             if (logger.isDebugEnabled()) {
-                                logger.debug("Failed to extract content from archive entry: {}", filename, e);
+                                logger.debug("Failed to extract content from archive entry: name={}", filename, e);
                             }
                         }
                     }
@@ -105,13 +234,68 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 throw new ExtractException("Failed to extract content from ZIP archive. No entries could be processed.", e);
             }
             if (logger.isWarnEnabled()) {
-                logger.warn("Partial extraction from ZIP archive. Processed: {}, Failed: {}", processedEntries, failedEntries, e);
+                logger.warn("Partial extraction from ZIP archive. processed={} failed={}", processedEntries, failedEntries, e);
             }
         }
 
         return new ExtractData(buf.toString().trim());
     }
 
+    /**
+     * Returns true when the supplied entry name escapes the conceptual
+     * extraction root via path-traversal segments. The check is performed on
+     * a normalised form of the path.
+     *
+     * @param name the entry name as reported by the archive
+     * @return {@code true} if the name should be rejected
+     */
+    protected boolean isPathTraversal(final String name) {
+        if (name == null || name.isEmpty()) {
+            return true;
+        }
+        // Absolute paths (Unix or Windows-style) are unsafe in the
+        // context of an archive extracted into a sandbox root.
+        if (name.startsWith("/") || name.startsWith("\\")) {
+            return true;
+        }
+        if (name.length() >= 2 && name.charAt(1) == ':') {
+            return true;
+        }
+        try {
+            final Path normalised = Paths.get(name).normalize();
+            final String normStr = normalised.toString().replace('\\', '/');
+            if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
+                return true;
+            }
+            for (final Path part : normalised) {
+                if ("..".equals(part.toString())) {
+                    return true;
+                }
+            }
+        } catch (final InvalidPathException ipe) {
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Copies up to {@code limit} bytes from {@code in} to {@code out}. Returns
+     * the actual number of bytes copied.
+     */
+    private long copyBounded(final InputStream in, final ByteArrayOutputStream out, final long limit) throws IOException {
+        if (limit <= 0) {
+            return 0;
+        }
+        final byte[] buffer = new byte[8192];
+        long total = 0;
+        int read;
+        while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) {
+            out.write(buffer, 0, read);
+            total += read;
+        }
+        return total;
+    }
+
     /**
      * Sets the maximum content size.
      * @param maxContentSize The maximum content size to set.
@@ -119,4 +303,39 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
     public void setMaxContentSize(final long maxContentSize) {
         this.maxContentSize = maxContentSize;
     }
-}
\ No newline at end of file
+
+    /**
+     * Sets the cap on total uncompressed bytes read from all entries.
+     * @param maxBytes the maximum total bytes (use {@code -1} to disable)
+     */
+    public void setMaxBytes(final long maxBytes) {
+        this.maxBytes = maxBytes;
+    }
+
+    /**
+     * Sets the maximum permitted uncompressed/compressed ratio per entry.
+     * @param maxCompressionRatio the threshold (use {@code -1} to disable)
+     */
+    public void setMaxCompressionRatio(final long maxCompressionRatio) {
+        this.maxCompressionRatio = maxCompressionRatio;
+    }
+
+    /**
+     * Sets the maximum number of entries that may be iterated.
+     * @param maxEntries the maximum entry count (use {@code -1} to disable)
+     */
+    public void setMaxEntries(final int maxEntries) {
+        this.maxEntries = maxEntries;
+    }
+
+    /**
+     * Sets the filename encoding used to decode entry names that lack the
+     * UTF-8 flag (e.g. {@code "CP932"} / {@code "MS932"} for Japanese
+     * archives).
+     *
+     * @param filenameEncoding the charset name
+     */
+    public void setFilenameEncoding(final String filenameEncoding) {
+        this.filenameEncoding = filenameEncoding;
+    }
+}
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java
index 30e7db86..1a51483f 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java
@@ -17,10 +17,12 @@
 
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
+import java.util.HashMap;
 import java.util.Map;
 
 import org.codelibs.fess.crawler.entity.ExtractData;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
+import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
 import org.dbflute.utflute.core.PlainTestCase;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -64,6 +66,19 @@ public void resetTestState() {
         public void testValidateInputStream(final InputStream in) {
             validateInputStream(in);
         }
+
+        // Expose depth helpers for testing.
+        public int testGetCurrentDepth(final Map<String, String> params) {
+            return getCurrentDepth(params);
+        }
+
+        public Map<String, String> testIncrementDepth(final Map<String, String> params) {
+            return incrementDepth(params);
+        }
+
+        public void testCheckDepth(final Map<String, String> params, final int maxDepth) {
+            checkDepth(params, maxDepth);
+        }
     }
 
     private TestExtractor extractor;
@@ -243,4 +258,85 @@ public void test_validateInputStream_throwsCorrectExceptionType() {
             fail();
         }
     }
+
+    /** Recursion-depth helper: missing/null params return 0. */
+    @Test
+    public void test_getCurrentDepth_returnsZeroForMissing() {
+        assertEquals(0, extractor.testGetCurrentDepth(null));
+        assertEquals(0, extractor.testGetCurrentDepth(new HashMap<>()));
+        final Map<String, String> blank = new HashMap<>();
+        blank.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "");
+        assertEquals(0, extractor.testGetCurrentDepth(blank));
+        final Map<String, String> garbage = new HashMap<>();
+        garbage.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "not-a-number");
+        assertEquals(0, extractor.testGetCurrentDepth(garbage));
+    }
+
+    /** Recursion-depth helper: depth value is parsed and clamped to >= 0. */
+    @Test
+    public void test_getCurrentDepth_parsesValidValue() {
+        final Map<String, String> params = new HashMap<>();
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3");
+        assertEquals(3, extractor.testGetCurrentDepth(params));
+
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "-5");
+        assertEquals(0, extractor.testGetCurrentDepth(params));
+    }
+
+    /** incrementDepth must return a NEW map and not mutate the input. */
+    @Test
+    public void test_incrementDepth_returnsNewMap() {
+        final Map<String, String> original = new HashMap<>();
+        original.put("foo", "bar");
+        final Map<String, String> next = extractor.testIncrementDepth(original);
+
+        assertFalse(original == next);
+        // original is unchanged
+        assertFalse(original.containsKey(AbstractExtractor.EXTRACTOR_DEPTH_KEY));
+        assertEquals("bar", next.get("foo"));
+        assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY));
+
+        final Map<String, String> after = extractor.testIncrementDepth(next);
+        assertEquals("2", after.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY));
+        // first map still says "1"
+        assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY));
+    }
+
+    /** incrementDepth on null produces depth=1. */
+    @Test
+    public void test_incrementDepth_nullInput() {
+        final Map<String, String> next = extractor.testIncrementDepth(null);
+        assertNotNull(next);
+        assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY));
+    }
+
+    /** checkDepth allows depths below the limit. */
+    @Test
+    public void test_checkDepth_belowLimit_passes() {
+        final Map<String, String> params = new HashMap<>();
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3");
+        extractor.testCheckDepth(params, 10); // no throw
+        extractor.testCheckDepth(null, 10);
+    }
+
+    /** checkDepth rejects depths at or above the limit. */
+    @Test
+    public void test_checkDepth_atOrAboveLimit_throws() {
+        final Map<String, String> params = new HashMap<>();
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10");
+        try {
+            extractor.testCheckDepth(params, 10);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("recursion depth"));
+        }
+
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "99");
+        try {
+            extractor.testCheckDepth(params, 10);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            // pass
+        }
+    }
 }
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
new file mode 100644
index 00000000..1be37ccd
--- /dev/null
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
@@ -0,0 +1,457 @@
+/*
+ * Copyright 2012-2025 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.crawler.extractor.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.codelibs.fess.crawler.container.StandardCrawlerContainer;
+import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
+import org.codelibs.fess.crawler.extractor.ExtractorFactory;
+import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl;
+import org.dbflute.utflute.core.PlainTestCase;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestInfo;
+
+/**
+ * Security-oriented tests that validate the archive-bomb / recursion / Zip
+ * Slip / link-skipping defences added to the Zip / Tar / Lha extractors.
+ *
+ * <p>
+ * Synthetic archives are constructed in-memory with Apache Commons Compress
+ * so the tests are fully self-contained.
+ * </p>
+ */
+public class ArchiveExtractorSecurityTest extends PlainTestCase {
+
+    private ZipExtractor zipExtractor;
+    private TarExtractor tarExtractor;
+    private LhaExtractor lhaExtractor;
+
+    @Override
+    protected void setUp(final TestInfo testInfo) throws Exception {
+        super.setUp(testInfo);
+        final StandardCrawlerContainer container = new StandardCrawlerContainer();
+        container.singleton("archiveStreamFactory", ArchiveStreamFactory.class)
+                .singleton("compressorStreamFactory", CompressorStreamFactory.class)
+                .singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
+                .singleton("textExtractor", TextExtractor.class)
+                .singleton("zipExtractor", ZipExtractor.class)
+                .singleton("tarExtractor", TarExtractor.class)
+                .singleton("lhaExtractor", LhaExtractor.class)
+                .<ExtractorFactory> singleton("extractorFactory", ExtractorFactory.class, factory -> {
+                    final TextExtractor textExtractor = container.getComponent("textExtractor");
+                    final ZipExtractor zip = container.getComponent("zipExtractor");
+                    final TarExtractor tar = container.getComponent("tarExtractor");
+                    final LhaExtractor lha = container.getComponent("lhaExtractor");
+                    factory.addExtractor("text/plain", textExtractor);
+                    factory.addExtractor("application/zip", zip);
+                    factory.addExtractor("application/x-tar", tar);
+                    factory.addExtractor("application/x-lha", lha);
+                });
+
+        zipExtractor = container.getComponent("zipExtractor");
+        tarExtractor = container.getComponent("tarExtractor");
+        lhaExtractor = container.getComponent("lhaExtractor");
+    }
+
+    // ---------------------------------------------------------------------
+    // Helpers
+    // ---------------------------------------------------------------------
+
+    private byte[] buildZip(final EntrySpec... specs) throws IOException {
+        return buildZipWithCharset(StandardCharsets.UTF_8, specs);
+    }
+
+    private byte[] buildZipWithCharset(final Charset charset, final EntrySpec... specs) throws IOException {
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) {
+            zos.setEncoding(charset.name());
+            // Disable the UTF-8 flag so the encoding parameter is honoured by
+            // ZipArchiveInputStream during read.
+            zos.setUseLanguageEncodingFlag(false);
+            zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.NEVER);
+            for (final EntrySpec spec : specs) {
+                final ZipArchiveEntry entry = new ZipArchiveEntry(spec.name);
+                zos.putArchiveEntry(entry);
+                if (spec.content != null) {
+                    zos.write(spec.content);
+                }
+                zos.closeArchiveEntry();
+            }
+            zos.finish();
+        }
+        return baos.toByteArray();
+    }
+
+    private byte[] buildTar(final TarEntrySpec... specs) throws IOException {
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (TarArchiveOutputStream tos = new TarArchiveOutputStream(baos)) {
+            tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX);
+            for (final TarEntrySpec spec : specs) {
+                final TarArchiveEntry entry;
+                if (spec.linkType != 0) {
+                    entry = new TarArchiveEntry(spec.name, spec.linkType);
+                    if (spec.linkName != null) {
+                        entry.setLinkName(spec.linkName);
+                    }
+                } else {
+                    entry = new TarArchiveEntry(spec.name);
+                    entry.setSize(spec.content == null ? 0 : spec.content.length);
+                }
+                tos.putArchiveEntry(entry);
+                if (spec.linkType == 0 && spec.content != null) {
+                    tos.write(spec.content);
+                }
+                tos.closeArchiveEntry();
+            }
+            tos.finish();
+        }
+        return baos.toByteArray();
+    }
+
+    private static final class EntrySpec {
+        final String name;
+        final byte[] content;
+
+        EntrySpec(final String name, final byte[] content) {
+            this.name = name;
+            this.content = content;
+        }
+    }
+
+    private static final class TarEntrySpec {
+        final String name;
+        final byte[] content;
+        final byte linkType;
+        final String linkName;
+
+        TarEntrySpec(final String name, final byte[] content) {
+            this(name, content, (byte) 0, null);
+        }
+
+        TarEntrySpec(final String name, final byte[] content, final byte linkType, final String linkName) {
+            this.name = name;
+            this.content = content;
+            this.linkType = linkType;
+            this.linkName = linkName;
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Zip — byte-limit bomb
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zipBomb_byteLimit() throws Exception {
+        final byte[] payload = new byte[64 * 1024];
+        final byte[] data = buildZip(new EntrySpec("a.txt", payload), new EntrySpec("b.txt", payload), new EntrySpec("c.txt", payload));
+
+        zipExtractor.setMaxBytes(64 * 1024); // exactly one entry's worth -> 2nd should fail
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("zip uncompressed size exceeded"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Zip — many-entry bomb
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zipBomb_entryLimit() throws Exception {
+        final EntrySpec[] specs = new EntrySpec[20];
+        for (int i = 0; i < specs.length; i++) {
+            specs[i] = new EntrySpec("e" + i + ".txt", new byte[0]);
+        }
+        final byte[] data = buildZip(specs);
+
+        zipExtractor.setMaxEntries(5);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("zip entry count exceeded"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Zip — Zip Slip path traversal
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zipSlip_pathTraversal() throws Exception {
+        final byte[] data = buildZip(new EntrySpec("../../etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)),
+                new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8)));
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = zipExtractor.getText(in, null).getContent();
+            // Bad entry must be skipped; good entry must still be processed.
+            assertFalse(content.contains("evil"));
+            assertTrue(content.contains("good"));
+        }
+    }
+
+    @Test
+    public void test_zipSlip_absolutePath() throws Exception {
+        final byte[] data = buildZip(new EntrySpec("/etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)),
+                new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8)));
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = zipExtractor.getText(in, null).getContent();
+            assertFalse(content.contains("evil"));
+            assertTrue(content.contains("good"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Recursion-depth bomb
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_recursionDepth_exceeded() throws Exception {
+        final byte[] data = buildZip(new EntrySpec("ok.txt", "hello".getBytes(StandardCharsets.UTF_8)));
+        final Map<String, String> params = new HashMap<>();
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); // == default max
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, params);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().toLowerCase().contains("recursion"));
+        }
+    }
+
+    @Test
+    public void test_recursionDepth_belowLimit_succeeds() throws Exception {
+        final byte[] data = buildZip(new EntrySpec("ok.txt", "hello".getBytes(StandardCharsets.UTF_8)));
+        final Map<String, String> params = new HashMap<>();
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3");
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = zipExtractor.getText(in, params).getContent();
+            assertTrue(content.contains("hello"));
+        }
+        // Original params must be unchanged.
+        assertEquals("3", params.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY));
+    }
+
+    // ---------------------------------------------------------------------
+    // CP932 / non-UTF-8 filename encoding
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_cp932Filename() throws Exception {
+        final Charset cp932;
+        try {
+            cp932 = Charset.forName("MS932");
+        } catch (final Exception e) {
+            // CP932/MS932 not available on this JVM; skip.
+            return;
+        }
+
+        final byte[] data = buildZipWithCharset(cp932, new EntrySpec("テスト.txt", "japan".getBytes(StandardCharsets.UTF_8)));
+
+        // Default UTF-8 encoding may mojibake the filename, but once we set
+        // CP932 the filename should round-trip cleanly. We assert by
+        // inspecting the entry list directly via the public API: setting the
+        // proper encoding allows the .txt suffix to be detected and the
+        // entry's content extracted.
+        zipExtractor.setFilenameEncoding("MS932");
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = zipExtractor.getText(in, null).getContent();
+            assertTrue(content.contains("japan"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Tar — symlink / hardlink entries are skipped
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_tar_symlinkSkipped() throws Exception {
+        final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)),
+                new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_SYMLINK, "/etc/passwd"));
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = tarExtractor.getText(in, null).getContent();
+            assertTrue(content.contains("regular"));
+            // Symlink target text must NOT leak into the output.
+            assertFalse(content.contains("/etc/passwd"));
+        }
+    }
+
+    @Test
+    public void test_tar_hardlinkSkipped() throws Exception {
+        final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)),
+                new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_LINK, "ok.txt"));
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = tarExtractor.getText(in, null).getContent();
+            assertTrue(content.contains("regular"));
+            // The hardlink should not have introduced a duplicate of the
+            // referenced entry's content.
+            assertEquals(content.indexOf("regular"), content.lastIndexOf("regular"));
+        }
+    }
+
+    @Test
+    public void test_tar_pathTraversal() throws Exception {
+        final byte[] data = buildTar(new TarEntrySpec("../../etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)),
+                new TarEntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8)));
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = tarExtractor.getText(in, null).getContent();
+            assertFalse(content.contains("evil"));
+            assertTrue(content.contains("good"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Compression-ratio bomb — produce a highly-compressible big entry
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_compressionRatioExceeded() throws Exception {
+        // 2 MiB of zeroes compresses extremely well, well above the 100:1
+        // default threshold. Build the entry with explicit method/size/crc so
+        // the local file header carries the compressed size (otherwise a
+        // streaming DEFLATED entry uses a data descriptor, leaving
+        // ZipArchiveEntry#getCompressedSize() as -1 and bypassing the ratio
+        // check).
+        final byte[] payload = new byte[2 * 1024 * 1024];
+        final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION);
+        def.setInput(payload);
+        def.finish();
+        final ByteArrayOutputStream compBuf = new ByteArrayOutputStream();
+        final byte[] tmpBuf = new byte[8192];
+        while (!def.finished()) {
+            final int n = def.deflate(tmpBuf);
+            compBuf.write(tmpBuf, 0, n);
+        }
+        def.end();
+        final java.util.zip.CRC32 crc = new java.util.zip.CRC32();
+        crc.update(payload);
+
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) {
+            final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt");
+            entry.setMethod(ZipArchiveEntry.DEFLATED);
+            entry.setSize(payload.length);
+            entry.setCompressedSize(compBuf.size());
+            entry.setCrc(crc.getValue());
+            zos.putArchiveEntry(entry);
+            zos.write(payload);
+            zos.closeArchiveEntry();
+            zos.finish();
+        }
+        final byte[] data = baos.toByteArray();
+
+        // Disable the byte cap so the compression-ratio check is the one that
+        // fires.
+        zipExtractor.setMaxBytes(-1);
+        zipExtractor.setMaxContentSize(-1);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("compression ratio") || e.getMessage().contains("uncompressed size"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Tar byte/entry limits
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_tarBomb_byteLimit() throws Exception {
+        final byte[] payload = new byte[64 * 1024];
+        final byte[] data = buildTar(new TarEntrySpec("a.txt", payload), new TarEntrySpec("b.txt", payload));
+
+        tarExtractor.setMaxBytes(64 * 1024);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            tarExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("tar uncompressed size exceeded"));
+        }
+    }
+
+    @Test
+    public void test_tarBomb_entryLimit() throws Exception {
+        final TarEntrySpec[] specs = new TarEntrySpec[20];
+        for (int i = 0; i < specs.length; i++) {
+            specs[i] = new TarEntrySpec("e" + i + ".txt", new byte[0]);
+        }
+        final byte[] data = buildTar(specs);
+
+        tarExtractor.setMaxEntries(5);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            tarExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("tar entry count exceeded"));
+        }
+    }
+
+    @Test
+    public void test_tar_recursionDepth_exceeded() throws Exception {
+        final byte[] data = buildTar(new TarEntrySpec("ok.txt", "hi".getBytes(StandardCharsets.UTF_8)));
+        final Map<String, String> params = new HashMap<>();
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10");
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            tarExtractor.getText(in, params);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().toLowerCase().contains("recursion"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Lha recursion-depth check (uses isPathTraversal helper too)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_lha_recursionDepth_exceeded() {
+        final Map<String, String> params = new HashMap<>();
+        params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10");
+        // We pass a tiny non-archive stream; the depth check fires before
+        // the LHA library is invoked.
+        try (InputStream in = new ByteArrayInputStream("dummy".getBytes(StandardCharsets.UTF_8))) {
+            lhaExtractor.getText(in, params);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().toLowerCase().contains("recursion"));
+        } catch (final IOException e) {
+            fail();
+        }
+    }
+}

From 102b22fde5b28965b215fa11adf3c3bf7b5b1ebc Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke@apache.org>
Date: Tue, 5 May 2026 12:39:26 +0900
Subject: [PATCH 2/6] fix(extractor): add per-entry size cap and dedupe Zip
 Slip / copy-bounded helpers

---
 .../extractor/impl/AbstractExtractor.java     | 78 +++++++++++++++
 .../crawler/extractor/impl/LhaExtractor.java  | 62 +++++-------
 .../crawler/extractor/impl/TarExtractor.java  | 83 ++++++----------
 .../crawler/extractor/impl/ZipExtractor.java  | 96 +++++++------------
 .../impl/ArchiveExtractorSecurityTest.java    | 53 ++++++++++
 5 files changed, 217 insertions(+), 155 deletions(-)

diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
index bb30c235..7a2eb122 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
@@ -18,10 +18,15 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.InvalidPathException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.commons.io.IOUtils;
 import org.codelibs.fess.crawler.container.CrawlerContainer;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
@@ -229,4 +234,77 @@ protected void validateInputStream(final InputStream in) {
             throw new CrawlerSystemException("The inputstream is null.");
         }
     }
+
+    /**
+     * Returns true when the supplied entry name escapes the conceptual
+     * extraction root via path-traversal segments. The check is performed on
+     * a normalised form of the path and is shared between the archive
+     * extractors (Zip / Tar / Lha) so the rejection rules stay in lock step.
+     *
+     * <p>
+     * An entry is rejected when it is null/empty, when it is rooted at
+     * {@code /} or {@code \}, when it begins with a Windows drive letter
+     * (e.g. {@code C:}), when its normalised form contains a {@code ..}
+     * segment, or when {@link Paths#get} treats it as malformed.
+     * </p>
+     *
+     * @param name the entry name as reported by the archive
+     * @return {@code true} if the name should be rejected
+     */
+    protected static boolean isPathTraversal(final String name) {
+        if (name == null || name.isEmpty()) {
+            return true;
+        }
+        // Absolute paths (Unix or Windows-style) are unsafe in the
+        // context of an archive extracted into a sandbox root.
+        if (name.startsWith("/") || name.startsWith("\\")) {
+            return true;
+        }
+        if (name.length() >= 2 && name.charAt(1) == ':') {
+            return true;
+        }
+        try {
+            final Path normalised = Paths.get(name).normalize();
+            final String normStr = normalised.toString().replace('\\', '/');
+            if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
+                return true;
+            }
+            for (final Path part : normalised) {
+                if ("..".equals(part.toString())) {
+                    return true;
+                }
+            }
+        } catch (final InvalidPathException ipe) {
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Copies up to {@code limit} bytes from {@code in} to {@code out}, returning
+     * the actual number of bytes copied. Used by archive extractors to bound
+     * the amount of memory consumed when buffering an entry's uncompressed
+     * payload.
+     *
+     * @param in the source stream
+     * @param out the sink stream
+     * @param limit the maximum number of bytes to copy (inclusive). Values
+     *              {@code <= 0} cause the method to return without reading.
+     * @return the number of bytes actually copied
+     * @throws IOException if reading from {@code in} or writing to {@code out}
+     *                     fails
+     */
+    protected static long copyBounded(final InputStream in, final OutputStream out, final long limit) throws IOException {
+        if (limit <= 0) {
+            return 0;
+        }
+        final byte[] buffer = new byte[8192];
+        long total = 0;
+        int read;
+        while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) {
+            out.write(buffer, 0, read);
+            total += read;
+        }
+        return total;
+    }
 }
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
index 70e2cced..b6668fd3 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
@@ -19,9 +19,6 @@
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.InvalidPathException;
-import java.nio.file.Path;
-import java.nio.file.Paths;
 import java.util.Enumeration;
 import java.util.Map;
 
@@ -67,6 +64,14 @@ public class LhaExtractor extends AbstractExtractor {
      */
     protected long maxBytes = 1L << 31;
 
+    /**
+     * Maximum uncompressed bytes for a SINGLE entry (as reported by the LHA
+     * header). Guards against an oversized entry being routed to a buffered
+     * downstream extractor. Defaults to 256 MiB. Set to {@code -1} to
+     * disable. Enforced independently of {@link #maxBytes}.
+     */
+    protected long maxBytesPerEntry = 256L * 1024L * 1024L;
+
     /**
      * Maximum allowed number of entries to iterate. Defaults to 100,000.
      * Set to {@code -1} to disable.
@@ -120,7 +125,12 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 if (maxEntries > 0 && entryCount > maxEntries) {
                     throw new MaxLengthExceededException("lha entry count exceeded: count=" + entryCount + " max=" + maxEntries);
                 }
-                contentSize += head.getOriginalSize();
+                final long entrySize = head.getOriginalSize();
+                if (maxBytesPerEntry > 0 && entrySize > maxBytesPerEntry) {
+                    throw new MaxLengthExceededException(
+                            "lha per-entry size exceeded: name=" + head.getPath() + " size=" + entrySize + " max=" + maxBytesPerEntry);
+                }
+                contentSize += entrySize;
                 if (maxBytes > 0 && contentSize > maxBytes) {
                     throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + contentSize + " max=" + maxBytes);
                 }
@@ -173,40 +183,6 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
         return new ExtractData(buf.toString().trim());
     }
 
-    /**
-     * Returns true when the supplied entry name escapes the conceptual
-     * extraction root via path-traversal segments.
-     *
-     * @param name the entry name as reported by the archive
-     * @return {@code true} if the name should be rejected
-     */
-    protected boolean isPathTraversal(final String name) {
-        if (name == null || name.isEmpty()) {
-            return true;
-        }
-        if (name.startsWith("/") || name.startsWith("\\")) {
-            return true;
-        }
-        if (name.length() >= 2 && name.charAt(1) == ':') {
-            return true;
-        }
-        try {
-            final Path normalised = Paths.get(name).normalize();
-            final String normStr = normalised.toString().replace('\\', '/');
-            if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
-                return true;
-            }
-            for (final Path part : normalised) {
-                if ("..".equals(part.toString())) {
-                    return true;
-                }
-            }
-        } catch (final InvalidPathException ipe) {
-            return true;
-        }
-        return false;
-    }
-
     /**
      * Sets the maximum content size for extraction.
      *
@@ -224,6 +200,16 @@ public void setMaxBytes(final long maxBytes) {
         this.maxBytes = maxBytes;
     }
 
+    /**
+     * Sets the per-entry cap on the original (uncompressed) size reported by
+     * the LHA header. Set to {@code -1} to disable.
+     *
+     * @param maxBytesPerEntry the per-entry maximum
+     */
+    public void setMaxBytesPerEntry(final long maxBytesPerEntry) {
+        this.maxBytesPerEntry = maxBytesPerEntry;
+    }
+
     /**
      * Sets the maximum number of entries that may be iterated.
      * @param maxEntries the maximum entry count (use {@code -1} to disable)
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
index 35f4a8f0..0839b2c1 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
@@ -19,15 +19,11 @@
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.InvalidPathException;
-import java.nio.file.Path;
-import java.nio.file.Paths;
 import java.util.Map;
 
 import org.apache.commons.compress.archivers.ArchiveInputStream;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
-import org.apache.commons.io.IOUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.codelibs.fess.crawler.entity.ExtractData;
@@ -69,6 +65,14 @@ public class TarExtractor extends AbstractExtractor {
      */
     protected long maxBytes = 1L << 31;
 
+    /**
+     * Maximum uncompressed bytes that may be buffered for a SINGLE entry.
+     * Guards against an oversized entry exhausting the JVM heap when
+     * buffered into memory. Defaults to 256 MiB. Set to {@code -1} to
+     * disable. Enforced independently of {@link #maxBytes}.
+     */
+    protected long maxBytesPerEntry = 256L * 1024L * 1024L;
+
     /**
      * Maximum allowed number of entries to iterate. Defaults to 100,000.
      * Set to {@code -1} to disable.
@@ -134,12 +138,14 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                 final long actualBytes;
                 final byte[] entryBytes;
                 try {
-                    final long readLimit;
+                    final long totalReadLimit;
                     if (maxBytes > 0) {
-                        readLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
+                        totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
                     } else {
-                        readLimit = Long.MAX_VALUE;
+                        totalReadLimit = Long.MAX_VALUE;
                     }
+                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
+                    final long readLimit = Math.min(totalReadLimit, perEntryReadLimit);
                     final ByteArrayOutputStream out = new ByteArrayOutputStream();
                     actualBytes = copyBounded(ais, out, readLimit);
                     entryBytes = out.toByteArray();
@@ -151,6 +157,11 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                     continue;
                 }
 
+                if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) {
+                    throw new MaxLengthExceededException(
+                            "tar per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry);
+                }
+
                 totalBytes += actualBytes;
                 if (maxBytes > 0 && totalBytes > maxBytes) {
                     throw new MaxLengthExceededException("tar uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes);
@@ -194,54 +205,6 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
         return buf.toString().trim();
     }
 
-    /**
-     * Returns true when the supplied entry name escapes the conceptual
-     * extraction root via path-traversal segments.
-     *
-     * @param name the entry name as reported by the archive
-     * @return {@code true} if the name should be rejected
-     */
-    protected boolean isPathTraversal(final String name) {
-        if (name == null || name.isEmpty()) {
-            return true;
-        }
-        if (name.startsWith("/") || name.startsWith("\\")) {
-            return true;
-        }
-        if (name.length() >= 2 && name.charAt(1) == ':') {
-            return true;
-        }
-        try {
-            final Path normalised = Paths.get(name).normalize();
-            final String normStr = normalised.toString().replace('\\', '/');
-            if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
-                return true;
-            }
-            for (final Path part : normalised) {
-                if ("..".equals(part.toString())) {
-                    return true;
-                }
-            }
-        } catch (final InvalidPathException ipe) {
-            return true;
-        }
-        return false;
-    }
-
-    private long copyBounded(final InputStream in, final ByteArrayOutputStream out, final long limit) throws IOException {
-        if (limit <= 0) {
-            return 0;
-        }
-        final byte[] buffer = new byte[8192];
-        long total = 0;
-        int read;
-        while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) {
-            out.write(buffer, 0, read);
-            total += read;
-        }
-        return total;
-    }
-
     /**
      * Sets the maximum content size.
      * @param maxContentSize The maximum content size to set.
@@ -258,6 +221,16 @@ public void setMaxBytes(final long maxBytes) {
         this.maxBytes = maxBytes;
     }
 
+    /**
+     * Sets the per-entry cap on uncompressed bytes buffered in memory. Set
+     * to {@code -1} to disable.
+     *
+     * @param maxBytesPerEntry the per-entry maximum
+     */
+    public void setMaxBytesPerEntry(final long maxBytesPerEntry) {
+        this.maxBytesPerEntry = maxBytesPerEntry;
+    }
+
     /**
      * Sets the maximum number of entries that may be iterated.
      * @param maxEntries the maximum entry count (use {@code -1} to disable)
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
index ee4a9370..eeff5326 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
@@ -20,14 +20,10 @@
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.InvalidPathException;
-import java.nio.file.Path;
-import java.nio.file.Paths;
 import java.util.Map;
 
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CountingInputStream;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -76,6 +72,15 @@ public class ZipExtractor extends AbstractExtractor {
      */
     protected long maxBytes = 1L << 31;
 
+    /**
+     * Maximum uncompressed bytes that may be buffered for a SINGLE entry.
+     * This guards against a legitimate-looking but oversized entry (e.g. a
+     * 1.9 GiB file inside an otherwise small archive) exhausting the JVM
+     * heap when buffered into memory. Defaults to 256 MiB. Set to
+     * {@code -1} to disable. Enforced independently of {@link #maxBytes}.
+     */
+    protected long maxBytesPerEntry = 256L * 1024L * 1024L;
+
     /**
      * Maximum allowed compression ratio (uncompressed / compressed). Entries
      * exceeding this ratio AND larger than 1 MiB are rejected as suspected
@@ -163,12 +168,19 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 final long actualBytes;
                 final byte[] entryBytes;
                 try {
-                    final long readLimit;
+                    final long totalReadLimit;
                     if (maxBytes > 0) {
-                        readLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
+                        totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
                     } else {
-                        readLimit = Long.MAX_VALUE;
+                        totalReadLimit = Long.MAX_VALUE;
                     }
+                    // Enforce a per-entry cap independently of the total
+                    // cap so that a single oversized entry cannot exhaust
+                    // the JVM heap. We read one byte beyond the cap so the
+                    // explicit overflow check below can distinguish
+                    // "exactly at the cap" from "exceeds the cap".
+                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
+                    final long readLimit = Math.min(totalReadLimit, perEntryReadLimit);
                     final ByteArrayOutputStream out = new ByteArrayOutputStream();
                     actualBytes = copyBounded(ais, out, readLimit);
                     entryBytes = out.toByteArray();
@@ -180,6 +192,11 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     continue;
                 }
 
+                if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) {
+                    throw new MaxLengthExceededException(
+                            "zip per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry);
+                }
+
                 totalBytes += actualBytes;
                 if (maxBytes > 0 && totalBytes > maxBytes) {
                     throw new MaxLengthExceededException("zip uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes);
@@ -241,61 +258,6 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
         return new ExtractData(buf.toString().trim());
     }
 
-    /**
-     * Returns true when the supplied entry name escapes the conceptual
-     * extraction root via path-traversal segments. The check is performed on
-     * a normalised form of the path.
-     *
-     * @param name the entry name as reported by the archive
-     * @return {@code true} if the name should be rejected
-     */
-    protected boolean isPathTraversal(final String name) {
-        if (name == null || name.isEmpty()) {
-            return true;
-        }
-        // Absolute paths (Unix or Windows-style) are unsafe in the
-        // context of an archive extracted into a sandbox root.
-        if (name.startsWith("/") || name.startsWith("\\")) {
-            return true;
-        }
-        if (name.length() >= 2 && name.charAt(1) == ':') {
-            return true;
-        }
-        try {
-            final Path normalised = Paths.get(name).normalize();
-            final String normStr = normalised.toString().replace('\\', '/');
-            if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
-                return true;
-            }
-            for (final Path part : normalised) {
-                if ("..".equals(part.toString())) {
-                    return true;
-                }
-            }
-        } catch (final InvalidPathException ipe) {
-            return true;
-        }
-        return false;
-    }
-
-    /**
-     * Copies up to {@code limit} bytes from {@code in} to {@code out}. Returns
-     * the actual number of bytes copied.
-     */
-    private long copyBounded(final InputStream in, final ByteArrayOutputStream out, final long limit) throws IOException {
-        if (limit <= 0) {
-            return 0;
-        }
-        final byte[] buffer = new byte[8192];
-        long total = 0;
-        int read;
-        while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) {
-            out.write(buffer, 0, read);
-            total += read;
-        }
-        return total;
-    }
-
     /**
      * Sets the maximum content size.
      * @param maxContentSize The maximum content size to set.
@@ -312,6 +274,16 @@ public void setMaxBytes(final long maxBytes) {
         this.maxBytes = maxBytes;
     }
 
+    /**
+     * Sets the per-entry cap on uncompressed bytes buffered in memory. Set
+     * to {@code -1} to disable.
+     *
+     * @param maxBytesPerEntry the per-entry maximum
+     */
+    public void setMaxBytesPerEntry(final long maxBytesPerEntry) {
+        this.maxBytesPerEntry = maxBytesPerEntry;
+    }
+
     /**
      * Sets the maximum permitted uncompressed/compressed ratio per entry.
      * @param maxCompressionRatio the threshold (use {@code -1} to disable)
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
index 1be37ccd..97184a5f 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
@@ -439,6 +439,59 @@ public void test_tar_recursionDepth_exceeded() throws Exception {
     // Lha recursion-depth check (uses isPathTraversal helper too)
     // ---------------------------------------------------------------------
 
+    // ---------------------------------------------------------------------
+    // Per-entry size cap — guards against a single oversized entry
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_perEntryCapEnforced() throws Exception {
+        // Build a zip whose single entry holds 300 MiB of (highly
+        // compressible) zeroes. The extractor must trip the per-entry cap
+        // before buffering the entire 300 MiB into memory.
+        final int entrySize = 300 * 1024 * 1024;
+        final byte[] payload = new byte[entrySize];
+
+        final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION);
+        def.setInput(payload);
+        def.finish();
+        final ByteArrayOutputStream compBuf = new ByteArrayOutputStream();
+        final byte[] tmpBuf = new byte[8192];
+        while (!def.finished()) {
+            final int n = def.deflate(tmpBuf);
+            compBuf.write(tmpBuf, 0, n);
+        }
+        def.end();
+        final java.util.zip.CRC32 crc = new java.util.zip.CRC32();
+        crc.update(payload);
+
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) {
+            final ZipArchiveEntry entry = new ZipArchiveEntry("big.bin");
+            entry.setMethod(ZipArchiveEntry.DEFLATED);
+            entry.setSize(payload.length);
+            entry.setCompressedSize(compBuf.size());
+            entry.setCrc(crc.getValue());
+            zos.putArchiveEntry(entry);
+            zos.write(payload);
+            zos.closeArchiveEntry();
+            zos.finish();
+        }
+        final byte[] data = baos.toByteArray();
+
+        // Disable the total-size and ratio checks so only the per-entry cap
+        // can trigger. Default per-entry cap (256 MiB) must reject the
+        // 300 MiB entry.
+        zipExtractor.setMaxBytes(-1);
+        zipExtractor.setMaxContentSize(-1);
+        zipExtractor.setMaxCompressionRatio(-1);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("per-entry size exceeded"));
+        }
+    }
+
     @Test
     public void test_lha_recursionDepth_exceeded() {
         final Map<String, String> params = new HashMap<>();

From 9de64ae94923beca5837c60cf4e5136ae749cbf1 Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke@apache.org>
Date: Tue, 5 May 2026 19:52:38 +0900
Subject: [PATCH 3/6] fix(extractor): bound LHA input staging and per-entry
 actual bytes

- LhaExtractor now stages the input archive through copyBounded with a
  configurable maxInputBytes cap (default 1 GiB) so a hostile producer
  cannot fill local storage before LhaFile is opened.
- The per-entry size cap is enforced against bytes actually decompressed
  (mirroring Zip/Tar) instead of the attacker-controlled
  LhaHeader#getOriginalSize. Entries are now buffered into a bounded
  ByteArrayOutputStream and forwarded to downstream extractors via
  ByteArrayInputStream, dropping the unbounded IgnoreCloseInputStream
  hand-off.
- Down-scale test_perEntryCapEnforced (300 MiB -> 2 MiB payload with a
  1 MiB cap) so it stays cheap on parallel/low-memory CI, and add
  test_lha_maxInputBytes_capsStaging covering the new input staging
  cap.
---
 .../crawler/extractor/impl/LhaExtractor.java  | 108 +++++++++++++-----
 .../impl/ArchiveExtractorSecurityTest.java    |  31 ++++-
 2 files changed, 106 insertions(+), 33 deletions(-)

diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
index b6668fd3..b28d59df 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
@@ -15,6 +15,8 @@
  */
 package org.codelibs.fess.crawler.extractor.impl;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
@@ -25,7 +27,6 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.codelibs.core.io.CloseableUtil;
-import org.codelibs.core.io.CopyUtil;
 import org.codelibs.core.io.FileUtil;
 import org.codelibs.fess.crawler.entity.ExtractData;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
@@ -34,7 +35,6 @@
 import org.codelibs.fess.crawler.extractor.Extractor;
 import org.codelibs.fess.crawler.extractor.ExtractorFactory;
 import org.codelibs.fess.crawler.helper.MimeTypeHelper;
-import org.codelibs.fess.crawler.util.IgnoreCloseInputStream;
 
 import jp.gr.java_conf.dangan.util.lha.LhaFile;
 import jp.gr.java_conf.dangan.util.lha.LhaHeader;
@@ -65,13 +65,23 @@ public class LhaExtractor extends AbstractExtractor {
     protected long maxBytes = 1L << 31;
 
     /**
-     * Maximum uncompressed bytes for a SINGLE entry (as reported by the LHA
-     * header). Guards against an oversized entry being routed to a buffered
-     * downstream extractor. Defaults to 256 MiB. Set to {@code -1} to
-     * disable. Enforced independently of {@link #maxBytes}.
+     * Maximum uncompressed bytes that may be buffered for a SINGLE entry.
+     * Enforced against the actual bytes read from the entry stream (NOT the
+     * header-reported size, which is attacker-controlled). Defaults to
+     * 256 MiB. Set to {@code -1} to disable. Enforced independently of
+     * {@link #maxBytes}.
      */
     protected long maxBytesPerEntry = 256L * 1024L * 1024L;
 
+    /**
+     * Maximum bytes copied from the input stream to the local temporary file
+     * before {@link LhaFile} is opened. The LHA library requires a seekable
+     * file, so the entire archive must be staged on disk; this cap prevents a
+     * hostile producer from filling local storage. Defaults to 1 GiB. Set to
+     * {@code -1} to disable.
+     */
+    protected long maxInputBytes = 1L << 30;
+
     /**
      * Maximum allowed number of entries to iterate. Defaults to 100,000.
      * Set to {@code -1} to disable.
@@ -111,13 +121,20 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
         try {
             tempFile = createTempFile("crawler-", ".lzh", null);
             try (FileOutputStream fos = new FileOutputStream(tempFile)) {
-                CopyUtil.copy(in, fos);
+                // Stage the (untrusted) archive bytes to disk under a hard
+                // cap so a hostile producer cannot exhaust local storage by
+                // streaming an arbitrarily large body.
+                final long inputReadLimit = maxInputBytes > 0 ? maxInputBytes + 1L : Long.MAX_VALUE;
+                final long staged = copyBounded(in, fos, inputReadLimit);
+                if (maxInputBytes > 0 && staged > maxInputBytes) {
+                    throw new MaxLengthExceededException("lha input size exceeded: bytes=" + staged + " max=" + maxInputBytes);
+                }
             }
 
             lhaFile = new LhaFile(tempFile);
             @SuppressWarnings("unchecked")
             final Enumeration<LhaHeader> entries = lhaFile.entries();
-            long contentSize = 0;
+            long totalBytes = 0;
             int entryCount = 0;
             while (entries.hasMoreElements()) {
                 final LhaHeader head = entries.nextElement();
@@ -125,33 +142,61 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 if (maxEntries > 0 && entryCount > maxEntries) {
                     throw new MaxLengthExceededException("lha entry count exceeded: count=" + entryCount + " max=" + maxEntries);
                 }
-                final long entrySize = head.getOriginalSize();
-                if (maxBytesPerEntry > 0 && entrySize > maxBytesPerEntry) {
-                    throw new MaxLengthExceededException(
-                            "lha per-entry size exceeded: name=" + head.getPath() + " size=" + entrySize + " max=" + maxBytesPerEntry);
-                }
-                contentSize += entrySize;
-                if (maxBytes > 0 && contentSize > maxBytes) {
-                    throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + contentSize + " max=" + maxBytes);
-                }
-                if (maxContentSize != -1 && contentSize > maxContentSize) {
-                    throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
-                }
                 final String filename = head.getPath();
                 if (isPathTraversal(filename)) {
                     logger.warn("lha entry rejected: name={} reason=path-traversal", filename);
                     continue;
                 }
+
+                // Read the entry payload through copyBounded so the cap is
+                // enforced against bytes actually decompressed, not the
+                // header-reported size (which is attacker-controlled).
+                final long actualBytes;
+                final byte[] entryBytes;
+                InputStream is = null;
+                try {
+                    is = lhaFile.getInputStream(head);
+                    final long totalReadLimit;
+                    if (maxBytes > 0) {
+                        totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
+                    } else {
+                        totalReadLimit = Long.MAX_VALUE;
+                    }
+                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
+                    final long readLimit = Math.min(totalReadLimit, perEntryReadLimit);
+                    final ByteArrayOutputStream out = new ByteArrayOutputStream();
+                    actualBytes = copyBounded(is, out, readLimit);
+                    entryBytes = out.toByteArray();
+                } catch (final IOException ioe) {
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("Failed to read lha entry: name={}", filename, ioe);
+                    }
+                    continue;
+                } finally {
+                    CloseableUtil.closeQuietly(is);
+                }
+
+                if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) {
+                    throw new MaxLengthExceededException(
+                            "lha per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry);
+                }
+
+                totalBytes += actualBytes;
+                if (maxBytes > 0 && totalBytes > maxBytes) {
+                    throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes);
+                }
+                if (maxContentSize != -1 && totalBytes > maxContentSize) {
+                    throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize);
+                }
+
                 final String mimeType = mimeTypeHelper.getContentType(null, filename);
                 if (mimeType != null) {
                     final Extractor extractor = extractorFactory.getExtractor(mimeType);
                     if (extractor != null) {
-                        InputStream is = null;
                         try {
-                            is = lhaFile.getInputStream(head);
                             final Map<String, String> map = incrementDepth(params);
                             map.put(ExtractData.RESOURCE_NAME_KEY, filename);
-                            buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent());
+                            buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
                             buf.append('\n');
                         } catch (final MaxLengthExceededException e) {
                             throw e;
@@ -159,8 +204,6 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                             if (logger.isDebugEnabled()) {
                                 logger.debug("Exception in an internal extractor.", e);
                             }
-                        } finally {
-                            CloseableUtil.closeQuietly(is);
                         }
                     }
                 }
@@ -201,8 +244,9 @@ public void setMaxBytes(final long maxBytes) {
     }
 
     /**
-     * Sets the per-entry cap on the original (uncompressed) size reported by
-     * the LHA header. Set to {@code -1} to disable.
+     * Sets the per-entry cap on uncompressed bytes buffered in memory. The
+     * cap is enforced against bytes actually decompressed (not the
+     * header-reported size). Set to {@code -1} to disable.
      *
      * @param maxBytesPerEntry the per-entry maximum
      */
@@ -210,6 +254,16 @@ public void setMaxBytesPerEntry(final long maxBytesPerEntry) {
         this.maxBytesPerEntry = maxBytesPerEntry;
     }
 
+    /**
+     * Sets the cap on the number of input bytes staged to a temporary file
+     * before {@link LhaFile} is opened. Set to {@code -1} to disable.
+     *
+     * @param maxInputBytes the input-stage maximum
+     */
+    public void setMaxInputBytes(final long maxInputBytes) {
+        this.maxInputBytes = maxInputBytes;
+    }
+
     /**
      * Sets the maximum number of entries that may be iterated.
      * @param maxEntries the maximum entry count (use {@code -1} to disable)
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
index 97184a5f..6c6ba2b4 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
@@ -445,10 +445,12 @@ public void test_tar_recursionDepth_exceeded() throws Exception {
 
     @Test
     public void test_perEntryCapEnforced() throws Exception {
-        // Build a zip whose single entry holds 300 MiB of (highly
-        // compressible) zeroes. The extractor must trip the per-entry cap
-        // before buffering the entire 300 MiB into memory.
-        final int entrySize = 300 * 1024 * 1024;
+        // Build a zip whose single entry exceeds the configured per-entry
+        // cap. The extractor must trip the cap before buffering the whole
+        // payload. We use a small cap (1 MiB) and a slightly larger payload
+        // (2 MiB) so the test stays cheap on parallel/low-memory CI.
+        final int perEntryCap = 1024 * 1024;
+        final int entrySize = 2 * perEntryCap;
         final byte[] payload = new byte[entrySize];
 
         final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION);
@@ -479,11 +481,11 @@ public void test_perEntryCapEnforced() throws Exception {
         final byte[] data = baos.toByteArray();
 
         // Disable the total-size and ratio checks so only the per-entry cap
-        // can trigger. Default per-entry cap (256 MiB) must reject the
-        // 300 MiB entry.
+        // can trigger.
         zipExtractor.setMaxBytes(-1);
         zipExtractor.setMaxContentSize(-1);
         zipExtractor.setMaxCompressionRatio(-1);
+        zipExtractor.setMaxBytesPerEntry(perEntryCap);
         try (InputStream in = new ByteArrayInputStream(data)) {
             zipExtractor.getText(in, null);
             fail();
@@ -507,4 +509,21 @@ public void test_lha_recursionDepth_exceeded() {
             fail();
         }
     }
+
+    @Test
+    public void test_lha_maxInputBytes_capsStaging() {
+        // Stage cap is enforced during the temp-file copy, before LhaFile
+        // is opened. Any blob larger than the cap must be rejected — we use
+        // arbitrary bytes since the failure precedes archive parsing.
+        lhaExtractor.setMaxInputBytes(1024L);
+        final byte[] payload = new byte[4 * 1024];
+        try (InputStream in = new ByteArrayInputStream(payload)) {
+            lhaExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("input size exceeded"));
+        } catch (final IOException e) {
+            fail();
+        }
+    }
 }

From 2a92466827f11cc032d93ea0fbfd5a3f73657717 Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke.sugaya@codelibs.co>
Date: Tue, 5 May 2026 22:44:39 +0900
Subject: [PATCH 4/6] fix(extractor): skip unsupported archive entries; fold
 maxContentSize into read budget

Addresses PR #161 review feedback:

1) Unsupported entries (no registered Extractor) are now skipped without
   buffering, mirroring the pre-defence behaviour. A large irrelevant
   entry (e.g. a video alongside a small .txt) no longer consumes the
   per-entry / total caps that should be reserved for entries the
   crawler actually wants to extract.

2) The legacy maxContentSize field is folded into the per-entry read
   budget (alongside maxBytes and maxBytesPerEntry). A small legacy cap
   (e.g. 10 MiB) now stops the buffer growing past it, instead of being
   checked only after up to 256 MiB+1 had been buffered.

For Zip, the compressed-bytes anchor is also advanced for skipped /
rejected entries so the next supported entry's compression-ratio is
computed against its own compressed bytes, not also those of the
skipped ones.

Tests: updates test_perEntryCapEnforced to use a supported (.txt) entry
and adds three regressions:
- test_zip_unsupportedEntryDoesNotConsumeCaps
- test_tar_unsupportedEntryDoesNotConsumeCaps
- test_zip_maxContentSize_capsBufferBeforePerEntryCap
---
 .../crawler/extractor/impl/LhaExtractor.java  | 65 ++++++++++-----
 .../crawler/extractor/impl/TarExtractor.java  | 69 ++++++++++------
 .../crawler/extractor/impl/ZipExtractor.java  | 76 ++++++++++++------
 .../impl/ArchiveExtractorSecurityTest.java    | 80 +++++++++++++++++--
 4 files changed, 217 insertions(+), 73 deletions(-)

diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
index b28d59df..7e975e13 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
@@ -55,12 +55,20 @@ public class LhaExtractor extends AbstractExtractor {
     /** Logger for this class. */
     private static final Logger logger = LogManager.getLogger(LhaExtractor.class);
 
-    /** Maximum content size for extraction. -1 means no limit. */
+    /**
+     * Legacy total cap on uncompressed bytes actually buffered from
+     * supported entries. The cap is also folded into the read budget so a
+     * single oversized entry cannot be buffered up to
+     * {@link #maxBytesPerEntry} when the user only asked for a much smaller
+     * total. Set to {@code -1} to disable.
+     */
     protected long maxContentSize = -1;
 
     /**
      * Maximum total uncompressed bytes that may be read from all entries
-     * combined. Defaults to 2 GiB. Set to {@code -1} to disable.
+     * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes
+     * from entries that have a registered {@link Extractor} contribute to
+     * this total — unsupported entries are skipped without buffering.
      */
     protected long maxBytes = 1L << 31;
 
@@ -69,7 +77,8 @@ public class LhaExtractor extends AbstractExtractor {
      * Enforced against the actual bytes read from the entry stream (NOT the
      * header-reported size, which is attacker-controlled). Defaults to
      * 256 MiB. Set to {@code -1} to disable. Enforced independently of
-     * {@link #maxBytes}.
+     * {@link #maxBytes}. Only applies to entries that have a registered
+     * {@link Extractor}; an unsupported entry is never buffered.
      */
     protected long maxBytesPerEntry = 256L * 1024L * 1024L;
 
@@ -148,6 +157,17 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     continue;
                 }
 
+                // Decide MIME / extractor up front so an unsupported entry
+                // is skipped without opening its decompressor at all. This
+                // mirrors the legacy behaviour and keeps a large irrelevant
+                // entry from consuming the per-entry / total caps reserved
+                // for entries the crawler actually wants to extract.
+                final String mimeType = mimeTypeHelper.getContentType(null, filename);
+                final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null;
+                if (extractor == null) {
+                    continue;
+                }
+
                 // Read the entry payload through copyBounded so the cap is
                 // enforced against bytes actually decompressed, not the
                 // header-reported size (which is attacker-controlled).
@@ -162,8 +182,17 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     } else {
                         totalReadLimit = Long.MAX_VALUE;
                     }
+                    // Fold maxContentSize into the read budget so a small
+                    // legacy cap is honoured before a large per-entry cap
+                    // can buffer hundreds of MiB into memory.
+                    final long contentReadLimit;
+                    if (maxContentSize >= 0) {
+                        contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L;
+                    } else {
+                        contentReadLimit = Long.MAX_VALUE;
+                    }
                     final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
-                    final long readLimit = Math.min(totalReadLimit, perEntryReadLimit);
+                    final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit);
                     final ByteArrayOutputStream out = new ByteArrayOutputStream();
                     actualBytes = copyBounded(is, out, readLimit);
                     entryBytes = out.toByteArray();
@@ -185,26 +214,20 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 if (maxBytes > 0 && totalBytes > maxBytes) {
                     throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes);
                 }
-                if (maxContentSize != -1 && totalBytes > maxContentSize) {
+                if (maxContentSize >= 0 && totalBytes > maxContentSize) {
                     throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize);
                 }
 
-                final String mimeType = mimeTypeHelper.getContentType(null, filename);
-                if (mimeType != null) {
-                    final Extractor extractor = extractorFactory.getExtractor(mimeType);
-                    if (extractor != null) {
-                        try {
-                            final Map<String, String> map = incrementDepth(params);
-                            map.put(ExtractData.RESOURCE_NAME_KEY, filename);
-                            buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
-                            buf.append('\n');
-                        } catch (final MaxLengthExceededException e) {
-                            throw e;
-                        } catch (final Exception e) {
-                            if (logger.isDebugEnabled()) {
-                                logger.debug("Exception in an internal extractor.", e);
-                            }
-                        }
+                try {
+                    final Map<String, String> map = incrementDepth(params);
+                    map.put(ExtractData.RESOURCE_NAME_KEY, filename);
+                    buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
+                    buf.append('\n');
+                } catch (final MaxLengthExceededException e) {
+                    throw e;
+                } catch (final Exception e) {
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("Exception in an internal extractor.", e);
                     }
                 }
             }
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
index 0839b2c1..602878e5 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
@@ -54,14 +54,19 @@ public class TarExtractor extends AbstractExtractor {
     protected ArchiveStreamFactory archiveStreamFactory;
 
     /**
-     * Maximum content size (legacy field). When set, the actual bytes
-     * read from each entry stream are summed and compared against this limit.
+     * Legacy total cap on uncompressed bytes actually buffered from
+     * supported entries. The cap is also folded into the read budget so a
+     * single oversized entry cannot be buffered up to
+     * {@link #maxBytesPerEntry} when the user only asked for a much smaller
+     * total. Set to {@code -1} to disable.
      */
     protected long maxContentSize = -1;
 
     /**
      * Maximum total uncompressed bytes that may be read from all entries
-     * combined. Defaults to 2 GiB. Set to {@code -1} to disable.
+     * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes
+     * from entries that have a registered {@link Extractor} contribute to
+     * this total — unsupported entries are skipped without buffering.
      */
     protected long maxBytes = 1L << 31;
 
@@ -69,7 +74,9 @@ public class TarExtractor extends AbstractExtractor {
      * Maximum uncompressed bytes that may be buffered for a SINGLE entry.
      * Guards against an oversized entry exhausting the JVM heap when
      * buffered into memory. Defaults to 256 MiB. Set to {@code -1} to
-     * disable. Enforced independently of {@link #maxBytes}.
+     * disable. Enforced independently of {@link #maxBytes}. Only applies to
+     * entries that have a registered {@link Extractor}; an unsupported
+     * entry is never buffered, so this cap is irrelevant for it.
      */
     protected long maxBytesPerEntry = 256L * 1024L * 1024L;
 
@@ -135,6 +142,17 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                     continue;
                 }
 
+                // Decide MIME / extractor up front. An unsupported entry
+                // (e.g. a video alongside a small .txt) is skipped without
+                // buffering, so a large irrelevant entry does not consume
+                // the per-entry / total caps that should be reserved for
+                // entries the crawler actually wants to extract.
+                final String mimeType = mimeTypeHelper.getContentType(null, filename);
+                final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null;
+                if (extractor == null) {
+                    continue;
+                }
+
                 final long actualBytes;
                 final byte[] entryBytes;
                 try {
@@ -144,8 +162,17 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                     } else {
                         totalReadLimit = Long.MAX_VALUE;
                     }
+                    // Fold maxContentSize into the read budget so a small
+                    // legacy cap is honoured before a large per-entry cap
+                    // can buffer hundreds of MiB into memory.
+                    final long contentReadLimit;
+                    if (maxContentSize >= 0) {
+                        contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L;
+                    } else {
+                        contentReadLimit = Long.MAX_VALUE;
+                    }
                     final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
-                    final long readLimit = Math.min(totalReadLimit, perEntryReadLimit);
+                    final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit);
                     final ByteArrayOutputStream out = new ByteArrayOutputStream();
                     actualBytes = copyBounded(ais, out, readLimit);
                     entryBytes = out.toByteArray();
@@ -166,28 +193,22 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                 if (maxBytes > 0 && totalBytes > maxBytes) {
                     throw new MaxLengthExceededException("tar uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes);
                 }
-                if (maxContentSize != -1 && totalBytes > maxContentSize) {
+                if (maxContentSize >= 0 && totalBytes > maxContentSize) {
                     throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize);
                 }
 
-                final String mimeType = mimeTypeHelper.getContentType(null, filename);
-                if (mimeType != null) {
-                    final Extractor extractor = extractorFactory.getExtractor(mimeType);
-                    if (extractor != null) {
-                        try {
-                            final Map<String, String> map = incrementDepth(params);
-                            map.put(ExtractData.RESOURCE_NAME_KEY, filename);
-                            buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
-                            buf.append('\n');
-                            processedEntries++;
-                        } catch (final MaxLengthExceededException e) {
-                            throw e;
-                        } catch (final Exception e) {
-                            failedEntries++;
-                            if (logger.isDebugEnabled()) {
-                                logger.debug("Failed to extract content from archive entry: name={}", filename, e);
-                            }
-                        }
+                try {
+                    final Map<String, String> map = incrementDepth(params);
+                    map.put(ExtractData.RESOURCE_NAME_KEY, filename);
+                    buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
+                    buf.append('\n');
+                    processedEntries++;
+                } catch (final MaxLengthExceededException e) {
+                    throw e;
+                } catch (final Exception e) {
+                    failedEntries++;
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("Failed to extract content from archive entry: name={}", filename, e);
                     }
                 }
             }
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
index eeff5326..bfa83264 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
@@ -61,14 +61,20 @@ public class ZipExtractor extends AbstractExtractor {
     private static final long COMPRESSION_RATIO_MIN_BYTES = 1L << 20; // 1 MiB
 
     /**
-     * The maximum content size (legacy field). When set, the actual bytes
-     * read from each entry stream are summed and compared against this limit.
+     * Legacy total cap on uncompressed bytes actually buffered from
+     * supported entries. The cap is also folded into the read budget so a
+     * single oversized entry cannot be buffered up to
+     * {@link #maxBytesPerEntry} when the user only asked for a much smaller
+     * total. Set to {@code -1} to disable.
      */
     protected long maxContentSize = -1;
 
     /**
      * Maximum total uncompressed bytes that may be read from all entries
-     * combined. Defaults to 2 GiB. Set to {@code -1} to disable.
+     * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes
+     * from entries that have a registered {@link Extractor} contribute to
+     * this total — unsupported entries are skipped without buffering or
+     * draining, mirroring the pre-defence behaviour.
      */
     protected long maxBytes = 1L << 31;
 
@@ -78,6 +84,8 @@ public class ZipExtractor extends AbstractExtractor {
      * 1.9 GiB file inside an otherwise small archive) exhausting the JVM
      * heap when buffered into memory. Defaults to 256 MiB. Set to
      * {@code -1} to disable. Enforced independently of {@link #maxBytes}.
+     * Only applies to entries that have a registered {@link Extractor}; an
+     * unsupported entry is never buffered, so this cap is irrelevant for it.
      */
     protected long maxBytesPerEntry = 256L * 1024L * 1024L;
 
@@ -157,10 +165,28 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 }
                 final String filename = entry.getName();
                 if (entry.isDirectory()) {
+                    lastCompressedBytes = counter.getByteCount();
                     continue;
                 }
                 if (isPathTraversal(filename)) {
                     logger.warn("zip entry rejected: name={} reason=path-traversal", filename);
+                    // Keep the compressed-bytes anchor in step with the
+                    // stream so the next supported entry's ratio is
+                    // computed against ITS own compressed bytes, not also
+                    // those of the rejected entry.
+                    lastCompressedBytes = counter.getByteCount();
+                    continue;
+                }
+
+                // Decide MIME / extractor up front. An unsupported entry
+                // (e.g. a video alongside a small .txt) is skipped without
+                // buffering, so a large irrelevant entry does not consume
+                // the per-entry / total caps that should be reserved for
+                // entries the crawler actually wants to extract.
+                final String mimeType = mimeTypeHelper.getContentType(null, filename);
+                final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null;
+                if (extractor == null) {
+                    lastCompressedBytes = counter.getByteCount();
                     continue;
                 }
 
@@ -174,13 +200,22 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     } else {
                         totalReadLimit = Long.MAX_VALUE;
                     }
+                    // Fold maxContentSize into the read budget so a small
+                    // legacy cap is honoured before a large per-entry cap
+                    // can buffer hundreds of MiB into memory.
+                    final long contentReadLimit;
+                    if (maxContentSize >= 0) {
+                        contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L;
+                    } else {
+                        contentReadLimit = Long.MAX_VALUE;
+                    }
                     // Enforce a per-entry cap independently of the total
                     // cap so that a single oversized entry cannot exhaust
                     // the JVM heap. We read one byte beyond the cap so the
                     // explicit overflow check below can distinguish
                     // "exactly at the cap" from "exceeds the cap".
                     final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
-                    final long readLimit = Math.min(totalReadLimit, perEntryReadLimit);
+                    final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit);
                     final ByteArrayOutputStream out = new ByteArrayOutputStream();
                     actualBytes = copyBounded(ais, out, readLimit);
                     entryBytes = out.toByteArray();
@@ -189,6 +224,7 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     if (logger.isDebugEnabled()) {
                         logger.debug("Failed to read zip entry: name={}", filename, ioe);
                     }
+                    lastCompressedBytes = counter.getByteCount();
                     continue;
                 }
 
@@ -201,7 +237,7 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 if (maxBytes > 0 && totalBytes > maxBytes) {
                     throw new MaxLengthExceededException("zip uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes);
                 }
-                if (maxContentSize != -1 && totalBytes > maxContentSize) {
+                if (maxContentSize >= 0 && totalBytes > maxContentSize) {
                     throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize);
                 }
 
@@ -223,24 +259,18 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                             + (actualBytes / compressed) + " max=" + maxCompressionRatio);
                 }
 
-                final String mimeType = mimeTypeHelper.getContentType(null, filename);
-                if (mimeType != null) {
-                    final Extractor extractor = extractorFactory.getExtractor(mimeType);
-                    if (extractor != null) {
-                        try {
-                            final Map<String, String> map = incrementDepth(params);
-                            map.put(ExtractData.RESOURCE_NAME_KEY, filename);
-                            buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
-                            buf.append('\n');
-                            processedEntries++;
-                        } catch (final MaxLengthExceededException e) {
-                            throw e;
-                        } catch (final Exception e) {
-                            failedEntries++;
-                            if (logger.isDebugEnabled()) {
-                                logger.debug("Failed to extract content from archive entry: name={}", filename, e);
-                            }
-                        }
+                try {
+                    final Map<String, String> map = incrementDepth(params);
+                    map.put(ExtractData.RESOURCE_NAME_KEY, filename);
+                    buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
+                    buf.append('\n');
+                    processedEntries++;
+                } catch (final MaxLengthExceededException e) {
+                    throw e;
+                } catch (final Exception e) {
+                    failedEntries++;
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("Failed to extract content from archive entry: name={}", filename, e);
                     }
                 }
             }
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
index 6c6ba2b4..39aeefc4 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
@@ -445,10 +445,13 @@ public void test_tar_recursionDepth_exceeded() throws Exception {
 
     @Test
     public void test_perEntryCapEnforced() throws Exception {
-        // Build a zip whose single entry exceeds the configured per-entry
-        // cap. The extractor must trip the cap before buffering the whole
-        // payload. We use a small cap (1 MiB) and a slightly larger payload
-        // (2 MiB) so the test stays cheap on parallel/low-memory CI.
+        // Build a zip whose single SUPPORTED entry exceeds the configured
+        // per-entry cap. The extractor must trip the cap before buffering
+        // the whole payload. We use a small cap (1 MiB) and a slightly
+        // larger payload (2 MiB) so the test stays cheap on parallel /
+        // low-memory CI. The extension is .txt so the entry routes through
+        // the registered text/plain extractor — only supported entries are
+        // buffered (and therefore can hit the per-entry memory cap).
         final int perEntryCap = 1024 * 1024;
         final int entrySize = 2 * perEntryCap;
         final byte[] payload = new byte[entrySize];
@@ -468,7 +471,7 @@ public void test_perEntryCapEnforced() throws Exception {
 
         final ByteArrayOutputStream baos = new ByteArrayOutputStream();
         try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) {
-            final ZipArchiveEntry entry = new ZipArchiveEntry("big.bin");
+            final ZipArchiveEntry entry = new ZipArchiveEntry("big.txt");
             entry.setMethod(ZipArchiveEntry.DEFLATED);
             entry.setSize(payload.length);
             entry.setCompressedSize(compBuf.size());
@@ -494,6 +497,73 @@ public void test_perEntryCapEnforced() throws Exception {
         }
     }
 
+    // ---------------------------------------------------------------------
+    // Unsupported entries must NOT consume the per-entry / total caps —
+    // they are skipped without buffering so that supported entries
+    // alongside them still extract successfully (regression for PR #161
+    // review feedback).
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zip_unsupportedEntryDoesNotConsumeCaps() throws Exception {
+        // A "big.bin" payload that, were it to be buffered, would exceed
+        // both the per-entry cap and the total cap. The supported "ok.txt"
+        // alongside it must still extract because no extractor is
+        // registered for application/octet-stream.
+        final byte[] big = new byte[4 * 1024 * 1024];
+        final byte[] data = buildZip(new EntrySpec("big.bin", big), new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8)));
+
+        zipExtractor.setMaxBytes(64 * 1024); // smaller than big.bin
+        zipExtractor.setMaxBytesPerEntry(64 * 1024); // also smaller
+        zipExtractor.setMaxContentSize(64 * 1024);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = zipExtractor.getText(in, null).getContent();
+            assertTrue(content.contains("good"));
+        }
+    }
+
+    @Test
+    public void test_tar_unsupportedEntryDoesNotConsumeCaps() throws Exception {
+        final byte[] big = new byte[4 * 1024 * 1024];
+        final byte[] data = buildTar(new TarEntrySpec("big.bin", big), new TarEntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8)));
+
+        tarExtractor.setMaxBytes(64 * 1024);
+        tarExtractor.setMaxBytesPerEntry(64 * 1024);
+        tarExtractor.setMaxContentSize(64 * 1024);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = tarExtractor.getText(in, null).getContent();
+            assertTrue(content.contains("good"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // maxContentSize is folded into the read budget — a small legacy cap
+    // must trip BEFORE the buffer grows to the much larger per-entry cap
+    // (regression for PR #161 review feedback).
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zip_maxContentSize_capsBufferBeforePerEntryCap() throws Exception {
+        // 4 MiB supported entry; per-entry cap default is large; legacy
+        // maxContentSize is small. Without the fix the buffer would grow
+        // up to maxBytesPerEntry+1 before throwing. With the fix the read
+        // budget is bounded by maxContentSize+1 so buffering stops early.
+        final int legacyCap = 64 * 1024;
+        final byte[] payload = new byte[4 * 1024 * 1024];
+        final byte[] data = buildZip(new EntrySpec("big.txt", payload));
+
+        zipExtractor.setMaxBytes(-1);
+        zipExtractor.setMaxCompressionRatio(-1);
+        zipExtractor.setMaxBytesPerEntry(8L * 1024L * 1024L); // intentionally larger than payload
+        zipExtractor.setMaxContentSize(legacyCap);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("Extracted size is"));
+        }
+    }
+
     @Test
     public void test_lha_recursionDepth_exceeded() {
         final Map<String, String> params = new HashMap<>();

From 7f5b2438bc7feba13f92ca49436455cd96af28c3 Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke@apache.org>
Date: Sat, 16 May 2026 12:30:58 +0900
Subject: [PATCH 5/6] fix(extractor): harden archive defenses from review
 feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses correctness, security, and observability issues raised in the
review of the archive-bomb / Zip-Slip / recursion-depth defenses.

Correctness
- AbstractExtractor: new addOneSaturating(long) clamps the read-limit
  "+1" margin at Long.MAX_VALUE so an admin who sets a cap near
  Long.MAX_VALUE no longer wraps to Long.MIN_VALUE and silently buffers
  zero bytes. All "+1L" sites in Zip/Tar/Lha now go through it.
- AbstractExtractor.copyBounded: a negative limit now throws
  IllegalArgumentException instead of silently returning 0, so a
  misconfiguration upstream surfaces immediately.
- AbstractExtractor.isPathTraversal: normalise backslash to forward
  slash BEFORE calling Paths.get so single-segment names like "a\\.."
  are recognised as traversal on Linux/macOS JVMs (where backslash is
  a literal filename character).

Security
- ZipExtractor: tighten the early signature check to PK\x03\x04 only;
  short-circuit valid empty archives that start with PK\x05\x06; reject
  PK\x07\x08 (data-descriptor) and other non-opening signatures.
- ZipExtractor: flip allowStoredEntriesWithDataDescriptor back to false
  (the pre-PR factory default) to avoid widening the attack surface.
- ZipExtractor: compression-ratio check now uses
  min(header_compressed_size, measured_compressed_size) so a header
  that lies about the compressed size cannot suppress the check.
- TarExtractor: skip PAX extended/global header pseudo-entries before
  incrementing entryCount so they do not consume the maxEntries budget.
- TarExtractor: symbolic/hard-link skip log upgraded from DEBUG to WARN
  for parity with path-traversal rejection.

Observability
- ZipExtractor / TarExtractor / LhaExtractor: emit a summary WARN when
  the entry loop completes normally with failedEntries > 0. Operators
  no longer have to enable DEBUG to notice partial extraction.
- LhaExtractor: widen the per-entry catch to (IOException |
  RuntimeException) and guard against a null InputStream from
  LhaFile#getInputStream — the dangan library does not declare checked
  exceptions for corrupt/unknown compression methods and can return
  null when a header is missing. Failed entries are now logged at WARN
  with the filename and counted in failedEntries.
- LhaExtractor: replace the empty catch on LhaFile#close() with a WARN
  log so a failing close (e.g. EBADF, disk full) is no longer silently
  hidden.
- LhaExtractor: use validateInputStream(in) instead of the bespoke null
  check so the message and exception type match Zip/Tar.

Tests
- AbstractExtractorTest: 9 new isPathTraversal cases (null, empty,
  drive letter, leading slash, leading backslash, lone "..", classic
  traversal, safe relative, NUL char) plus the new single-segment
  backslash regression that exercises the Paths.get fix; 2 new
  addOneSaturating cases.
- ArchiveExtractorSecurityTest: 15 new tests covering the ZIP
  signature variants, the saturating-add at Long.MAX_VALUE, the
  min(header,measured) ratio path, nested-archive recursion-depth
  increment, per-entry/total cap matrix with each cap disabled, the
  CP932 test via Assumptions.assumeTrue, a tightened ratio assertion,
  the Tar PAX-header skip, the Tar per-entry cap, the Tar symlink
  WARN behaviour, and the setMaxArchiveDepth setter.

All 83 tests in the archive-extractor test classes pass. The single
flaky failure in the wider suite (Hc5HttpClientTest#test_doHead_
accessTimeoutTarget) is pre-existing and unrelated.
---
 .../extractor/impl/AbstractExtractor.java     |  39 +-
 .../crawler/extractor/impl/LhaExtractor.java  |  47 ++-
 .../crawler/extractor/impl/TarExtractor.java  |  21 +-
 .../crawler/extractor/impl/ZipExtractor.java  |  64 ++-
 .../extractor/impl/AbstractExtractorTest.java | 109 ++++++
 .../impl/ArchiveExtractorSecurityTest.java    | 367 ++++++++++++++++++
 6 files changed, 603 insertions(+), 44 deletions(-)

diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
index 7a2eb122..19c71822 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java
@@ -263,8 +263,18 @@ protected static boolean isPathTraversal(final String name) {
         if (name.length() >= 2 && name.charAt(1) == ':') {
             return true;
         }
+        // Normalise backslashes to forward slashes BEFORE calling Paths.get().
+        // On Linux (and macOS) a backslash is a literal filename character, so
+        // Paths.get("a\\..")  treats "a\\.." as a SINGLE opaque segment and
+        // normalize() leaves it unchanged — bypassing the ".." segment check.
+        // Unifying to "/" first forces the path parser to recognise each
+        // component correctly on all platforms.
+        final String unified = name.replace('\\', '/');
+        if (unified.startsWith("/")) {
+            return true;
+        }
         try {
-            final Path normalised = Paths.get(name).normalize();
+            final Path normalised = Paths.get(unified).normalize();
             final String normStr = normalised.toString().replace('\\', '/');
             if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) {
                 return true;
@@ -280,6 +290,21 @@ protected static boolean isPathTraversal(final String name) {
         return false;
     }
 
+    /**
+     * Saturating add: returns {@code value + 1} unless that would overflow
+     * {@code Long.MAX_VALUE}, in which case {@code Long.MAX_VALUE} is returned.
+     * Used when computing a read limit that is one byte beyond a cap so that
+     * the caller can detect "exactly at the cap" vs "exceeds the cap" without
+     * silently wrapping to a negative limit and reading nothing.
+     *
+     * @param value the value to increment (must be non-negative)
+     * @return {@code value + 1} or {@code Long.MAX_VALUE} if already at the
+     *         maximum
+     */
+    protected static long addOneSaturating(final long value) {
+        return value >= Long.MAX_VALUE ? Long.MAX_VALUE : value + 1L;
+    }
+
     /**
      * Copies up to {@code limit} bytes from {@code in} to {@code out}, returning
      * the actual number of bytes copied. Used by archive extractors to bound
@@ -288,14 +313,20 @@ protected static boolean isPathTraversal(final String name) {
      *
      * @param in the source stream
      * @param out the sink stream
-     * @param limit the maximum number of bytes to copy (inclusive). Values
-     *              {@code <= 0} cause the method to return without reading.
+     * @param limit the maximum number of bytes to copy (inclusive). Must be
+     *              non-negative; a negative value throws
+     *              {@link IllegalArgumentException} so misconfiguration is
+     *              surfaced immediately rather than silently reading nothing.
      * @return the number of bytes actually copied
+     * @throws IllegalArgumentException if {@code limit} is negative
      * @throws IOException if reading from {@code in} or writing to {@code out}
      *                     fails
      */
     protected static long copyBounded(final InputStream in, final OutputStream out, final long limit) throws IOException {
-        if (limit <= 0) {
+        if (limit < 0) {
+            throw new IllegalArgumentException("copyBounded: limit must be non-negative, got " + limit);
+        }
+        if (limit == 0) {
             return 0;
         }
         final byte[] buffer = new byte[8192];
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
index 7e975e13..9e1b1585 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
@@ -29,7 +29,6 @@
 import org.codelibs.core.io.CloseableUtil;
 import org.codelibs.core.io.FileUtil;
 import org.codelibs.fess.crawler.entity.ExtractData;
-import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.ExtractException;
 import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
 import org.codelibs.fess.crawler.extractor.Extractor;
@@ -116,14 +115,14 @@ public LhaExtractor() {
      */
     @Override
     public ExtractData getText(final InputStream in, final Map<String, String> params) {
-        if (in == null) {
-            throw new CrawlerSystemException("LHA archive input stream is null. Cannot extract text from null input.");
-        }
+        validateInputStream(in);
         checkDepth(params, maxArchiveDepth);
 
         final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
         final ExtractorFactory extractorFactory = getExtractorFactory();
         final StringBuilder buf = new StringBuilder(1000);
+        int processedEntries = 0;
+        int failedEntries = 0;
 
         File tempFile = null;
         LhaFile lhaFile = null;
@@ -133,7 +132,7 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 // Stage the (untrusted) archive bytes to disk under a hard
                 // cap so a hostile producer cannot exhaust local storage by
                 // streaming an arbitrarily large body.
-                final long inputReadLimit = maxInputBytes > 0 ? maxInputBytes + 1L : Long.MAX_VALUE;
+                final long inputReadLimit = maxInputBytes > 0 ? addOneSaturating(maxInputBytes) : Long.MAX_VALUE;
                 final long staged = copyBounded(in, fos, inputReadLimit);
                 if (maxInputBytes > 0 && staged > maxInputBytes) {
                     throw new MaxLengthExceededException("lha input size exceeded: bytes=" + staged + " max=" + maxInputBytes);
@@ -175,10 +174,20 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 final byte[] entryBytes;
                 InputStream is = null;
                 try {
+                    // getInputStream(LhaHeader) can return null when the header
+                    // is not found in the archive index, and can throw
+                    // RuntimeException (e.g. IllegalArgumentException from
+                    // CompressMethod.getCore) for unknown/corrupt compression
+                    // methods — both must be handled before touching `is`.
                     is = lhaFile.getInputStream(head);
+                    if (is == null) {
+                        logger.warn("lha entry stream is null: name={}", filename);
+                        failedEntries++;
+                        continue;
+                    }
                     final long totalReadLimit;
                     if (maxBytes > 0) {
-                        totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
+                        totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes));
                     } else {
                         totalReadLimit = Long.MAX_VALUE;
                     }
@@ -187,19 +196,23 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     // can buffer hundreds of MiB into memory.
                     final long contentReadLimit;
                     if (maxContentSize >= 0) {
-                        contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L;
+                        contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes));
                     } else {
                         contentReadLimit = Long.MAX_VALUE;
                     }
-                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
+                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE;
                     final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit);
                     final ByteArrayOutputStream out = new ByteArrayOutputStream();
                     actualBytes = copyBounded(is, out, readLimit);
                     entryBytes = out.toByteArray();
-                } catch (final IOException ioe) {
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("Failed to read lha entry: name={}", filename, ioe);
-                    }
+                } catch (final IOException | RuntimeException ioe) {
+                    // IOException: normal read failure.
+                    // RuntimeException: includes IllegalArgumentException thrown
+                    // by CompressMethod.getCore() for corrupt/unknown compression
+                    // methods in the dangan LHA library (getInputStream does not
+                    // declare checked exceptions for these cases).
+                    logger.warn("Failed to read lha entry: name={}", filename, ioe);
+                    failedEntries++;
                     continue;
                 } finally {
                     CloseableUtil.closeQuietly(is);
@@ -223,14 +236,20 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     map.put(ExtractData.RESOURCE_NAME_KEY, filename);
                     buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent());
                     buf.append('\n');
+                    processedEntries++;
                 } catch (final MaxLengthExceededException e) {
                     throw e;
                 } catch (final Exception e) {
+                    failedEntries++;
                     if (logger.isDebugEnabled()) {
-                        logger.debug("Exception in an internal extractor.", e);
+                        logger.debug("Exception in an internal extractor: name={}", filename, e);
                     }
                 }
             }
+            // Summary warn when the loop completed normally but some entries failed.
+            if (failedEntries > 0) {
+                logger.warn("LHA archive partially processed: processed={} failed={}", processedEntries, failedEntries);
+            }
         } catch (final MaxLengthExceededException e) {
             throw e;
         } catch (final Exception e) {
@@ -240,7 +259,7 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 try {
                     lhaFile.close();
                 } catch (final IOException e) {
-                    // ignore
+                    logger.warn("Failed to close LHA file. tempFile={}", tempFile, e);
                 }
             }
             FileUtil.deleteInBackground(tempFile);
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
index 602878e5..0bebefc7 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java
@@ -123,6 +123,13 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
             long totalBytes = 0;
             int entryCount = 0;
             while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
+                // PAX extended / global headers are metadata-only pseudo-entries
+                // that must NOT count toward maxEntries. Commons Compress
+                // usually absorbs them, but skip defensively for non-standard
+                // tar dialects that surface them through getNextEntry().
+                if (entry.isPaxHeader() || entry.isGlobalPaxHeader()) {
+                    continue;
+                }
                 entryCount++;
                 if (maxEntries > 0 && entryCount > maxEntries) {
                     throw new MaxLengthExceededException("tar entry count exceeded: count=" + entryCount + " max=" + maxEntries);
@@ -132,9 +139,7 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                     continue;
                 }
                 if (entry.isSymbolicLink() || entry.isLink()) {
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("tar entry skipped: name={} reason=link link={}", filename, entry.getLinkName());
-                    }
+                    logger.warn("tar entry skipped: name={} reason=link link={}", filename, entry.getLinkName());
                     continue;
                 }
                 if (isPathTraversal(filename)) {
@@ -158,7 +163,7 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                 try {
                     final long totalReadLimit;
                     if (maxBytes > 0) {
-                        totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
+                        totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes));
                     } else {
                         totalReadLimit = Long.MAX_VALUE;
                     }
@@ -167,11 +172,11 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                     // can buffer hundreds of MiB into memory.
                     final long contentReadLimit;
                     if (maxContentSize >= 0) {
-                        contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L;
+                        contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes));
                     } else {
                         contentReadLimit = Long.MAX_VALUE;
                     }
-                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
+                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE;
                     final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit);
                     final ByteArrayOutputStream out = new ByteArrayOutputStream();
                     actualBytes = copyBounded(ais, out, readLimit);
@@ -212,6 +217,10 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime
                     }
                 }
             }
+            // Summary warn when the loop completed normally but some entries failed.
+            if (failedEntries > 0) {
+                logger.warn("TAR archive partially processed: processed={} failed={}", processedEntries, failedEntries);
+            }
         } catch (final MaxLengthExceededException e) {
             throw e;
         } catch (final Exception e) {
diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
index bfa83264..ad2f8714 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java
@@ -142,7 +142,19 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 read += n;
             }
             wrapped.reset();
-            if (read != 4 || sig[0] != 'P' || sig[1] != 'K' || (sig[2] != 0x03 && sig[2] != 0x05 && sig[2] != 0x07)) {
+            // ZIP local-file-header:  PK\x03\x04
+            // ZIP empty-archive EOCD: PK\x05\x06
+            // PK\x07\x08 is a data-descriptor signature and must NEVER appear
+            // at the very start of a valid ZIP; reject it along with anything
+            // else that is not a recognised opening signature.
+            if (read < 4 || sig[0] != 'P' || sig[1] != 'K') {
+                throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature.");
+            }
+            if (sig[2] == 0x05 && sig[3] == 0x06) {
+                // Valid but empty archive — short-circuit immediately.
+                return new ExtractData("");
+            }
+            if (sig[2] != 0x03 || sig[3] != 0x04) {
                 throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature.");
             }
         } catch (final IOException ioe) {
@@ -153,7 +165,10 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
         // signal in streaming mode (ZipArchiveEntry#getCompressedSize() is
         // often -1 when entries use a data descriptor).
         final CountingInputStream counter = new CountingInputStream(wrapped);
-        try (final ZipArchiveInputStream ais = new ZipArchiveInputStream(counter, filenameEncoding, true, true)) {
+        // allowStoredEntriesWithDataDescriptor=false (the default) to avoid
+        // widening the attack surface; the pre-PR factory default was also
+        // false and no test requires the looser mode.
+        try (final ZipArchiveInputStream ais = new ZipArchiveInputStream(counter, filenameEncoding, true, false)) {
             ZipArchiveEntry entry;
             long totalBytes = 0;
             long lastCompressedBytes = counter.getByteCount();
@@ -196,7 +211,7 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 try {
                     final long totalReadLimit;
                     if (maxBytes > 0) {
-                        totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L;
+                        totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes));
                     } else {
                         totalReadLimit = Long.MAX_VALUE;
                     }
@@ -205,7 +220,7 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     // can buffer hundreds of MiB into memory.
                     final long contentReadLimit;
                     if (maxContentSize >= 0) {
-                        contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L;
+                        contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes));
                     } else {
                         contentReadLimit = Long.MAX_VALUE;
                     }
@@ -214,7 +229,7 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     // the JVM heap. We read one byte beyond the cap so the
                     // explicit overflow check below can distinguish
                     // "exactly at the cap" from "exceeds the cap".
-                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE;
+                    final long perEntryReadLimit = maxBytesPerEntry > 0 ? addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE;
                     final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit);
                     final ByteArrayOutputStream out = new ByteArrayOutputStream();
                     actualBytes = copyBounded(ais, out, readLimit);
@@ -242,21 +257,26 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                 }
 
                 // Compression-ratio check (only meaningful for non-tiny entries).
-                // Prefer the entry header's compressed size when present;
-                // otherwise fall back to the bytes actually consumed from the
-                // underlying stream during this entry's read.
-                long compressed = entry.getCompressedSize();
-                if (compressed <= 0) {
-                    final long now = counter.getByteCount();
-                    compressed = Math.max(0L, now - lastCompressedBytes);
-                    lastCompressedBytes = now;
-                } else {
-                    lastCompressedBytes = counter.getByteCount();
-                }
-                if (maxCompressionRatio > 0 && compressed > 0 && actualBytes > COMPRESSION_RATIO_MIN_BYTES
-                        && actualBytes / compressed > maxCompressionRatio) {
-                    throw new MaxLengthExceededException("zip compression ratio exceeded: name=" + filename + " ratio="
-                            + (actualBytes / compressed) + " max=" + maxCompressionRatio);
+                // Always measure the bytes actually consumed from the underlying
+                // stream during this entry's read (this is the only reliable
+                // signal in streaming mode where data descriptors are used).
+                // When the header also reports a positive compressed size, take
+                // the minimum of both so an attacker who inflates the header
+                // value cannot suppress the ratio check.
+                final long now = counter.getByteCount();
+                final long measuredCompressed = Math.max(0L, now - lastCompressedBytes);
+                lastCompressedBytes = now;
+                final long headerCompressed = entry.getCompressedSize();
+                final long compressed = (headerCompressed > 0) ? Math.min(headerCompressed, measuredCompressed) : measuredCompressed;
+                if (maxCompressionRatio > 0 && actualBytes > COMPRESSION_RATIO_MIN_BYTES) {
+                    if (compressed == 0) {
+                        // Zero compressed bytes with substantial output is suspicious;
+                        // log it but let the per-entry cap enforce the real limit.
+                        logger.warn("zip entry has 0 compressed bytes with {} uncompressed bytes: name={}", actualBytes, filename);
+                    } else if (actualBytes / compressed > maxCompressionRatio) {
+                        throw new MaxLengthExceededException("zip compression ratio exceeded: name=" + filename + " ratio="
+                                + (actualBytes / compressed) + " max=" + maxCompressionRatio);
+                    }
                 }
 
                 try {
@@ -274,6 +294,10 @@ public ExtractData getText(final InputStream in, final Map<String, String> param
                     }
                 }
             }
+            // Summary warn when the loop completed normally but some entries failed.
+            if (failedEntries > 0) {
+                logger.warn("ZIP archive partially processed: processed={} failed={}", processedEntries, failedEntries);
+            }
         } catch (final MaxLengthExceededException e) {
             throw e;
         } catch (final Exception e) {
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java
index 1a51483f..4054092b 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java
@@ -79,6 +79,15 @@ public Map<String, String> testIncrementDepth(final Map<String, String> params)
         public void testCheckDepth(final Map<String, String> params, final int maxDepth) {
             checkDepth(params, maxDepth);
         }
+
+        // Expose static helpers for testing.
+        public boolean testIsPathTraversal(final String name) {
+            return isPathTraversal(name);
+        }
+
+        public long testAddOneSaturating(final long value) {
+            return addOneSaturating(value);
+        }
     }
 
     private TestExtractor extractor;
@@ -339,4 +348,104 @@ public void test_checkDepth_atOrAboveLimit_throws() {
             // pass
         }
     }
+
+    // -----------------------------------------------------------------------
+    // isPathTraversal tests (C3 fix validation)
+    // -----------------------------------------------------------------------
+
+    /** null and empty are always traversals. */
+    @Test
+    public void test_isPathTraversal_nullAndEmpty() {
+        assertTrue(extractor.testIsPathTraversal(null));
+        assertTrue(extractor.testIsPathTraversal(""));
+    }
+
+    /** Drive letter prefix is always rejected. */
+    @Test
+    public void test_isPathTraversal_driveLetter() {
+        assertTrue(extractor.testIsPathTraversal("C:\\foo"));
+        assertTrue(extractor.testIsPathTraversal("C:foo"));
+    }
+
+    /** Leading slash (Unix absolute) is rejected. */
+    @Test
+    public void test_isPathTraversal_leadingSlash() {
+        assertTrue(extractor.testIsPathTraversal("/etc/passwd"));
+    }
+
+    /** Leading backslash is rejected. */
+    @Test
+    public void test_isPathTraversal_leadingBackslash() {
+        assertTrue(extractor.testIsPathTraversal("\\foo\\bar"));
+    }
+
+    /** Lone ".." is rejected. */
+    @Test
+    public void test_isPathTraversal_loneDotDot() {
+        assertTrue(extractor.testIsPathTraversal(".."));
+    }
+
+    /** Classic traversal sequences are rejected. */
+    @Test
+    public void test_isPathTraversal_classicTraversal() {
+        assertTrue(extractor.testIsPathTraversal("../../etc/passwd"));
+        assertTrue(extractor.testIsPathTraversal("foo/../../etc/passwd"));
+    }
+
+    /** Safe name that resolves inside the root is allowed. */
+    @Test
+    public void test_isPathTraversal_safeRelativePath() {
+        assertFalse(extractor.testIsPathTraversal("foo/bar.txt"));
+        assertFalse(extractor.testIsPathTraversal("foo/../bar.txt")); // normalises to bar.txt
+    }
+
+    /**
+     * Single-segment backslash traversal (C3 regression).
+     * On Linux the path "a\..\..\etc" is a single opaque filename when
+     * Paths.get() is called without pre-normalisation, so ".." segments are
+     * not detected.  After unifying backslash to forward-slash before
+     * Paths.get(), "a/../../etc" normalises to "../etc" which starts with
+     * ".." and is correctly rejected.
+     * Note: "a\.." unifies to "a/.." which normalises to the empty path
+     * (current dir, i.e. the archive root) — that is safe and is NOT rejected.
+     */
+    @Test
+    public void test_isPathTraversal_backslashSingleSegment() {
+        // "a\..\..\etc" must be caught — escapes the archive root.
+        assertTrue(extractor.testIsPathTraversal("a\\..\\..\\etc"));
+        // Three levels up — definitely escapes.
+        assertTrue(extractor.testIsPathTraversal("a\\..\\..\\..")); // escapes
+        // "a\.." normalises to the archive root (current dir) — safe.
+        assertFalse(extractor.testIsPathTraversal("a\\.."));
+        // A purely safe backslash path: "foo\\bar.txt" → "foo/bar.txt" — safe.
+        assertFalse(extractor.testIsPathTraversal("foo\\bar.txt"));
+    }
+
+    /** NUL character in path — should be rejected (InvalidPathException path). */
+    @Test
+    public void test_isPathTraversal_nulCharacter() {
+        assertTrue(extractor.testIsPathTraversal("a\0b"));
+    }
+
+    // -----------------------------------------------------------------------
+    // addOneSaturating (C2 fix validation)
+    // -----------------------------------------------------------------------
+
+    /** addOneSaturating returns value+1 for normal inputs. */
+    @Test
+    public void test_addOneSaturating_normalIncrement() {
+        assertEquals(1L, extractor.testAddOneSaturating(0L));
+        assertEquals(101L, extractor.testAddOneSaturating(100L));
+        assertEquals(Long.MAX_VALUE - 1L, extractor.testAddOneSaturating(Long.MAX_VALUE - 2L));
+    }
+
+    /** addOneSaturating returns Long.MAX_VALUE when already at max. */
+    @Test
+    public void test_addOneSaturating_saturatesAtMax() {
+        assertEquals(Long.MAX_VALUE, extractor.testAddOneSaturating(Long.MAX_VALUE));
+        // (MAX-1)+1 = MAX naturally — not overflow.
+        assertEquals(Long.MAX_VALUE, extractor.testAddOneSaturating(Long.MAX_VALUE - 1L));
+        // Verify the result is positive (not wrapped to negative).
+        assertTrue(extractor.testAddOneSaturating(Long.MAX_VALUE) > 0);
+    }
 }
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
index 39aeefc4..0bab94c4 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java
@@ -31,6 +31,7 @@
 import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.codelibs.fess.crawler.container.StandardCrawlerContainer;
+import org.codelibs.fess.crawler.exception.ExtractException;
 import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
 import org.codelibs.fess.crawler.extractor.ExtractorFactory;
 import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl;
@@ -596,4 +597,370 @@ public void test_lha_maxInputBytes_capsStaging() {
             fail();
         }
     }
+
+    // ---------------------------------------------------------------------
+    // ZIP signature checks (M1)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zip_signatureCheck_rejectsDataDescriptorPrefix() throws Exception {
+        // PK\x07\x08 is a data-descriptor signature and must not appear at
+        // the start of a valid ZIP; reject it as ExtractException.
+        final byte[] data = new byte[] { 'P', 'K', 0x07, 0x08, 0, 0, 0, 0 };
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final ExtractException e) {
+            assertTrue(e.getMessage().contains("ZIP"));
+        }
+    }
+
+    @Test
+    public void test_zip_signatureCheck_acceptsEmptyArchive() throws Exception {
+        // PK\x05\x06 is a valid empty-archive EOCD; extractor must return
+        // empty content rather than throwing.
+        final byte[] eocd = new byte[22];
+        eocd[0] = 'P';
+        eocd[1] = 'K';
+        eocd[2] = 0x05;
+        eocd[3] = 0x06;
+        // Remaining 18 bytes stay 0 (valid minimal EOCD).
+        try (InputStream in = new ByteArrayInputStream(eocd)) {
+            final String content = zipExtractor.getText(in, null).getContent();
+            assertEquals("", content);
+        }
+    }
+
+    @Test
+    public void test_zip_signatureCheck_rejectsTruncatedStream() throws Exception {
+        // 2 bytes — not enough for a valid ZIP magic → ExtractException.
+        final byte[] data = new byte[] { 'P', 'K' };
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final ExtractException e) {
+            assertTrue(e.getMessage().contains("ZIP"));
+        }
+    }
+
+    @Test
+    public void test_zip_signatureCheck_rejectsNonZip() throws Exception {
+        // Completely wrong magic.
+        final byte[] data = "not a zip file at all".getBytes(StandardCharsets.UTF_8);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final ExtractException e) {
+            assertTrue(e.getMessage().contains("ZIP"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Overflow: saturating +1L at Long.MAX_VALUE (C2)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_overflow_saturatingAdd_atLongMaxValue() throws Exception {
+        // With maxBytes=Long.MAX_VALUE a small archive must succeed, not
+        // silently read 0 bytes due to Long overflow wrapping to negative.
+        final byte[] payload = "hello world".getBytes(StandardCharsets.UTF_8);
+        final byte[] data = buildZip(new EntrySpec("ok.txt", payload));
+
+        zipExtractor.setMaxBytes(Long.MAX_VALUE);
+        zipExtractor.setMaxBytesPerEntry(Long.MAX_VALUE);
+        zipExtractor.setMaxContentSize(-1);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = zipExtractor.getText(in, null).getContent();
+            assertTrue(content.contains("hello world"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Compression ratio — min(header, measured) (M3)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zip_compressionRatio_usesMinOfHeaderAndMeasured() throws Exception {
+        // Build a zip where the entry header reports a huge compressedSize.
+        // The ratio check must use the minimum of header vs. measured bytes
+        // so a lying header cannot suppress the check. We build a highly
+        // compressible 2 MiB entry; the measured compressed bytes will be
+        // small, making the ratio >> 100 regardless of the header claim.
+        final byte[] payload = new byte[2 * 1024 * 1024]; // all zeros
+
+        final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION);
+        def.setInput(payload);
+        def.finish();
+        final ByteArrayOutputStream compBuf = new ByteArrayOutputStream();
+        final byte[] tmpBuf = new byte[8192];
+        while (!def.finished()) {
+            final int n = def.deflate(tmpBuf);
+            compBuf.write(tmpBuf, 0, n);
+        }
+        def.end();
+        final java.util.zip.CRC32 crc = new java.util.zip.CRC32();
+        crc.update(payload);
+
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) {
+            final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt");
+            entry.setMethod(ZipArchiveEntry.DEFLATED);
+            entry.setSize(payload.length);
+            // Set a deliberately inflated compressedSize in the header so
+            // that ratio = uncompressed / fakeCompressed would be < threshold.
+            // If the code uses min(header, measured) the measured value wins
+            // and ratio >> threshold => MaxLengthExceededException fires.
+            entry.setCompressedSize(payload.length); // 1:1 — no bomb per header
+            entry.setCrc(crc.getValue());
+            zos.putArchiveEntry(entry);
+            zos.write(payload);
+            zos.closeArchiveEntry();
+            zos.finish();
+        }
+        final byte[] data = baos.toByteArray();
+
+        zipExtractor.setMaxBytes(-1);
+        zipExtractor.setMaxContentSize(-1);
+        // Ratio threshold: 100 — actual ratio >> 100 using measured bytes.
+        zipExtractor.setMaxCompressionRatio(100L);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("compression ratio"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Nested recursion depth (zip-in-zip) (test 6)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zip_nestedRecursionCountsDepth() throws Exception {
+        // Build inner zip containing a text file.
+        final byte[] innerPayload = buildZip(new EntrySpec("inner.txt", "hello".getBytes(StandardCharsets.UTF_8)));
+        // Build outer zip containing the inner zip.
+        final byte[] data = buildZip(new EntrySpec("nested.zip", innerPayload));
+
+        // Allow depth=1 only — outer zip processes ok (depth 0→1),
+        // inner zip invocation is at depth=1 which == maxArchiveDepth → throws.
+        zipExtractor.setMaxArchiveDepth(1);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().toLowerCase().contains("recursion"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Per-entry cap fires when total cap disabled (test 7)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zip_perEntryCap_whenMaxBytesDisabled() throws Exception {
+        final byte[] payload = new byte[2 * 1024 * 1024]; // 2 MiB
+        final byte[] data = buildZip(new EntrySpec("big.txt", payload));
+
+        zipExtractor.setMaxBytes(-1);
+        zipExtractor.setMaxContentSize(-1);
+        zipExtractor.setMaxCompressionRatio(-1);
+        zipExtractor.setMaxBytesPerEntry(1024 * 1024); // 1 MiB cap
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("per-entry size exceeded"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Total cap fires when per-entry cap disabled (test 8)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_zip_maxBytes_whenPerEntryDisabled() throws Exception {
+        final byte[] payload = new byte[64 * 1024]; // 64 KiB each
+        final byte[] data = buildZip(new EntrySpec("a.txt", payload), new EntrySpec("b.txt", payload));
+
+        zipExtractor.setMaxBytesPerEntry(-1); // disable per-entry
+        zipExtractor.setMaxCompressionRatio(-1);
+        zipExtractor.setMaxBytes(64 * 1024); // exactly one entry → second exceeds
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("zip uncompressed size exceeded"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // CP932 filename (m2: use Assumptions instead of silent return)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_cp932Filename_withAssumption() throws Exception {
+        // If MS932 is unavailable, skip with Assumptions rather than a
+        // silent return, so the test result is clearly SKIPPED not PASSED.
+        org.junit.jupiter.api.Assumptions.assumeTrue(Charset.isSupported("MS932"), "MS932 charset not available on this JVM");
+
+        final Charset cp932 = Charset.forName("MS932");
+        final byte[] data = buildZipWithCharset(cp932, new EntrySpec("テスト.txt", "japan".getBytes(StandardCharsets.UTF_8)));
+
+        zipExtractor.setFilenameEncoding("MS932");
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = zipExtractor.getText(in, null).getContent();
+            assertTrue(content.contains("japan"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Compression-ratio message tightened (m3)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_compressionRatioExceeded_messageContainsRatio() throws Exception {
+        // Same high-ratio archive as the existing test; assert the message
+        // specifically contains "compression ratio" (not just "uncompressed
+        // size"), because maxBytes=-1 means the total cap is disabled.
+        final byte[] payload = new byte[2 * 1024 * 1024];
+        final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION);
+        def.setInput(payload);
+        def.finish();
+        final ByteArrayOutputStream compBuf = new ByteArrayOutputStream();
+        final byte[] tmpBuf = new byte[8192];
+        while (!def.finished()) {
+            final int n = def.deflate(tmpBuf);
+            compBuf.write(tmpBuf, 0, n);
+        }
+        def.end();
+        final java.util.zip.CRC32 crc = new java.util.zip.CRC32();
+        crc.update(payload);
+
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) {
+            final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt");
+            entry.setMethod(ZipArchiveEntry.DEFLATED);
+            entry.setSize(payload.length);
+            entry.setCompressedSize(compBuf.size());
+            entry.setCrc(crc.getValue());
+            zos.putArchiveEntry(entry);
+            zos.write(payload);
+            zos.closeArchiveEntry();
+            zos.finish();
+        }
+        final byte[] data = baos.toByteArray();
+
+        zipExtractor.setMaxBytes(-1);
+        zipExtractor.setMaxContentSize(-1);
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            // With maxBytes=-1, only the ratio check can fire.
+            assertTrue(e.getMessage().contains("compression ratio"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Tar PAX global header does not consume entry count (m4)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_tar_paxGlobalHeader_doesNotConsumeEntryCount() throws Exception {
+        // Build a tar that, when read by TarArchiveInputStream, produces a
+        // PAX global header entry followed by a real text entry.  We use a
+        // long filename (>100 chars) with LONGFILE_POSIX mode, which causes
+        // Commons Compress to emit a PAX extended header (type 'x') for the
+        // long name before the real entry.  isPaxHeader() returns true for
+        // type 'x', so the fix must skip it without incrementing entryCount.
+        // With maxEntries=1, if the PAX extension header is counted the real
+        // entry would push the count to 2 and trigger the cap.
+        final String longName = "a".repeat(110) + ".txt"; // > 100-char POSIX limit
+        final byte[] content = "hello".getBytes(StandardCharsets.UTF_8);
+        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (TarArchiveOutputStream tos = new TarArchiveOutputStream(baos)) {
+            tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX);
+            final TarArchiveEntry real = new TarArchiveEntry(longName);
+            real.setSize(content.length);
+            tos.putArchiveEntry(real);
+            tos.write(content);
+            tos.closeArchiveEntry();
+            tos.finish();
+        }
+        final byte[] data = baos.toByteArray();
+
+        tarExtractor.setMaxEntries(1); // only 1 real entry allowed
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String text = tarExtractor.getText(in, null).getContent();
+            assertTrue(text.contains("hello"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Tar per-entry cap enforced (test 13)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_tar_perEntryCapEnforced() throws Exception {
+        final byte[] payload = new byte[2 * 1024 * 1024]; // 2 MiB
+        final byte[] data = buildTar(new TarEntrySpec("big.txt", payload));
+
+        tarExtractor.setMaxBytes(-1);
+        tarExtractor.setMaxContentSize(-1);
+        tarExtractor.setMaxBytesPerEntry(1024 * 1024); // 1 MiB cap
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            tarExtractor.getText(in, null);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().contains("tar per-entry size exceeded"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // Tar symlink skip now at WARN (m9) — verify it does not throw
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_tar_symlinkSkipped_doesNotThrow() throws Exception {
+        // Already covered by test_tar_symlinkSkipped; this confirms the
+        // behaviour is unchanged after upgrading the log level to WARN.
+        final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)),
+                new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_SYMLINK, "/etc/passwd"));
+
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = tarExtractor.getText(in, null).getContent();
+            assertTrue(content.contains("regular"));
+            assertFalse(content.contains("passwd"));
+        }
+    }
+
+    // ---------------------------------------------------------------------
+    // setMaxArchiveDepth changes threshold (test 16)
+    // ---------------------------------------------------------------------
+
+    @Test
+    public void test_setMaxArchiveDepth_changesThreshold() throws Exception {
+        final byte[] data = buildZip(new EntrySpec("ok.txt", "hi".getBytes(StandardCharsets.UTF_8)));
+
+        // depth=3 at maxArchiveDepth=3 → throws
+        zipExtractor.setMaxArchiveDepth(3);
+        final Map<String, String> params3 = new HashMap<>();
+        params3.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3");
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            zipExtractor.getText(in, params3);
+            fail();
+        } catch (final MaxLengthExceededException e) {
+            assertTrue(e.getMessage().toLowerCase().contains("recursion"));
+        }
+
+        // depth=11 at maxArchiveDepth=20 → passes
+        zipExtractor.setMaxArchiveDepth(20);
+        final Map<String, String> params11 = new HashMap<>();
+        params11.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "11");
+        try (InputStream in = new ByteArrayInputStream(data)) {
+            final String content = zipExtractor.getText(in, params11).getContent();
+            assertTrue(content.contains("hi"));
+        }
+    }
 }

From bc2ab790e2d0848c86fde2ccfc78ee9c7431aef0 Mon Sep 17 00:00:00 2001
From: Shinsuke Sugaya <shinsuke@apache.org>
Date: Sat, 16 May 2026 13:24:07 +0900
Subject: [PATCH 6/6] fix(extractor): import CrawlerSystemException in
 LhaExtractor for javadoc

Javadoc tag @throws CrawlerSystemException on getText() referred to a
class not on the file's import list, breaking maven-javadoc-plugin with
'reference not found' on CI.
---
 .../org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
index 9e1b1585..5938f857 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java
@@ -29,6 +29,7 @@
 import org.codelibs.core.io.CloseableUtil;
 import org.codelibs.core.io.FileUtil;
 import org.codelibs.fess.crawler.entity.ExtractData;
+import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.ExtractException;
 import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
 import org.codelibs.fess.crawler.extractor.Extractor;