From 27bc0c58fb644a89cf014a76c40d6b5676ed2050 Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Tue, 5 May 2026 07:31:25 +0900 Subject: [PATCH 1/6] fix(extractor): defend against archive bomb / recursion / Zip Slip Adds defensive limits to ZipExtractor, TarExtractor and LhaExtractor so that a malicious content stream can no longer cause unbounded resource consumption or write outside its conceptual extraction root. Threat model assumption: the InputStream is untrusted; the params Map is admin-configured / trusted. Protections: - AbstractExtractor: extractorDepth param + checkDepth/getCurrentDepth/ incrementDepth helpers, plus configurable maxArchiveDepth (default 10). Recursive archive-in-archive expansions now propagate an incremented depth and abort with MaxLengthExceededException at the threshold. - ZipExtractor: per-archive caps (maxBytes 2 GiB, maxEntries 100k, maxCompressionRatio 100:1 for entries > 1 MiB), Zip Slip path-traversal rejection via Path.normalize(), early ZIP signature validation, and configurable filenameEncoding (UTF-8 / CP932 / MS932) for non-UTF8 archive names. Compression ratio uses the central directory size when available and falls back to bytes consumed from a CountingInputStream. - TarExtractor: maxBytes / maxEntries caps, depth check, path-traversal rejection, and explicit skipping of symbolic-link and hard-link entries to avoid leaking content from outside the archive sandbox. - LhaExtractor: maxBytes / maxEntries caps, depth check, path-traversal rejection. Tests: - AbstractExtractorTest: new coverage for getCurrentDepth / incrementDepth (returns NEW map, leaves input untouched) / checkDepth pass and fail paths. - ArchiveExtractorSecurityTest: 15 synthetic-archive tests covering each threat (zip byte / entry / ratio bombs, Zip Slip, absolute paths, recursion-depth, CP932 filenames, tar symlink/hardlink/path-traversal, tar byte/entry caps and depth, lha depth). All previously-passing extractor tests continue to pass; failures in the suite are limited to environment-dependent tests (Docker for SMB / S3 / GCS / Storage clients, LibreOffice for JodExtractor) which are unchanged by this commit. --- .../extractor/impl/AbstractExtractor.java | 87 ++++ .../crawler/extractor/impl/LhaExtractor.java | 88 +++- .../crawler/extractor/impl/TarExtractor.java | 164 ++++++- .../crawler/extractor/impl/ZipExtractor.java | 267 +++++++++- .../extractor/impl/AbstractExtractorTest.java | 96 ++++ .../impl/ArchiveExtractorSecurityTest.java | 457 ++++++++++++++++++ 6 files changed, 1119 insertions(+), 40 deletions(-) create mode 100644 fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java index 14759e30..bb30c235 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java @@ -18,10 +18,13 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.codelibs.fess.crawler.container.CrawlerContainer; import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; @@ -48,6 +51,14 @@ */ public abstract class AbstractExtractor implements Extractor { + /** + * Parameter key used to track the recursion depth across nested archive + * extraction. Callers/recursive extractor invocations may set this to + * limit how deeply nested archives are unpacked. The value is parsed as + * an integer; missing or unparseable values are treated as depth 0. + */ + public static final String EXTRACTOR_DEPTH_KEY = "extractorDepth"; + /** The crawler container. */ @Resource protected CrawlerContainer crawlerContainer; @@ -55,6 +66,14 @@ public abstract class AbstractExtractor implements Extractor { /** The weight of this extractor. */ protected int weight = 1; + /** + * Maximum allowed depth for recursive archive extraction. When the depth + * value parsed from {@link #EXTRACTOR_DEPTH_KEY} reaches this threshold, + * {@link #checkDepth(Map, int)} aborts further recursion to defend + * against recursion-bomb archives. + */ + protected int maxArchiveDepth = 10; + /** * Constructs a new AbstractExtractor. */ @@ -62,6 +81,74 @@ public AbstractExtractor() { // NOP } + /** + * Sets the maximum allowed recursion depth for nested archive extraction. + * @param maxArchiveDepth the new maximum depth (non-negative) + */ + public void setMaxArchiveDepth(final int maxArchiveDepth) { + this.maxArchiveDepth = maxArchiveDepth; + } + + /** + * Returns the current recursion depth recorded in the extractor params. + * Missing, blank, or unparseable values are treated as {@code 0}. + * + * @param params the extractor parameters (may be {@code null}) + * @return the parsed depth, or {@code 0} if not set + */ + protected int getCurrentDepth(final Map params) { + if (params == null) { + return 0; + } + final String value = params.get(EXTRACTOR_DEPTH_KEY); + if (value == null || value.isBlank()) { + return 0; + } + try { + final int depth = Integer.parseInt(value.trim()); + return depth < 0 ? 0 : depth; + } catch (final NumberFormatException e) { + return 0; + } + } + + /** + * Returns a NEW parameter map (the original is not mutated) with the + * recursion depth incremented by one. Useful when an archive extractor + * recursively delegates to another extractor for a nested archive entry. + * + * @param params the current extractor parameters (may be {@code null}) + * @return a new map containing all original entries plus an incremented + * depth + */ + protected Map incrementDepth(final Map params) { + final Map next = new HashMap<>(); + if (params != null) { + next.putAll(params); + } + next.put(EXTRACTOR_DEPTH_KEY, Integer.toString(getCurrentDepth(params) + 1)); + return next; + } + + /** + * Validates that the recursion depth recorded in {@code params} does not + * meet or exceed {@code maxDepth}. Throws {@link MaxLengthExceededException} + * (a {@link org.codelibs.fess.crawler.exception.CrawlingAccessException + * CrawlingAccessException}) when the threshold is reached so that the + * surrounding crawler treats it as a data-driven access failure rather + * than a system error. + * + * @param params the extractor parameters (may be {@code null}) + * @param maxDepth the (exclusive) maximum allowed depth + * @throws MaxLengthExceededException when {@code currentDepth >= maxDepth} + */ + protected void checkDepth(final Map params, final int maxDepth) { + final int current = getCurrentDepth(params); + if (current >= maxDepth) { + throw new MaxLengthExceededException("Archive recursion depth exceeded: depth=" + current + " max=" + maxDepth); + } + } + @Override public int getWeight() { return weight; diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java index 7a5ff29a..70e2cced 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java @@ -19,8 +19,10 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.file.InvalidPathException; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Enumeration; -import java.util.HashMap; import java.util.Map; import org.apache.logging.log4j.LogManager; @@ -45,6 +47,11 @@ * This extractor can extract text content from files within LHA archives * by using appropriate extractors for each contained file type. * + *

+ * Defends against decompression / many-entry / recursion bombs and Zip Slip + * style path traversal in entry names. + *

+ * * @author shinsuke */ public class LhaExtractor extends AbstractExtractor { @@ -54,6 +61,18 @@ public class LhaExtractor extends AbstractExtractor { /** Maximum content size for extraction. -1 means no limit. */ protected long maxContentSize = -1; + /** + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. + */ + protected long maxBytes = 1L << 31; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. + */ + protected int maxEntries = 100_000; + /** * Creates a new LhaExtractor instance. */ @@ -76,6 +95,7 @@ public ExtractData getText(final InputStream in, final Map param if (in == null) { throw new CrawlerSystemException("LHA archive input stream is null. Cannot extract text from null input."); } + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); @@ -93,13 +113,25 @@ public ExtractData getText(final InputStream in, final Map param @SuppressWarnings("unchecked") final Enumeration entries = lhaFile.entries(); long contentSize = 0; + int entryCount = 0; while (entries.hasMoreElements()) { final LhaHeader head = entries.nextElement(); + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("lha entry count exceeded: count=" + entryCount + " max=" + maxEntries); + } contentSize += head.getOriginalSize(); + if (maxBytes > 0 && contentSize > maxBytes) { + throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + contentSize + " max=" + maxBytes); + } if (maxContentSize != -1 && contentSize > maxContentSize) { throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); } final String filename = head.getPath(); + if (isPathTraversal(filename)) { + logger.warn("lha entry rejected: name={} reason=path-traversal", filename); + continue; + } final String mimeType = mimeTypeHelper.getContentType(null, filename); if (mimeType != null) { final Extractor extractor = extractorFactory.getExtractor(mimeType); @@ -107,10 +139,12 @@ public ExtractData getText(final InputStream in, final Map param InputStream is = null; try { is = lhaFile.getInputStream(head); - final Map map = new HashMap<>(); + final Map map = incrementDepth(params); map.put(ExtractData.RESOURCE_NAME_KEY, filename); buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent()); buf.append('\n'); + } catch (final MaxLengthExceededException e) { + throw e; } catch (final Exception e) { if (logger.isDebugEnabled()) { logger.debug("Exception in an internal extractor.", e); @@ -139,6 +173,40 @@ public ExtractData getText(final InputStream in, final Map param return new ExtractData(buf.toString().trim()); } + /** + * Returns true when the supplied entry name escapes the conceptual + * extraction root via path-traversal segments. + * + * @param name the entry name as reported by the archive + * @return {@code true} if the name should be rejected + */ + protected boolean isPathTraversal(final String name) { + if (name == null || name.isEmpty()) { + return true; + } + if (name.startsWith("/") || name.startsWith("\\")) { + return true; + } + if (name.length() >= 2 && name.charAt(1) == ':') { + return true; + } + try { + final Path normalised = Paths.get(name).normalize(); + final String normStr = normalised.toString().replace('\\', '/'); + if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { + return true; + } + for (final Path part : normalised) { + if ("..".equals(part.toString())) { + return true; + } + } + } catch (final InvalidPathException ipe) { + return true; + } + return false; + } + /** * Sets the maximum content size for extraction. * @@ -147,4 +215,20 @@ public ExtractData getText(final InputStream in, final Map param public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the maximum number of entries that may be iterated. + * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java index b5bb0238..35f4a8f0 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java @@ -15,13 +15,19 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.io.InputStream; -import java.util.HashMap; +import java.nio.file.InvalidPathException; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Map; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.io.IOUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.fess.crawler.entity.ExtractData; @@ -30,12 +36,17 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; import jakarta.annotation.Resource; /** * Extracts text content from TAR archives. + * + *

+ * Defends against decompression / many-entry / recursion bombs and Zip Slip + * style path traversal. Symbolic and hard link entries are skipped because + * they can reference files outside the archive sandbox. + *

*/ public class TarExtractor extends AbstractExtractor { private static final Logger logger = LogManager.getLogger(TarExtractor.class); @@ -47,10 +58,23 @@ public class TarExtractor extends AbstractExtractor { protected ArchiveStreamFactory archiveStreamFactory; /** - * Maximum content size. + * Maximum content size (legacy field). When set, the actual bytes + * read from each entry stream are summed and compared against this limit. */ protected long maxContentSize = -1; + /** + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. + */ + protected long maxBytes = 1L << 31; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. + */ + protected int maxEntries = 100_000; + /** * Creates a new TarExtractor instance. */ @@ -61,10 +85,11 @@ public TarExtractor() { @Override public ExtractData getText(final InputStream in, final Map params) { validateInputStream(in); + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); - return new ExtractData(getTextInternal(in, mimeTypeHelper, extractorFactory)); + return new ExtractData(getTextInternal(in, mimeTypeHelper, extractorFactory, params)); } /** @@ -73,36 +98,83 @@ public ExtractData getText(final InputStream in, final Map param * @param in The input stream. * @param mimeTypeHelper The mime type helper. * @param extractorFactory The extractor factory. + * @param params Extractor parameters used to track recursion depth. * @return A text. */ - protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) { + protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory, + final Map params) { final StringBuilder buf = new StringBuilder(1000); int processedEntries = 0; int failedEntries = 0; try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream("tar", in)) { - TarArchiveEntry entry = null; - long contentSize = 0; + TarArchiveEntry entry; + long totalBytes = 0; + int entryCount = 0; while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) { - contentSize += entry.getSize(); - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("tar entry count exceeded: count=" + entryCount + " max=" + maxEntries); } final String filename = entry.getName(); + if (entry.isDirectory()) { + continue; + } + if (entry.isSymbolicLink() || entry.isLink()) { + if (logger.isDebugEnabled()) { + logger.debug("tar entry skipped: name={} reason=link link={}", filename, entry.getLinkName()); + } + continue; + } + if (isPathTraversal(filename)) { + logger.warn("tar entry rejected: name={} reason=path-traversal", filename); + continue; + } + + final long actualBytes; + final byte[] entryBytes; + try { + final long readLimit; + if (maxBytes > 0) { + readLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + } else { + readLimit = Long.MAX_VALUE; + } + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(ais, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException ioe) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to read tar entry: name={}", filename, ioe); + } + continue; + } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("tar uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize != -1 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + final String mimeType = mimeTypeHelper.getContentType(null, filename); if (mimeType != null) { final Extractor extractor = extractorFactory.getExtractor(mimeType); if (extractor != null) { try { - final Map map = new HashMap<>(); + final Map map = incrementDepth(params); map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent()); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); buf.append('\n'); processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; } catch (final Exception e) { failedEntries++; if (logger.isDebugEnabled()) { - logger.debug("Failed to extract content from archive entry: {}", filename, e); + logger.debug("Failed to extract content from archive entry: name={}", filename, e); } } } @@ -115,13 +187,61 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime throw new ExtractException("Failed to extract content from TAR archive. No entries could be processed.", e); } if (logger.isWarnEnabled()) { - logger.warn("Partial extraction from TAR archive. Processed: {}, Failed: {}", processedEntries, failedEntries, e); + logger.warn("Partial extraction from TAR archive. processed={} failed={}", processedEntries, failedEntries, e); } } return buf.toString().trim(); } + /** + * Returns true when the supplied entry name escapes the conceptual + * extraction root via path-traversal segments. + * + * @param name the entry name as reported by the archive + * @return {@code true} if the name should be rejected + */ + protected boolean isPathTraversal(final String name) { + if (name == null || name.isEmpty()) { + return true; + } + if (name.startsWith("/") || name.startsWith("\\")) { + return true; + } + if (name.length() >= 2 && name.charAt(1) == ':') { + return true; + } + try { + final Path normalised = Paths.get(name).normalize(); + final String normStr = normalised.toString().replace('\\', '/'); + if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { + return true; + } + for (final Path part : normalised) { + if ("..".equals(part.toString())) { + return true; + } + } + } catch (final InvalidPathException ipe) { + return true; + } + return false; + } + + private long copyBounded(final InputStream in, final ByteArrayOutputStream out, final long limit) throws IOException { + if (limit <= 0) { + return 0; + } + final byte[] buffer = new byte[8192]; + long total = 0; + int read; + while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) { + out.write(buffer, 0, read); + total += read; + } + return total; + } + /** * Sets the maximum content size. * @param maxContentSize The maximum content size to set. @@ -129,4 +249,20 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the maximum number of entries that may be iterated. + * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java index a543b3a9..ee4a9370 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java @@ -16,13 +16,19 @@ package org.codelibs.fess.crawler.extractor.impl; import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.io.InputStream; -import java.util.HashMap; +import java.nio.file.InvalidPathException; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Map; -import org.apache.commons.compress.archivers.ArchiveInputStream; -import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.CountingInputStream; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.fess.crawler.entity.ExtractData; @@ -31,26 +37,64 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; - -import jakarta.annotation.Resource; /** * Extracts text content from ZIP archives. + * + *

+ * The extractor defends against several content-driven attack vectors. The + * input stream itself is treated as untrusted, while the {@code params} map is + * assumed to be admin-configured / trusted. Protections include: + *

+ *
    + *
  • Total uncompressed-size cap ({@link #setMaxBytes(long)})
  • + *
  • Maximum number of entries ({@link #setMaxEntries(int)})
  • + *
  • Per-entry compression-ratio threshold + * ({@link #setMaxCompressionRatio(long)}) to detect zip bombs
  • + *
  • Recursion-depth check (via {@link AbstractExtractor#checkDepth})
  • + *
  • Zip Slip path-traversal detection (entry names normalised and + * rejected when they escape the conceptual extraction root)
  • + *
  • Configurable filename encoding (e.g. {@code "CP932"} / + * {@code "MS932"} for Japanese filenames)
  • + *
*/ public class ZipExtractor extends AbstractExtractor { private static final Logger logger = LogManager.getLogger(ZipExtractor.class); + /** Threshold below which compression-ratio checks are skipped (bytes). */ + private static final long COMPRESSION_RATIO_MIN_BYTES = 1L << 20; // 1 MiB + + /** + * The maximum content size (legacy field). When set, the actual bytes + * read from each entry stream are summed and compared against this limit. + */ + protected long maxContentSize = -1; + /** - * The archive stream factory. + * Maximum total uncompressed bytes that may be read from all entries + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. */ - @Resource - protected ArchiveStreamFactory archiveStreamFactory; + protected long maxBytes = 1L << 31; /** - * The maximum content size. + * Maximum allowed compression ratio (uncompressed / compressed). Entries + * exceeding this ratio AND larger than 1 MiB are rejected as suspected + * zip bombs. Set to {@code -1} to disable. */ - protected long maxContentSize = -1; + protected long maxCompressionRatio = 100L; + + /** + * Maximum allowed number of entries to iterate. Defaults to 100,000. + * Set to {@code -1} to disable. + */ + protected int maxEntries = 100_000; + + /** + * Filename encoding used to decode entry names that lack the UTF-8 flag. + * Defaults to {@code "UTF-8"}; set to {@code "CP932"} or {@code "MS932"} + * for archives created on Japanese Windows systems. + */ + protected String filenameEncoding = "UTF-8"; /** * Creates a new ZipExtractor instance. @@ -62,6 +106,7 @@ public ZipExtractor() { @Override public ExtractData getText(final InputStream in, final Map params) { validateInputStream(in); + checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); @@ -69,30 +114,114 @@ public ExtractData getText(final InputStream in, final Map param int processedEntries = 0; int failedEntries = 0; - try (final ArchiveInputStream ais = - archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) { - ZipArchiveEntry entry = null; - long contentSize = 0; - while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) { - contentSize += entry.getSize(); - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); + final InputStream wrapped = in.markSupported() ? in : new BufferedInputStream(in); + // Early-validate the ZIP magic so a clearly non-zip blob is reported + // as ExtractException rather than silently returning empty text. + wrapped.mark(4); + try { + final byte[] sig = new byte[4]; + int read = 0; + while (read < 4) { + final int n = wrapped.read(sig, read, 4 - read); + if (n < 0) { + break; + } + read += n; + } + wrapped.reset(); + if (read != 4 || sig[0] != 'P' || sig[1] != 'K' || (sig[2] != 0x03 && sig[2] != 0x05 && sig[2] != 0x07)) { + throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature."); + } + } catch (final IOException ioe) { + throw new ExtractException("Failed to extract content from ZIP archive. No entries could be processed.", ioe); + } + // CountingInputStream lets us measure the compressed bytes consumed + // from the underlying stream per entry, which is the only reliable + // signal in streaming mode (ZipArchiveEntry#getCompressedSize() is + // often -1 when entries use a data descriptor). + final CountingInputStream counter = new CountingInputStream(wrapped); + try (final ZipArchiveInputStream ais = new ZipArchiveInputStream(counter, filenameEncoding, true, true)) { + ZipArchiveEntry entry; + long totalBytes = 0; + long lastCompressedBytes = counter.getByteCount(); + int entryCount = 0; + while ((entry = ais.getNextEntry()) != null) { + entryCount++; + if (maxEntries > 0 && entryCount > maxEntries) { + throw new MaxLengthExceededException("zip entry count exceeded: count=" + entryCount + " max=" + maxEntries); } final String filename = entry.getName(); + if (entry.isDirectory()) { + continue; + } + if (isPathTraversal(filename)) { + logger.warn("zip entry rejected: name={} reason=path-traversal", filename); + continue; + } + + // Read entry into bounded buffer while counting actual bytes. + final long actualBytes; + final byte[] entryBytes; + try { + final long readLimit; + if (maxBytes > 0) { + readLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + } else { + readLimit = Long.MAX_VALUE; + } + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(ais, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException ioe) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to read zip entry: name={}", filename, ioe); + } + continue; + } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("zip uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize != -1 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + + // Compression-ratio check (only meaningful for non-tiny entries). + // Prefer the entry header's compressed size when present; + // otherwise fall back to the bytes actually consumed from the + // underlying stream during this entry's read. + long compressed = entry.getCompressedSize(); + if (compressed <= 0) { + final long now = counter.getByteCount(); + compressed = Math.max(0L, now - lastCompressedBytes); + lastCompressedBytes = now; + } else { + lastCompressedBytes = counter.getByteCount(); + } + if (maxCompressionRatio > 0 && compressed > 0 && actualBytes > COMPRESSION_RATIO_MIN_BYTES + && actualBytes / compressed > maxCompressionRatio) { + throw new MaxLengthExceededException("zip compression ratio exceeded: name=" + filename + " ratio=" + + (actualBytes / compressed) + " max=" + maxCompressionRatio); + } + final String mimeType = mimeTypeHelper.getContentType(null, filename); if (mimeType != null) { final Extractor extractor = extractorFactory.getExtractor(mimeType); if (extractor != null) { try { - final Map map = new HashMap<>(); + final Map map = incrementDepth(params); map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent()); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); buf.append('\n'); processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; } catch (final Exception e) { failedEntries++; if (logger.isDebugEnabled()) { - logger.debug("Failed to extract content from archive entry: {}", filename, e); + logger.debug("Failed to extract content from archive entry: name={}", filename, e); } } } @@ -105,13 +234,68 @@ public ExtractData getText(final InputStream in, final Map param throw new ExtractException("Failed to extract content from ZIP archive. No entries could be processed.", e); } if (logger.isWarnEnabled()) { - logger.warn("Partial extraction from ZIP archive. Processed: {}, Failed: {}", processedEntries, failedEntries, e); + logger.warn("Partial extraction from ZIP archive. processed={} failed={}", processedEntries, failedEntries, e); } } return new ExtractData(buf.toString().trim()); } + /** + * Returns true when the supplied entry name escapes the conceptual + * extraction root via path-traversal segments. The check is performed on + * a normalised form of the path. + * + * @param name the entry name as reported by the archive + * @return {@code true} if the name should be rejected + */ + protected boolean isPathTraversal(final String name) { + if (name == null || name.isEmpty()) { + return true; + } + // Absolute paths (Unix or Windows-style) are unsafe in the + // context of an archive extracted into a sandbox root. + if (name.startsWith("/") || name.startsWith("\\")) { + return true; + } + if (name.length() >= 2 && name.charAt(1) == ':') { + return true; + } + try { + final Path normalised = Paths.get(name).normalize(); + final String normStr = normalised.toString().replace('\\', '/'); + if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { + return true; + } + for (final Path part : normalised) { + if ("..".equals(part.toString())) { + return true; + } + } + } catch (final InvalidPathException ipe) { + return true; + } + return false; + } + + /** + * Copies up to {@code limit} bytes from {@code in} to {@code out}. Returns + * the actual number of bytes copied. + */ + private long copyBounded(final InputStream in, final ByteArrayOutputStream out, final long limit) throws IOException { + if (limit <= 0) { + return 0; + } + final byte[] buffer = new byte[8192]; + long total = 0; + int read; + while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) { + out.write(buffer, 0, read); + total += read; + } + return total; + } + /** * Sets the maximum content size. * @param maxContentSize The maximum content size to set. @@ -119,4 +303,39 @@ public ExtractData getText(final InputStream in, final Map param public void setMaxContentSize(final long maxContentSize) { this.maxContentSize = maxContentSize; } -} \ No newline at end of file + + /** + * Sets the cap on total uncompressed bytes read from all entries. + * @param maxBytes the maximum total bytes (use {@code -1} to disable) + */ + public void setMaxBytes(final long maxBytes) { + this.maxBytes = maxBytes; + } + + /** + * Sets the maximum permitted uncompressed/compressed ratio per entry. + * @param maxCompressionRatio the threshold (use {@code -1} to disable) + */ + public void setMaxCompressionRatio(final long maxCompressionRatio) { + this.maxCompressionRatio = maxCompressionRatio; + } + + /** + * Sets the maximum number of entries that may be iterated. + * @param maxEntries the maximum entry count (use {@code -1} to disable) + */ + public void setMaxEntries(final int maxEntries) { + this.maxEntries = maxEntries; + } + + /** + * Sets the filename encoding used to decode entry names that lack the + * UTF-8 flag (e.g. {@code "CP932"} / {@code "MS932"} for Japanese + * archives). + * + * @param filenameEncoding the charset name + */ + public void setFilenameEncoding(final String filenameEncoding) { + this.filenameEncoding = filenameEncoding; + } +} diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java index 30e7db86..1a51483f 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java @@ -17,10 +17,12 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; +import java.util.HashMap; import java.util.Map; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.dbflute.utflute.core.PlainTestCase; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -64,6 +66,19 @@ public void resetTestState() { public void testValidateInputStream(final InputStream in) { validateInputStream(in); } + + // Expose depth helpers for testing. + public int testGetCurrentDepth(final Map params) { + return getCurrentDepth(params); + } + + public Map testIncrementDepth(final Map params) { + return incrementDepth(params); + } + + public void testCheckDepth(final Map params, final int maxDepth) { + checkDepth(params, maxDepth); + } } private TestExtractor extractor; @@ -243,4 +258,85 @@ public void test_validateInputStream_throwsCorrectExceptionType() { fail(); } } + + /** Recursion-depth helper: missing/null params return 0. */ + @Test + public void test_getCurrentDepth_returnsZeroForMissing() { + assertEquals(0, extractor.testGetCurrentDepth(null)); + assertEquals(0, extractor.testGetCurrentDepth(new HashMap<>())); + final Map blank = new HashMap<>(); + blank.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, ""); + assertEquals(0, extractor.testGetCurrentDepth(blank)); + final Map garbage = new HashMap<>(); + garbage.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "not-a-number"); + assertEquals(0, extractor.testGetCurrentDepth(garbage)); + } + + /** Recursion-depth helper: depth value is parsed and clamped to >= 0. */ + @Test + public void test_getCurrentDepth_parsesValidValue() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + assertEquals(3, extractor.testGetCurrentDepth(params)); + + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "-5"); + assertEquals(0, extractor.testGetCurrentDepth(params)); + } + + /** incrementDepth must return a NEW map and not mutate the input. */ + @Test + public void test_incrementDepth_returnsNewMap() { + final Map original = new HashMap<>(); + original.put("foo", "bar"); + final Map next = extractor.testIncrementDepth(original); + + assertFalse(original == next); + // original is unchanged + assertFalse(original.containsKey(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + assertEquals("bar", next.get("foo")); + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + + final Map after = extractor.testIncrementDepth(next); + assertEquals("2", after.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + // first map still says "1" + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + /** incrementDepth on null produces depth=1. */ + @Test + public void test_incrementDepth_nullInput() { + final Map next = extractor.testIncrementDepth(null); + assertNotNull(next); + assertEquals("1", next.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + /** checkDepth allows depths below the limit. */ + @Test + public void test_checkDepth_belowLimit_passes() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + extractor.testCheckDepth(params, 10); // no throw + extractor.testCheckDepth(null, 10); + } + + /** checkDepth rejects depths at or above the limit. */ + @Test + public void test_checkDepth_atOrAboveLimit_throws() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + try { + extractor.testCheckDepth(params, 10); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("recursion depth")); + } + + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "99"); + try { + extractor.testCheckDepth(params, 10); + fail(); + } catch (final MaxLengthExceededException e) { + // pass + } + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java new file mode 100644 index 00000000..1be37ccd --- /dev/null +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java @@ -0,0 +1,457 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.extractor.impl; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.compress.archivers.ArchiveStreamFactory; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.extractor.ExtractorFactory; +import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; +import org.dbflute.utflute.core.PlainTestCase; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; + +/** + * Security-oriented tests that validate the archive-bomb / recursion / Zip + * Slip / link-skipping defences added to the Zip / Tar / Lha extractors. + * + *

+ * Synthetic archives are constructed in-memory with Apache Commons Compress + * so the tests are fully self-contained. + *

+ */ +public class ArchiveExtractorSecurityTest extends PlainTestCase { + + private ZipExtractor zipExtractor; + private TarExtractor tarExtractor; + private LhaExtractor lhaExtractor; + + @Override + protected void setUp(final TestInfo testInfo) throws Exception { + super.setUp(testInfo); + final StandardCrawlerContainer container = new StandardCrawlerContainer(); + container.singleton("archiveStreamFactory", ArchiveStreamFactory.class) + .singleton("compressorStreamFactory", CompressorStreamFactory.class) + .singleton("mimeTypeHelper", MimeTypeHelperImpl.class) + .singleton("textExtractor", TextExtractor.class) + .singleton("zipExtractor", ZipExtractor.class) + .singleton("tarExtractor", TarExtractor.class) + .singleton("lhaExtractor", LhaExtractor.class) + . singleton("extractorFactory", ExtractorFactory.class, factory -> { + final TextExtractor textExtractor = container.getComponent("textExtractor"); + final ZipExtractor zip = container.getComponent("zipExtractor"); + final TarExtractor tar = container.getComponent("tarExtractor"); + final LhaExtractor lha = container.getComponent("lhaExtractor"); + factory.addExtractor("text/plain", textExtractor); + factory.addExtractor("application/zip", zip); + factory.addExtractor("application/x-tar", tar); + factory.addExtractor("application/x-lha", lha); + }); + + zipExtractor = container.getComponent("zipExtractor"); + tarExtractor = container.getComponent("tarExtractor"); + lhaExtractor = container.getComponent("lhaExtractor"); + } + + // --------------------------------------------------------------------- + // Helpers + // --------------------------------------------------------------------- + + private byte[] buildZip(final EntrySpec... specs) throws IOException { + return buildZipWithCharset(StandardCharsets.UTF_8, specs); + } + + private byte[] buildZipWithCharset(final Charset charset, final EntrySpec... specs) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + zos.setEncoding(charset.name()); + // Disable the UTF-8 flag so the encoding parameter is honoured by + // ZipArchiveInputStream during read. + zos.setUseLanguageEncodingFlag(false); + zos.setCreateUnicodeExtraFields(ZipArchiveOutputStream.UnicodeExtraFieldPolicy.NEVER); + for (final EntrySpec spec : specs) { + final ZipArchiveEntry entry = new ZipArchiveEntry(spec.name); + zos.putArchiveEntry(entry); + if (spec.content != null) { + zos.write(spec.content); + } + zos.closeArchiveEntry(); + } + zos.finish(); + } + return baos.toByteArray(); + } + + private byte[] buildTar(final TarEntrySpec... specs) throws IOException { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (TarArchiveOutputStream tos = new TarArchiveOutputStream(baos)) { + tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX); + for (final TarEntrySpec spec : specs) { + final TarArchiveEntry entry; + if (spec.linkType != 0) { + entry = new TarArchiveEntry(spec.name, spec.linkType); + if (spec.linkName != null) { + entry.setLinkName(spec.linkName); + } + } else { + entry = new TarArchiveEntry(spec.name); + entry.setSize(spec.content == null ? 0 : spec.content.length); + } + tos.putArchiveEntry(entry); + if (spec.linkType == 0 && spec.content != null) { + tos.write(spec.content); + } + tos.closeArchiveEntry(); + } + tos.finish(); + } + return baos.toByteArray(); + } + + private static final class EntrySpec { + final String name; + final byte[] content; + + EntrySpec(final String name, final byte[] content) { + this.name = name; + this.content = content; + } + } + + private static final class TarEntrySpec { + final String name; + final byte[] content; + final byte linkType; + final String linkName; + + TarEntrySpec(final String name, final byte[] content) { + this(name, content, (byte) 0, null); + } + + TarEntrySpec(final String name, final byte[] content, final byte linkType, final String linkName) { + this.name = name; + this.content = content; + this.linkType = linkType; + this.linkName = linkName; + } + } + + // --------------------------------------------------------------------- + // Zip — byte-limit bomb + // --------------------------------------------------------------------- + + @Test + public void test_zipBomb_byteLimit() throws Exception { + final byte[] payload = new byte[64 * 1024]; + final byte[] data = buildZip(new EntrySpec("a.txt", payload), new EntrySpec("b.txt", payload), new EntrySpec("c.txt", payload)); + + zipExtractor.setMaxBytes(64 * 1024); // exactly one entry's worth -> 2nd should fail + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("zip uncompressed size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Zip — many-entry bomb + // --------------------------------------------------------------------- + + @Test + public void test_zipBomb_entryLimit() throws Exception { + final EntrySpec[] specs = new EntrySpec[20]; + for (int i = 0; i < specs.length; i++) { + specs[i] = new EntrySpec("e" + i + ".txt", new byte[0]); + } + final byte[] data = buildZip(specs); + + zipExtractor.setMaxEntries(5); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("zip entry count exceeded")); + } + } + + // --------------------------------------------------------------------- + // Zip — Zip Slip path traversal + // --------------------------------------------------------------------- + + @Test + public void test_zipSlip_pathTraversal() throws Exception { + final byte[] data = buildZip(new EntrySpec("../../etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + // Bad entry must be skipped; good entry must still be processed. + assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + @Test + public void test_zipSlip_absolutePath() throws Exception { + final byte[] data = buildZip(new EntrySpec("/etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // Recursion-depth bomb + // --------------------------------------------------------------------- + + @Test + public void test_recursionDepth_exceeded() throws Exception { + final byte[] data = buildZip(new EntrySpec("ok.txt", "hello".getBytes(StandardCharsets.UTF_8))); + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); // == default max + + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + } + + @Test + public void test_recursionDepth_belowLimit_succeeds() throws Exception { + final byte[] data = buildZip(new EntrySpec("ok.txt", "hello".getBytes(StandardCharsets.UTF_8))); + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, params).getContent(); + assertTrue(content.contains("hello")); + } + // Original params must be unchanged. + assertEquals("3", params.get(AbstractExtractor.EXTRACTOR_DEPTH_KEY)); + } + + // --------------------------------------------------------------------- + // CP932 / non-UTF-8 filename encoding + // --------------------------------------------------------------------- + + @Test + public void test_cp932Filename() throws Exception { + final Charset cp932; + try { + cp932 = Charset.forName("MS932"); + } catch (final Exception e) { + // CP932/MS932 not available on this JVM; skip. + return; + } + + final byte[] data = buildZipWithCharset(cp932, new EntrySpec("テスト.txt", "japan".getBytes(StandardCharsets.UTF_8))); + + // Default UTF-8 encoding may mojibake the filename, but once we set + // CP932 the filename should round-trip cleanly. We assert by + // inspecting the entry list directly via the public API: setting the + // proper encoding allows the .txt suffix to be detected and the + // entry's content extracted. + zipExtractor.setFilenameEncoding("MS932"); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("japan")); + } + } + + // --------------------------------------------------------------------- + // Tar — symlink / hardlink entries are skipped + // --------------------------------------------------------------------- + + @Test + public void test_tar_symlinkSkipped() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_SYMLINK, "/etc/passwd")); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("regular")); + // Symlink target text must NOT leak into the output. + assertFalse(content.contains("/etc/passwd")); + } + } + + @Test + public void test_tar_hardlinkSkipped() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_LINK, "ok.txt")); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("regular")); + // The hardlink should not have introduced a duplicate of the + // referenced entry's content. + assertEquals(content.indexOf("regular"), content.lastIndexOf("regular")); + } + } + + @Test + public void test_tar_pathTraversal() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("../../etc/passwd", "evil".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertFalse(content.contains("evil")); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // Compression-ratio bomb — produce a highly-compressible big entry + // --------------------------------------------------------------------- + + @Test + public void test_compressionRatioExceeded() throws Exception { + // 2 MiB of zeroes compresses extremely well, well above the 100:1 + // default threshold. Build the entry with explicit method/size/crc so + // the local file header carries the compressed size (otherwise a + // streaming DEFLATED entry uses a data descriptor, leaving + // ZipArchiveEntry#getCompressedSize() as -1 and bypassing the ratio + // check). + final byte[] payload = new byte[2 * 1024 * 1024]; + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + entry.setCompressedSize(compBuf.size()); + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + // Disable the byte cap so the compression-ratio check is the one that + // fires. + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("compression ratio") || e.getMessage().contains("uncompressed size")); + } + } + + // --------------------------------------------------------------------- + // Tar byte/entry limits + // --------------------------------------------------------------------- + + @Test + public void test_tarBomb_byteLimit() throws Exception { + final byte[] payload = new byte[64 * 1024]; + final byte[] data = buildTar(new TarEntrySpec("a.txt", payload), new TarEntrySpec("b.txt", payload)); + + tarExtractor.setMaxBytes(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("tar uncompressed size exceeded")); + } + } + + @Test + public void test_tarBomb_entryLimit() throws Exception { + final TarEntrySpec[] specs = new TarEntrySpec[20]; + for (int i = 0; i < specs.length; i++) { + specs[i] = new TarEntrySpec("e" + i + ".txt", new byte[0]); + } + final byte[] data = buildTar(specs); + + tarExtractor.setMaxEntries(5); + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("tar entry count exceeded")); + } + } + + @Test + public void test_tar_recursionDepth_exceeded() throws Exception { + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "hi".getBytes(StandardCharsets.UTF_8))); + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + } + + // --------------------------------------------------------------------- + // Lha recursion-depth check (uses isPathTraversal helper too) + // --------------------------------------------------------------------- + + @Test + public void test_lha_recursionDepth_exceeded() { + final Map params = new HashMap<>(); + params.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "10"); + // We pass a tiny non-archive stream; the depth check fires before + // the LHA library is invoked. + try (InputStream in = new ByteArrayInputStream("dummy".getBytes(StandardCharsets.UTF_8))) { + lhaExtractor.getText(in, params); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } catch (final IOException e) { + fail(); + } + } +} From 102b22fde5b28965b215fa11adf3c3bf7b5b1ebc Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Tue, 5 May 2026 12:39:26 +0900 Subject: [PATCH 2/6] fix(extractor): add per-entry size cap and dedupe Zip Slip / copy-bounded helpers --- .../extractor/impl/AbstractExtractor.java | 78 +++++++++++++++ .../crawler/extractor/impl/LhaExtractor.java | 62 +++++------- .../crawler/extractor/impl/TarExtractor.java | 83 ++++++---------- .../crawler/extractor/impl/ZipExtractor.java | 96 +++++++------------ .../impl/ArchiveExtractorSecurityTest.java | 53 ++++++++++ 5 files changed, 217 insertions(+), 155 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java index bb30c235..7a2eb122 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java @@ -18,10 +18,15 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.InvalidPathException; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.commons.io.IOUtils; import org.codelibs.fess.crawler.container.CrawlerContainer; import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.MaxLengthExceededException; @@ -229,4 +234,77 @@ protected void validateInputStream(final InputStream in) { throw new CrawlerSystemException("The inputstream is null."); } } + + /** + * Returns true when the supplied entry name escapes the conceptual + * extraction root via path-traversal segments. The check is performed on + * a normalised form of the path and is shared between the archive + * extractors (Zip / Tar / Lha) so the rejection rules stay in lock step. + * + *

+ * An entry is rejected when it is null/empty, when it is rooted at + * {@code /} or {@code \}, when it begins with a Windows drive letter + * (e.g. {@code C:}), when its normalised form contains a {@code ..} + * segment, or when {@link Paths#get} treats it as malformed. + *

+ * + * @param name the entry name as reported by the archive + * @return {@code true} if the name should be rejected + */ + protected static boolean isPathTraversal(final String name) { + if (name == null || name.isEmpty()) { + return true; + } + // Absolute paths (Unix or Windows-style) are unsafe in the + // context of an archive extracted into a sandbox root. + if (name.startsWith("/") || name.startsWith("\\")) { + return true; + } + if (name.length() >= 2 && name.charAt(1) == ':') { + return true; + } + try { + final Path normalised = Paths.get(name).normalize(); + final String normStr = normalised.toString().replace('\\', '/'); + if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { + return true; + } + for (final Path part : normalised) { + if ("..".equals(part.toString())) { + return true; + } + } + } catch (final InvalidPathException ipe) { + return true; + } + return false; + } + + /** + * Copies up to {@code limit} bytes from {@code in} to {@code out}, returning + * the actual number of bytes copied. Used by archive extractors to bound + * the amount of memory consumed when buffering an entry's uncompressed + * payload. + * + * @param in the source stream + * @param out the sink stream + * @param limit the maximum number of bytes to copy (inclusive). Values + * {@code <= 0} cause the method to return without reading. + * @return the number of bytes actually copied + * @throws IOException if reading from {@code in} or writing to {@code out} + * fails + */ + protected static long copyBounded(final InputStream in, final OutputStream out, final long limit) throws IOException { + if (limit <= 0) { + return 0; + } + final byte[] buffer = new byte[8192]; + long total = 0; + int read; + while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) { + out.write(buffer, 0, read); + total += read; + } + return total; + } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java index 70e2cced..b6668fd3 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java @@ -19,9 +19,6 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.file.InvalidPathException; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Enumeration; import java.util.Map; @@ -67,6 +64,14 @@ public class LhaExtractor extends AbstractExtractor { */ protected long maxBytes = 1L << 31; + /** + * Maximum uncompressed bytes for a SINGLE entry (as reported by the LHA + * header). Guards against an oversized entry being routed to a buffered + * downstream extractor. Defaults to 256 MiB. Set to {@code -1} to + * disable. Enforced independently of {@link #maxBytes}. + */ + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + /** * Maximum allowed number of entries to iterate. Defaults to 100,000. * Set to {@code -1} to disable. @@ -120,7 +125,12 @@ public ExtractData getText(final InputStream in, final Map param if (maxEntries > 0 && entryCount > maxEntries) { throw new MaxLengthExceededException("lha entry count exceeded: count=" + entryCount + " max=" + maxEntries); } - contentSize += head.getOriginalSize(); + final long entrySize = head.getOriginalSize(); + if (maxBytesPerEntry > 0 && entrySize > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "lha per-entry size exceeded: name=" + head.getPath() + " size=" + entrySize + " max=" + maxBytesPerEntry); + } + contentSize += entrySize; if (maxBytes > 0 && contentSize > maxBytes) { throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + contentSize + " max=" + maxBytes); } @@ -173,40 +183,6 @@ public ExtractData getText(final InputStream in, final Map param return new ExtractData(buf.toString().trim()); } - /** - * Returns true when the supplied entry name escapes the conceptual - * extraction root via path-traversal segments. - * - * @param name the entry name as reported by the archive - * @return {@code true} if the name should be rejected - */ - protected boolean isPathTraversal(final String name) { - if (name == null || name.isEmpty()) { - return true; - } - if (name.startsWith("/") || name.startsWith("\\")) { - return true; - } - if (name.length() >= 2 && name.charAt(1) == ':') { - return true; - } - try { - final Path normalised = Paths.get(name).normalize(); - final String normStr = normalised.toString().replace('\\', '/'); - if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { - return true; - } - for (final Path part : normalised) { - if ("..".equals(part.toString())) { - return true; - } - } - } catch (final InvalidPathException ipe) { - return true; - } - return false; - } - /** * Sets the maximum content size for extraction. * @@ -224,6 +200,16 @@ public void setMaxBytes(final long maxBytes) { this.maxBytes = maxBytes; } + /** + * Sets the per-entry cap on the original (uncompressed) size reported by + * the LHA header. Set to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + /** * Sets the maximum number of entries that may be iterated. * @param maxEntries the maximum entry count (use {@code -1} to disable) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java index 35f4a8f0..0839b2c1 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java @@ -19,15 +19,11 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.file.InvalidPathException; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Map; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; -import org.apache.commons.io.IOUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.fess.crawler.entity.ExtractData; @@ -69,6 +65,14 @@ public class TarExtractor extends AbstractExtractor { */ protected long maxBytes = 1L << 31; + /** + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * Guards against an oversized entry exhausting the JVM heap when + * buffered into memory. Defaults to 256 MiB. Set to {@code -1} to + * disable. Enforced independently of {@link #maxBytes}. + */ + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + /** * Maximum allowed number of entries to iterate. Defaults to 100,000. * Set to {@code -1} to disable. @@ -134,12 +138,14 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime final long actualBytes; final byte[] entryBytes; try { - final long readLimit; + final long totalReadLimit; if (maxBytes > 0) { - readLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; } else { - readLimit = Long.MAX_VALUE; + totalReadLimit = Long.MAX_VALUE; } + final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long readLimit = Math.min(totalReadLimit, perEntryReadLimit); final ByteArrayOutputStream out = new ByteArrayOutputStream(); actualBytes = copyBounded(ais, out, readLimit); entryBytes = out.toByteArray(); @@ -151,6 +157,11 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime continue; } + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "tar per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); + } + totalBytes += actualBytes; if (maxBytes > 0 && totalBytes > maxBytes) { throw new MaxLengthExceededException("tar uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); @@ -194,54 +205,6 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime return buf.toString().trim(); } - /** - * Returns true when the supplied entry name escapes the conceptual - * extraction root via path-traversal segments. - * - * @param name the entry name as reported by the archive - * @return {@code true} if the name should be rejected - */ - protected boolean isPathTraversal(final String name) { - if (name == null || name.isEmpty()) { - return true; - } - if (name.startsWith("/") || name.startsWith("\\")) { - return true; - } - if (name.length() >= 2 && name.charAt(1) == ':') { - return true; - } - try { - final Path normalised = Paths.get(name).normalize(); - final String normStr = normalised.toString().replace('\\', '/'); - if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { - return true; - } - for (final Path part : normalised) { - if ("..".equals(part.toString())) { - return true; - } - } - } catch (final InvalidPathException ipe) { - return true; - } - return false; - } - - private long copyBounded(final InputStream in, final ByteArrayOutputStream out, final long limit) throws IOException { - if (limit <= 0) { - return 0; - } - final byte[] buffer = new byte[8192]; - long total = 0; - int read; - while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) { - out.write(buffer, 0, read); - total += read; - } - return total; - } - /** * Sets the maximum content size. * @param maxContentSize The maximum content size to set. @@ -258,6 +221,16 @@ public void setMaxBytes(final long maxBytes) { this.maxBytes = maxBytes; } + /** + * Sets the per-entry cap on uncompressed bytes buffered in memory. Set + * to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + /** * Sets the maximum number of entries that may be iterated. * @param maxEntries the maximum entry count (use {@code -1} to disable) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java index ee4a9370..eeff5326 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java @@ -20,14 +20,10 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.file.InvalidPathException; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Map; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; -import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CountingInputStream; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -76,6 +72,15 @@ public class ZipExtractor extends AbstractExtractor { */ protected long maxBytes = 1L << 31; + /** + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * This guards against a legitimate-looking but oversized entry (e.g. a + * 1.9 GiB file inside an otherwise small archive) exhausting the JVM + * heap when buffered into memory. Defaults to 256 MiB. Set to + * {@code -1} to disable. Enforced independently of {@link #maxBytes}. + */ + protected long maxBytesPerEntry = 256L * 1024L * 1024L; + /** * Maximum allowed compression ratio (uncompressed / compressed). Entries * exceeding this ratio AND larger than 1 MiB are rejected as suspected @@ -163,12 +168,19 @@ public ExtractData getText(final InputStream in, final Map param final long actualBytes; final byte[] entryBytes; try { - final long readLimit; + final long totalReadLimit; if (maxBytes > 0) { - readLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; } else { - readLimit = Long.MAX_VALUE; + totalReadLimit = Long.MAX_VALUE; } + // Enforce a per-entry cap independently of the total + // cap so that a single oversized entry cannot exhaust + // the JVM heap. We read one byte beyond the cap so the + // explicit overflow check below can distinguish + // "exactly at the cap" from "exceeds the cap". + final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long readLimit = Math.min(totalReadLimit, perEntryReadLimit); final ByteArrayOutputStream out = new ByteArrayOutputStream(); actualBytes = copyBounded(ais, out, readLimit); entryBytes = out.toByteArray(); @@ -180,6 +192,11 @@ public ExtractData getText(final InputStream in, final Map param continue; } + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "zip per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); + } + totalBytes += actualBytes; if (maxBytes > 0 && totalBytes > maxBytes) { throw new MaxLengthExceededException("zip uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); @@ -241,61 +258,6 @@ public ExtractData getText(final InputStream in, final Map param return new ExtractData(buf.toString().trim()); } - /** - * Returns true when the supplied entry name escapes the conceptual - * extraction root via path-traversal segments. The check is performed on - * a normalised form of the path. - * - * @param name the entry name as reported by the archive - * @return {@code true} if the name should be rejected - */ - protected boolean isPathTraversal(final String name) { - if (name == null || name.isEmpty()) { - return true; - } - // Absolute paths (Unix or Windows-style) are unsafe in the - // context of an archive extracted into a sandbox root. - if (name.startsWith("/") || name.startsWith("\\")) { - return true; - } - if (name.length() >= 2 && name.charAt(1) == ':') { - return true; - } - try { - final Path normalised = Paths.get(name).normalize(); - final String normStr = normalised.toString().replace('\\', '/'); - if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { - return true; - } - for (final Path part : normalised) { - if ("..".equals(part.toString())) { - return true; - } - } - } catch (final InvalidPathException ipe) { - return true; - } - return false; - } - - /** - * Copies up to {@code limit} bytes from {@code in} to {@code out}. Returns - * the actual number of bytes copied. - */ - private long copyBounded(final InputStream in, final ByteArrayOutputStream out, final long limit) throws IOException { - if (limit <= 0) { - return 0; - } - final byte[] buffer = new byte[8192]; - long total = 0; - int read; - while (total < limit && (read = in.read(buffer, 0, (int) Math.min(buffer.length, limit - total))) != IOUtils.EOF) { - out.write(buffer, 0, read); - total += read; - } - return total; - } - /** * Sets the maximum content size. * @param maxContentSize The maximum content size to set. @@ -312,6 +274,16 @@ public void setMaxBytes(final long maxBytes) { this.maxBytes = maxBytes; } + /** + * Sets the per-entry cap on uncompressed bytes buffered in memory. Set + * to {@code -1} to disable. + * + * @param maxBytesPerEntry the per-entry maximum + */ + public void setMaxBytesPerEntry(final long maxBytesPerEntry) { + this.maxBytesPerEntry = maxBytesPerEntry; + } + /** * Sets the maximum permitted uncompressed/compressed ratio per entry. * @param maxCompressionRatio the threshold (use {@code -1} to disable) diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java index 1be37ccd..97184a5f 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java @@ -439,6 +439,59 @@ public void test_tar_recursionDepth_exceeded() throws Exception { // Lha recursion-depth check (uses isPathTraversal helper too) // --------------------------------------------------------------------- + // --------------------------------------------------------------------- + // Per-entry size cap — guards against a single oversized entry + // --------------------------------------------------------------------- + + @Test + public void test_perEntryCapEnforced() throws Exception { + // Build a zip whose single entry holds 300 MiB of (highly + // compressible) zeroes. The extractor must trip the per-entry cap + // before buffering the entire 300 MiB into memory. + final int entrySize = 300 * 1024 * 1024; + final byte[] payload = new byte[entrySize]; + + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("big.bin"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + entry.setCompressedSize(compBuf.size()); + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + // Disable the total-size and ratio checks so only the per-entry cap + // can trigger. Default per-entry cap (256 MiB) must reject the + // 300 MiB entry. + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + zipExtractor.setMaxCompressionRatio(-1); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("per-entry size exceeded")); + } + } + @Test public void test_lha_recursionDepth_exceeded() { final Map params = new HashMap<>(); From 9de64ae94923beca5837c60cf4e5136ae749cbf1 Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Tue, 5 May 2026 19:52:38 +0900 Subject: [PATCH 3/6] fix(extractor): bound LHA input staging and per-entry actual bytes - LhaExtractor now stages the input archive through copyBounded with a configurable maxInputBytes cap (default 1 GiB) so a hostile producer cannot fill local storage before LhaFile is opened. - The per-entry size cap is enforced against bytes actually decompressed (mirroring Zip/Tar) instead of the attacker-controlled LhaHeader#getOriginalSize. Entries are now buffered into a bounded ByteArrayOutputStream and forwarded to downstream extractors via ByteArrayInputStream, dropping the unbounded IgnoreCloseInputStream hand-off. - Down-scale test_perEntryCapEnforced (300 MiB -> 2 MiB payload with a 1 MiB cap) so it stays cheap on parallel/low-memory CI, and add test_lha_maxInputBytes_capsStaging covering the new input staging cap. --- .../crawler/extractor/impl/LhaExtractor.java | 108 +++++++++++++----- .../impl/ArchiveExtractorSecurityTest.java | 31 ++++- 2 files changed, 106 insertions(+), 33 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java index b6668fd3..b28d59df 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java @@ -15,6 +15,8 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; @@ -25,7 +27,6 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.core.io.CloseableUtil; -import org.codelibs.core.io.CopyUtil; import org.codelibs.core.io.FileUtil; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; @@ -34,7 +35,6 @@ import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; -import org.codelibs.fess.crawler.util.IgnoreCloseInputStream; import jp.gr.java_conf.dangan.util.lha.LhaFile; import jp.gr.java_conf.dangan.util.lha.LhaHeader; @@ -65,13 +65,23 @@ public class LhaExtractor extends AbstractExtractor { protected long maxBytes = 1L << 31; /** - * Maximum uncompressed bytes for a SINGLE entry (as reported by the LHA - * header). Guards against an oversized entry being routed to a buffered - * downstream extractor. Defaults to 256 MiB. Set to {@code -1} to - * disable. Enforced independently of {@link #maxBytes}. + * Maximum uncompressed bytes that may be buffered for a SINGLE entry. + * Enforced against the actual bytes read from the entry stream (NOT the + * header-reported size, which is attacker-controlled). Defaults to + * 256 MiB. Set to {@code -1} to disable. Enforced independently of + * {@link #maxBytes}. */ protected long maxBytesPerEntry = 256L * 1024L * 1024L; + /** + * Maximum bytes copied from the input stream to the local temporary file + * before {@link LhaFile} is opened. The LHA library requires a seekable + * file, so the entire archive must be staged on disk; this cap prevents a + * hostile producer from filling local storage. Defaults to 1 GiB. Set to + * {@code -1} to disable. + */ + protected long maxInputBytes = 1L << 30; + /** * Maximum allowed number of entries to iterate. Defaults to 100,000. * Set to {@code -1} to disable. @@ -111,13 +121,20 @@ public ExtractData getText(final InputStream in, final Map param try { tempFile = createTempFile("crawler-", ".lzh", null); try (FileOutputStream fos = new FileOutputStream(tempFile)) { - CopyUtil.copy(in, fos); + // Stage the (untrusted) archive bytes to disk under a hard + // cap so a hostile producer cannot exhaust local storage by + // streaming an arbitrarily large body. + final long inputReadLimit = maxInputBytes > 0 ? maxInputBytes + 1L : Long.MAX_VALUE; + final long staged = copyBounded(in, fos, inputReadLimit); + if (maxInputBytes > 0 && staged > maxInputBytes) { + throw new MaxLengthExceededException("lha input size exceeded: bytes=" + staged + " max=" + maxInputBytes); + } } lhaFile = new LhaFile(tempFile); @SuppressWarnings("unchecked") final Enumeration entries = lhaFile.entries(); - long contentSize = 0; + long totalBytes = 0; int entryCount = 0; while (entries.hasMoreElements()) { final LhaHeader head = entries.nextElement(); @@ -125,33 +142,61 @@ public ExtractData getText(final InputStream in, final Map param if (maxEntries > 0 && entryCount > maxEntries) { throw new MaxLengthExceededException("lha entry count exceeded: count=" + entryCount + " max=" + maxEntries); } - final long entrySize = head.getOriginalSize(); - if (maxBytesPerEntry > 0 && entrySize > maxBytesPerEntry) { - throw new MaxLengthExceededException( - "lha per-entry size exceeded: name=" + head.getPath() + " size=" + entrySize + " max=" + maxBytesPerEntry); - } - contentSize += entrySize; - if (maxBytes > 0 && contentSize > maxBytes) { - throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + contentSize + " max=" + maxBytes); - } - if (maxContentSize != -1 && contentSize > maxContentSize) { - throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize); - } final String filename = head.getPath(); if (isPathTraversal(filename)) { logger.warn("lha entry rejected: name={} reason=path-traversal", filename); continue; } + + // Read the entry payload through copyBounded so the cap is + // enforced against bytes actually decompressed, not the + // header-reported size (which is attacker-controlled). + final long actualBytes; + final byte[] entryBytes; + InputStream is = null; + try { + is = lhaFile.getInputStream(head); + final long totalReadLimit; + if (maxBytes > 0) { + totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + } else { + totalReadLimit = Long.MAX_VALUE; + } + final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long readLimit = Math.min(totalReadLimit, perEntryReadLimit); + final ByteArrayOutputStream out = new ByteArrayOutputStream(); + actualBytes = copyBounded(is, out, readLimit); + entryBytes = out.toByteArray(); + } catch (final IOException ioe) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to read lha entry: name={}", filename, ioe); + } + continue; + } finally { + CloseableUtil.closeQuietly(is); + } + + if (maxBytesPerEntry > 0 && actualBytes > maxBytesPerEntry) { + throw new MaxLengthExceededException( + "lha per-entry size exceeded: name=" + filename + " size=" + actualBytes + " max=" + maxBytesPerEntry); + } + + totalBytes += actualBytes; + if (maxBytes > 0 && totalBytes > maxBytes) { + throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); + } + if (maxContentSize != -1 && totalBytes > maxContentSize) { + throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); + } + final String mimeType = mimeTypeHelper.getContentType(null, filename); if (mimeType != null) { final Extractor extractor = extractorFactory.getExtractor(mimeType); if (extractor != null) { - InputStream is = null; try { - is = lhaFile.getInputStream(head); final Map map = incrementDepth(params); map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent()); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); buf.append('\n'); } catch (final MaxLengthExceededException e) { throw e; @@ -159,8 +204,6 @@ public ExtractData getText(final InputStream in, final Map param if (logger.isDebugEnabled()) { logger.debug("Exception in an internal extractor.", e); } - } finally { - CloseableUtil.closeQuietly(is); } } } @@ -201,8 +244,9 @@ public void setMaxBytes(final long maxBytes) { } /** - * Sets the per-entry cap on the original (uncompressed) size reported by - * the LHA header. Set to {@code -1} to disable. + * Sets the per-entry cap on uncompressed bytes buffered in memory. The + * cap is enforced against bytes actually decompressed (not the + * header-reported size). Set to {@code -1} to disable. * * @param maxBytesPerEntry the per-entry maximum */ @@ -210,6 +254,16 @@ public void setMaxBytesPerEntry(final long maxBytesPerEntry) { this.maxBytesPerEntry = maxBytesPerEntry; } + /** + * Sets the cap on the number of input bytes staged to a temporary file + * before {@link LhaFile} is opened. Set to {@code -1} to disable. + * + * @param maxInputBytes the input-stage maximum + */ + public void setMaxInputBytes(final long maxInputBytes) { + this.maxInputBytes = maxInputBytes; + } + /** * Sets the maximum number of entries that may be iterated. * @param maxEntries the maximum entry count (use {@code -1} to disable) diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java index 97184a5f..6c6ba2b4 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java @@ -445,10 +445,12 @@ public void test_tar_recursionDepth_exceeded() throws Exception { @Test public void test_perEntryCapEnforced() throws Exception { - // Build a zip whose single entry holds 300 MiB of (highly - // compressible) zeroes. The extractor must trip the per-entry cap - // before buffering the entire 300 MiB into memory. - final int entrySize = 300 * 1024 * 1024; + // Build a zip whose single entry exceeds the configured per-entry + // cap. The extractor must trip the cap before buffering the whole + // payload. We use a small cap (1 MiB) and a slightly larger payload + // (2 MiB) so the test stays cheap on parallel/low-memory CI. + final int perEntryCap = 1024 * 1024; + final int entrySize = 2 * perEntryCap; final byte[] payload = new byte[entrySize]; final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); @@ -479,11 +481,11 @@ public void test_perEntryCapEnforced() throws Exception { final byte[] data = baos.toByteArray(); // Disable the total-size and ratio checks so only the per-entry cap - // can trigger. Default per-entry cap (256 MiB) must reject the - // 300 MiB entry. + // can trigger. zipExtractor.setMaxBytes(-1); zipExtractor.setMaxContentSize(-1); zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytesPerEntry(perEntryCap); try (InputStream in = new ByteArrayInputStream(data)) { zipExtractor.getText(in, null); fail(); @@ -507,4 +509,21 @@ public void test_lha_recursionDepth_exceeded() { fail(); } } + + @Test + public void test_lha_maxInputBytes_capsStaging() { + // Stage cap is enforced during the temp-file copy, before LhaFile + // is opened. Any blob larger than the cap must be rejected — we use + // arbitrary bytes since the failure precedes archive parsing. + lhaExtractor.setMaxInputBytes(1024L); + final byte[] payload = new byte[4 * 1024]; + try (InputStream in = new ByteArrayInputStream(payload)) { + lhaExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("input size exceeded")); + } catch (final IOException e) { + fail(); + } + } } From 2a92466827f11cc032d93ea0fbfd5a3f73657717 Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Tue, 5 May 2026 22:44:39 +0900 Subject: [PATCH 4/6] fix(extractor): skip unsupported archive entries; fold maxContentSize into read budget Addresses PR #161 review feedback: 1) Unsupported entries (no registered Extractor) are now skipped without buffering, mirroring the pre-defence behaviour. A large irrelevant entry (e.g. a video alongside a small .txt) no longer consumes the per-entry / total caps that should be reserved for entries the crawler actually wants to extract. 2) The legacy maxContentSize field is folded into the per-entry read budget (alongside maxBytes and maxBytesPerEntry). A small legacy cap (e.g. 10 MiB) now stops the buffer growing past it, instead of being checked only after up to 256 MiB+1 had been buffered. For Zip, the compressed-bytes anchor is also advanced for skipped / rejected entries so the next supported entry's compression-ratio is computed against its own compressed bytes, not also those of the skipped ones. Tests: updates test_perEntryCapEnforced to use a supported (.txt) entry and adds three regressions: - test_zip_unsupportedEntryDoesNotConsumeCaps - test_tar_unsupportedEntryDoesNotConsumeCaps - test_zip_maxContentSize_capsBufferBeforePerEntryCap --- .../crawler/extractor/impl/LhaExtractor.java | 65 ++++++++++----- .../crawler/extractor/impl/TarExtractor.java | 69 ++++++++++------ .../crawler/extractor/impl/ZipExtractor.java | 76 ++++++++++++------ .../impl/ArchiveExtractorSecurityTest.java | 80 +++++++++++++++++-- 4 files changed, 217 insertions(+), 73 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java index b28d59df..7e975e13 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java @@ -55,12 +55,20 @@ public class LhaExtractor extends AbstractExtractor { /** Logger for this class. */ private static final Logger logger = LogManager.getLogger(LhaExtractor.class); - /** Maximum content size for extraction. -1 means no limit. */ + /** + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. + */ protected long maxContentSize = -1; /** * Maximum total uncompressed bytes that may be read from all entries - * combined. Defaults to 2 GiB. Set to {@code -1} to disable. + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering. */ protected long maxBytes = 1L << 31; @@ -69,7 +77,8 @@ public class LhaExtractor extends AbstractExtractor { * Enforced against the actual bytes read from the entry stream (NOT the * header-reported size, which is attacker-controlled). Defaults to * 256 MiB. Set to {@code -1} to disable. Enforced independently of - * {@link #maxBytes}. + * {@link #maxBytes}. Only applies to entries that have a registered + * {@link Extractor}; an unsupported entry is never buffered. */ protected long maxBytesPerEntry = 256L * 1024L * 1024L; @@ -148,6 +157,17 @@ public ExtractData getText(final InputStream in, final Map param continue; } + // Decide MIME / extractor up front so an unsupported entry + // is skipped without opening its decompressor at all. This + // mirrors the legacy behaviour and keeps a large irrelevant + // entry from consuming the per-entry / total caps reserved + // for entries the crawler actually wants to extract. + final String mimeType = mimeTypeHelper.getContentType(null, filename); + final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + continue; + } + // Read the entry payload through copyBounded so the cap is // enforced against bytes actually decompressed, not the // header-reported size (which is attacker-controlled). @@ -162,8 +182,17 @@ public ExtractData getText(final InputStream in, final Map param } else { totalReadLimit = Long.MAX_VALUE; } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. + final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + } else { + contentReadLimit = Long.MAX_VALUE; + } final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; - final long readLimit = Math.min(totalReadLimit, perEntryReadLimit); + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); final ByteArrayOutputStream out = new ByteArrayOutputStream(); actualBytes = copyBounded(is, out, readLimit); entryBytes = out.toByteArray(); @@ -185,26 +214,20 @@ public ExtractData getText(final InputStream in, final Map param if (maxBytes > 0 && totalBytes > maxBytes) { throw new MaxLengthExceededException("lha uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); } - if (maxContentSize != -1 && totalBytes > maxContentSize) { + if (maxContentSize >= 0 && totalBytes > maxContentSize) { throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); } - final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - try { - final Map map = incrementDepth(params); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); - buf.append('\n'); - } catch (final MaxLengthExceededException e) { - throw e; - } catch (final Exception e) { - if (logger.isDebugEnabled()) { - logger.debug("Exception in an internal extractor.", e); - } - } + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + buf.append('\n'); + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("Exception in an internal extractor.", e); } } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java index 0839b2c1..602878e5 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java @@ -54,14 +54,19 @@ public class TarExtractor extends AbstractExtractor { protected ArchiveStreamFactory archiveStreamFactory; /** - * Maximum content size (legacy field). When set, the actual bytes - * read from each entry stream are summed and compared against this limit. + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. */ protected long maxContentSize = -1; /** * Maximum total uncompressed bytes that may be read from all entries - * combined. Defaults to 2 GiB. Set to {@code -1} to disable. + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering. */ protected long maxBytes = 1L << 31; @@ -69,7 +74,9 @@ public class TarExtractor extends AbstractExtractor { * Maximum uncompressed bytes that may be buffered for a SINGLE entry. * Guards against an oversized entry exhausting the JVM heap when * buffered into memory. Defaults to 256 MiB. Set to {@code -1} to - * disable. Enforced independently of {@link #maxBytes}. + * disable. Enforced independently of {@link #maxBytes}. Only applies to + * entries that have a registered {@link Extractor}; an unsupported + * entry is never buffered, so this cap is irrelevant for it. */ protected long maxBytesPerEntry = 256L * 1024L * 1024L; @@ -135,6 +142,17 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime continue; } + // Decide MIME / extractor up front. An unsupported entry + // (e.g. a video alongside a small .txt) is skipped without + // buffering, so a large irrelevant entry does not consume + // the per-entry / total caps that should be reserved for + // entries the crawler actually wants to extract. + final String mimeType = mimeTypeHelper.getContentType(null, filename); + final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + continue; + } + final long actualBytes; final byte[] entryBytes; try { @@ -144,8 +162,17 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime } else { totalReadLimit = Long.MAX_VALUE; } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. + final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + } else { + contentReadLimit = Long.MAX_VALUE; + } final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; - final long readLimit = Math.min(totalReadLimit, perEntryReadLimit); + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); final ByteArrayOutputStream out = new ByteArrayOutputStream(); actualBytes = copyBounded(ais, out, readLimit); entryBytes = out.toByteArray(); @@ -166,28 +193,22 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime if (maxBytes > 0 && totalBytes > maxBytes) { throw new MaxLengthExceededException("tar uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); } - if (maxContentSize != -1 && totalBytes > maxContentSize) { + if (maxContentSize >= 0 && totalBytes > maxContentSize) { throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); } - final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - try { - final Map map = incrementDepth(params); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); - buf.append('\n'); - processedEntries++; - } catch (final MaxLengthExceededException e) { - throw e; - } catch (final Exception e) { - failedEntries++; - if (logger.isDebugEnabled()) { - logger.debug("Failed to extract content from archive entry: name={}", filename, e); - } - } + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + buf.append('\n'); + processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to extract content from archive entry: name={}", filename, e); } } } diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java index eeff5326..bfa83264 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java @@ -61,14 +61,20 @@ public class ZipExtractor extends AbstractExtractor { private static final long COMPRESSION_RATIO_MIN_BYTES = 1L << 20; // 1 MiB /** - * The maximum content size (legacy field). When set, the actual bytes - * read from each entry stream are summed and compared against this limit. + * Legacy total cap on uncompressed bytes actually buffered from + * supported entries. The cap is also folded into the read budget so a + * single oversized entry cannot be buffered up to + * {@link #maxBytesPerEntry} when the user only asked for a much smaller + * total. Set to {@code -1} to disable. */ protected long maxContentSize = -1; /** * Maximum total uncompressed bytes that may be read from all entries - * combined. Defaults to 2 GiB. Set to {@code -1} to disable. + * combined. Defaults to 2 GiB. Set to {@code -1} to disable. Only bytes + * from entries that have a registered {@link Extractor} contribute to + * this total — unsupported entries are skipped without buffering or + * draining, mirroring the pre-defence behaviour. */ protected long maxBytes = 1L << 31; @@ -78,6 +84,8 @@ public class ZipExtractor extends AbstractExtractor { * 1.9 GiB file inside an otherwise small archive) exhausting the JVM * heap when buffered into memory. Defaults to 256 MiB. Set to * {@code -1} to disable. Enforced independently of {@link #maxBytes}. + * Only applies to entries that have a registered {@link Extractor}; an + * unsupported entry is never buffered, so this cap is irrelevant for it. */ protected long maxBytesPerEntry = 256L * 1024L * 1024L; @@ -157,10 +165,28 @@ public ExtractData getText(final InputStream in, final Map param } final String filename = entry.getName(); if (entry.isDirectory()) { + lastCompressedBytes = counter.getByteCount(); continue; } if (isPathTraversal(filename)) { logger.warn("zip entry rejected: name={} reason=path-traversal", filename); + // Keep the compressed-bytes anchor in step with the + // stream so the next supported entry's ratio is + // computed against ITS own compressed bytes, not also + // those of the rejected entry. + lastCompressedBytes = counter.getByteCount(); + continue; + } + + // Decide MIME / extractor up front. An unsupported entry + // (e.g. a video alongside a small .txt) is skipped without + // buffering, so a large irrelevant entry does not consume + // the per-entry / total caps that should be reserved for + // entries the crawler actually wants to extract. + final String mimeType = mimeTypeHelper.getContentType(null, filename); + final Extractor extractor = mimeType != null ? extractorFactory.getExtractor(mimeType) : null; + if (extractor == null) { + lastCompressedBytes = counter.getByteCount(); continue; } @@ -174,13 +200,22 @@ public ExtractData getText(final InputStream in, final Map param } else { totalReadLimit = Long.MAX_VALUE; } + // Fold maxContentSize into the read budget so a small + // legacy cap is honoured before a large per-entry cap + // can buffer hundreds of MiB into memory. + final long contentReadLimit; + if (maxContentSize >= 0) { + contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + } else { + contentReadLimit = Long.MAX_VALUE; + } // Enforce a per-entry cap independently of the total // cap so that a single oversized entry cannot exhaust // the JVM heap. We read one byte beyond the cap so the // explicit overflow check below can distinguish // "exactly at the cap" from "exceeds the cap". final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; - final long readLimit = Math.min(totalReadLimit, perEntryReadLimit); + final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); final ByteArrayOutputStream out = new ByteArrayOutputStream(); actualBytes = copyBounded(ais, out, readLimit); entryBytes = out.toByteArray(); @@ -189,6 +224,7 @@ public ExtractData getText(final InputStream in, final Map param if (logger.isDebugEnabled()) { logger.debug("Failed to read zip entry: name={}", filename, ioe); } + lastCompressedBytes = counter.getByteCount(); continue; } @@ -201,7 +237,7 @@ public ExtractData getText(final InputStream in, final Map param if (maxBytes > 0 && totalBytes > maxBytes) { throw new MaxLengthExceededException("zip uncompressed size exceeded: total=" + totalBytes + " max=" + maxBytes); } - if (maxContentSize != -1 && totalBytes > maxContentSize) { + if (maxContentSize >= 0 && totalBytes > maxContentSize) { throw new MaxLengthExceededException("Extracted size is " + totalBytes + " > " + maxContentSize); } @@ -223,24 +259,18 @@ public ExtractData getText(final InputStream in, final Map param + (actualBytes / compressed) + " max=" + maxCompressionRatio); } - final String mimeType = mimeTypeHelper.getContentType(null, filename); - if (mimeType != null) { - final Extractor extractor = extractorFactory.getExtractor(mimeType); - if (extractor != null) { - try { - final Map map = incrementDepth(params); - map.put(ExtractData.RESOURCE_NAME_KEY, filename); - buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); - buf.append('\n'); - processedEntries++; - } catch (final MaxLengthExceededException e) { - throw e; - } catch (final Exception e) { - failedEntries++; - if (logger.isDebugEnabled()) { - logger.debug("Failed to extract content from archive entry: name={}", filename, e); - } - } + try { + final Map map = incrementDepth(params); + map.put(ExtractData.RESOURCE_NAME_KEY, filename); + buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); + buf.append('\n'); + processedEntries++; + } catch (final MaxLengthExceededException e) { + throw e; + } catch (final Exception e) { + failedEntries++; + if (logger.isDebugEnabled()) { + logger.debug("Failed to extract content from archive entry: name={}", filename, e); } } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java index 6c6ba2b4..39aeefc4 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java @@ -445,10 +445,13 @@ public void test_tar_recursionDepth_exceeded() throws Exception { @Test public void test_perEntryCapEnforced() throws Exception { - // Build a zip whose single entry exceeds the configured per-entry - // cap. The extractor must trip the cap before buffering the whole - // payload. We use a small cap (1 MiB) and a slightly larger payload - // (2 MiB) so the test stays cheap on parallel/low-memory CI. + // Build a zip whose single SUPPORTED entry exceeds the configured + // per-entry cap. The extractor must trip the cap before buffering + // the whole payload. We use a small cap (1 MiB) and a slightly + // larger payload (2 MiB) so the test stays cheap on parallel / + // low-memory CI. The extension is .txt so the entry routes through + // the registered text/plain extractor — only supported entries are + // buffered (and therefore can hit the per-entry memory cap). final int perEntryCap = 1024 * 1024; final int entrySize = 2 * perEntryCap; final byte[] payload = new byte[entrySize]; @@ -468,7 +471,7 @@ public void test_perEntryCapEnforced() throws Exception { final ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { - final ZipArchiveEntry entry = new ZipArchiveEntry("big.bin"); + final ZipArchiveEntry entry = new ZipArchiveEntry("big.txt"); entry.setMethod(ZipArchiveEntry.DEFLATED); entry.setSize(payload.length); entry.setCompressedSize(compBuf.size()); @@ -494,6 +497,73 @@ public void test_perEntryCapEnforced() throws Exception { } } + // --------------------------------------------------------------------- + // Unsupported entries must NOT consume the per-entry / total caps — + // they are skipped without buffering so that supported entries + // alongside them still extract successfully (regression for PR #161 + // review feedback). + // --------------------------------------------------------------------- + + @Test + public void test_zip_unsupportedEntryDoesNotConsumeCaps() throws Exception { + // A "big.bin" payload that, were it to be buffered, would exceed + // both the per-entry cap and the total cap. The supported "ok.txt" + // alongside it must still extract because no extractor is + // registered for application/octet-stream. + final byte[] big = new byte[4 * 1024 * 1024]; + final byte[] data = buildZip(new EntrySpec("big.bin", big), new EntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + zipExtractor.setMaxBytes(64 * 1024); // smaller than big.bin + zipExtractor.setMaxBytesPerEntry(64 * 1024); // also smaller + zipExtractor.setMaxContentSize(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("good")); + } + } + + @Test + public void test_tar_unsupportedEntryDoesNotConsumeCaps() throws Exception { + final byte[] big = new byte[4 * 1024 * 1024]; + final byte[] data = buildTar(new TarEntrySpec("big.bin", big), new TarEntrySpec("ok.txt", "good".getBytes(StandardCharsets.UTF_8))); + + tarExtractor.setMaxBytes(64 * 1024); + tarExtractor.setMaxBytesPerEntry(64 * 1024); + tarExtractor.setMaxContentSize(64 * 1024); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("good")); + } + } + + // --------------------------------------------------------------------- + // maxContentSize is folded into the read budget — a small legacy cap + // must trip BEFORE the buffer grows to the much larger per-entry cap + // (regression for PR #161 review feedback). + // --------------------------------------------------------------------- + + @Test + public void test_zip_maxContentSize_capsBufferBeforePerEntryCap() throws Exception { + // 4 MiB supported entry; per-entry cap default is large; legacy + // maxContentSize is small. Without the fix the buffer would grow + // up to maxBytesPerEntry+1 before throwing. With the fix the read + // budget is bounded by maxContentSize+1 so buffering stops early. + final int legacyCap = 64 * 1024; + final byte[] payload = new byte[4 * 1024 * 1024]; + final byte[] data = buildZip(new EntrySpec("big.txt", payload)); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytesPerEntry(8L * 1024L * 1024L); // intentionally larger than payload + zipExtractor.setMaxContentSize(legacyCap); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("Extracted size is")); + } + } + @Test public void test_lha_recursionDepth_exceeded() { final Map params = new HashMap<>(); From 7f5b2438bc7feba13f92ca49436455cd96af28c3 Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Sat, 16 May 2026 12:30:58 +0900 Subject: [PATCH 5/6] fix(extractor): harden archive defenses from review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses correctness, security, and observability issues raised in the review of the archive-bomb / Zip-Slip / recursion-depth defenses. Correctness - AbstractExtractor: new addOneSaturating(long) clamps the read-limit "+1" margin at Long.MAX_VALUE so an admin who sets a cap near Long.MAX_VALUE no longer wraps to Long.MIN_VALUE and silently buffers zero bytes. All "+1L" sites in Zip/Tar/Lha now go through it. - AbstractExtractor.copyBounded: a negative limit now throws IllegalArgumentException instead of silently returning 0, so a misconfiguration upstream surfaces immediately. - AbstractExtractor.isPathTraversal: normalise backslash to forward slash BEFORE calling Paths.get so single-segment names like "a\\.." are recognised as traversal on Linux/macOS JVMs (where backslash is a literal filename character). Security - ZipExtractor: tighten the early signature check to PK\x03\x04 only; short-circuit valid empty archives that start with PK\x05\x06; reject PK\x07\x08 (data-descriptor) and other non-opening signatures. - ZipExtractor: flip allowStoredEntriesWithDataDescriptor back to false (the pre-PR factory default) to avoid widening the attack surface. - ZipExtractor: compression-ratio check now uses min(header_compressed_size, measured_compressed_size) so a header that lies about the compressed size cannot suppress the check. - TarExtractor: skip PAX extended/global header pseudo-entries before incrementing entryCount so they do not consume the maxEntries budget. - TarExtractor: symbolic/hard-link skip log upgraded from DEBUG to WARN for parity with path-traversal rejection. Observability - ZipExtractor / TarExtractor / LhaExtractor: emit a summary WARN when the entry loop completes normally with failedEntries > 0. Operators no longer have to enable DEBUG to notice partial extraction. - LhaExtractor: widen the per-entry catch to (IOException | RuntimeException) and guard against a null InputStream from LhaFile#getInputStream — the dangan library does not declare checked exceptions for corrupt/unknown compression methods and can return null when a header is missing. Failed entries are now logged at WARN with the filename and counted in failedEntries. - LhaExtractor: replace the empty catch on LhaFile#close() with a WARN log so a failing close (e.g. EBADF, disk full) is no longer silently hidden. - LhaExtractor: use validateInputStream(in) instead of the bespoke null check so the message and exception type match Zip/Tar. Tests - AbstractExtractorTest: 9 new isPathTraversal cases (null, empty, drive letter, leading slash, leading backslash, lone "..", classic traversal, safe relative, NUL char) plus the new single-segment backslash regression that exercises the Paths.get fix; 2 new addOneSaturating cases. - ArchiveExtractorSecurityTest: 15 new tests covering the ZIP signature variants, the saturating-add at Long.MAX_VALUE, the min(header,measured) ratio path, nested-archive recursion-depth increment, per-entry/total cap matrix with each cap disabled, the CP932 test via Assumptions.assumeTrue, a tightened ratio assertion, the Tar PAX-header skip, the Tar per-entry cap, the Tar symlink WARN behaviour, and the setMaxArchiveDepth setter. All 83 tests in the archive-extractor test classes pass. The single flaky failure in the wider suite (Hc5HttpClientTest#test_doHead_ accessTimeoutTarget) is pre-existing and unrelated. --- .../extractor/impl/AbstractExtractor.java | 39 +- .../crawler/extractor/impl/LhaExtractor.java | 47 ++- .../crawler/extractor/impl/TarExtractor.java | 21 +- .../crawler/extractor/impl/ZipExtractor.java | 64 ++- .../extractor/impl/AbstractExtractorTest.java | 109 ++++++ .../impl/ArchiveExtractorSecurityTest.java | 367 ++++++++++++++++++ 6 files changed, 603 insertions(+), 44 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java index 7a2eb122..19c71822 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractor.java @@ -263,8 +263,18 @@ protected static boolean isPathTraversal(final String name) { if (name.length() >= 2 && name.charAt(1) == ':') { return true; } + // Normalise backslashes to forward slashes BEFORE calling Paths.get(). + // On Linux (and macOS) a backslash is a literal filename character, so + // Paths.get("a\\..") treats "a\\.." as a SINGLE opaque segment and + // normalize() leaves it unchanged — bypassing the ".." segment check. + // Unifying to "/" first forces the path parser to recognise each + // component correctly on all platforms. + final String unified = name.replace('\\', '/'); + if (unified.startsWith("/")) { + return true; + } try { - final Path normalised = Paths.get(name).normalize(); + final Path normalised = Paths.get(unified).normalize(); final String normStr = normalised.toString().replace('\\', '/'); if (normStr.equals("..") || normStr.startsWith("../") || normStr.contains("/../")) { return true; @@ -280,6 +290,21 @@ protected static boolean isPathTraversal(final String name) { return false; } + /** + * Saturating add: returns {@code value + 1} unless that would overflow + * {@code Long.MAX_VALUE}, in which case {@code Long.MAX_VALUE} is returned. + * Used when computing a read limit that is one byte beyond a cap so that + * the caller can detect "exactly at the cap" vs "exceeds the cap" without + * silently wrapping to a negative limit and reading nothing. + * + * @param value the value to increment (must be non-negative) + * @return {@code value + 1} or {@code Long.MAX_VALUE} if already at the + * maximum + */ + protected static long addOneSaturating(final long value) { + return value >= Long.MAX_VALUE ? Long.MAX_VALUE : value + 1L; + } + /** * Copies up to {@code limit} bytes from {@code in} to {@code out}, returning * the actual number of bytes copied. Used by archive extractors to bound @@ -288,14 +313,20 @@ protected static boolean isPathTraversal(final String name) { * * @param in the source stream * @param out the sink stream - * @param limit the maximum number of bytes to copy (inclusive). Values - * {@code <= 0} cause the method to return without reading. + * @param limit the maximum number of bytes to copy (inclusive). Must be + * non-negative; a negative value throws + * {@link IllegalArgumentException} so misconfiguration is + * surfaced immediately rather than silently reading nothing. * @return the number of bytes actually copied + * @throws IllegalArgumentException if {@code limit} is negative * @throws IOException if reading from {@code in} or writing to {@code out} * fails */ protected static long copyBounded(final InputStream in, final OutputStream out, final long limit) throws IOException { - if (limit <= 0) { + if (limit < 0) { + throw new IllegalArgumentException("copyBounded: limit must be non-negative, got " + limit); + } + if (limit == 0) { return 0; } final byte[] buffer = new byte[8192]; diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java index 7e975e13..9e1b1585 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java @@ -29,7 +29,6 @@ import org.codelibs.core.io.CloseableUtil; import org.codelibs.core.io.FileUtil; import org.codelibs.fess.crawler.entity.ExtractData; -import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.ExtractException; import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.Extractor; @@ -116,14 +115,14 @@ public LhaExtractor() { */ @Override public ExtractData getText(final InputStream in, final Map params) { - if (in == null) { - throw new CrawlerSystemException("LHA archive input stream is null. Cannot extract text from null input."); - } + validateInputStream(in); checkDepth(params, maxArchiveDepth); final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); final StringBuilder buf = new StringBuilder(1000); + int processedEntries = 0; + int failedEntries = 0; File tempFile = null; LhaFile lhaFile = null; @@ -133,7 +132,7 @@ public ExtractData getText(final InputStream in, final Map param // Stage the (untrusted) archive bytes to disk under a hard // cap so a hostile producer cannot exhaust local storage by // streaming an arbitrarily large body. - final long inputReadLimit = maxInputBytes > 0 ? maxInputBytes + 1L : Long.MAX_VALUE; + final long inputReadLimit = maxInputBytes > 0 ? addOneSaturating(maxInputBytes) : Long.MAX_VALUE; final long staged = copyBounded(in, fos, inputReadLimit); if (maxInputBytes > 0 && staged > maxInputBytes) { throw new MaxLengthExceededException("lha input size exceeded: bytes=" + staged + " max=" + maxInputBytes); @@ -175,10 +174,20 @@ public ExtractData getText(final InputStream in, final Map param final byte[] entryBytes; InputStream is = null; try { + // getInputStream(LhaHeader) can return null when the header + // is not found in the archive index, and can throw + // RuntimeException (e.g. IllegalArgumentException from + // CompressMethod.getCore) for unknown/corrupt compression + // methods — both must be handled before touching `is`. is = lhaFile.getInputStream(head); + if (is == null) { + logger.warn("lha entry stream is null: name={}", filename); + failedEntries++; + continue; + } final long totalReadLimit; if (maxBytes > 0) { - totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes)); } else { totalReadLimit = Long.MAX_VALUE; } @@ -187,19 +196,23 @@ public ExtractData getText(final InputStream in, final Map param // can buffer hundreds of MiB into memory. final long contentReadLimit; if (maxContentSize >= 0) { - contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes)); } else { contentReadLimit = Long.MAX_VALUE; } - final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long perEntryReadLimit = maxBytesPerEntry > 0 ? addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE; final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); final ByteArrayOutputStream out = new ByteArrayOutputStream(); actualBytes = copyBounded(is, out, readLimit); entryBytes = out.toByteArray(); - } catch (final IOException ioe) { - if (logger.isDebugEnabled()) { - logger.debug("Failed to read lha entry: name={}", filename, ioe); - } + } catch (final IOException | RuntimeException ioe) { + // IOException: normal read failure. + // RuntimeException: includes IllegalArgumentException thrown + // by CompressMethod.getCore() for corrupt/unknown compression + // methods in the dangan LHA library (getInputStream does not + // declare checked exceptions for these cases). + logger.warn("Failed to read lha entry: name={}", filename, ioe); + failedEntries++; continue; } finally { CloseableUtil.closeQuietly(is); @@ -223,14 +236,20 @@ public ExtractData getText(final InputStream in, final Map param map.put(ExtractData.RESOURCE_NAME_KEY, filename); buf.append(extractor.getText(new ByteArrayInputStream(entryBytes), map).getContent()); buf.append('\n'); + processedEntries++; } catch (final MaxLengthExceededException e) { throw e; } catch (final Exception e) { + failedEntries++; if (logger.isDebugEnabled()) { - logger.debug("Exception in an internal extractor.", e); + logger.debug("Exception in an internal extractor: name={}", filename, e); } } } + // Summary warn when the loop completed normally but some entries failed. + if (failedEntries > 0) { + logger.warn("LHA archive partially processed: processed={} failed={}", processedEntries, failedEntries); + } } catch (final MaxLengthExceededException e) { throw e; } catch (final Exception e) { @@ -240,7 +259,7 @@ public ExtractData getText(final InputStream in, final Map param try { lhaFile.close(); } catch (final IOException e) { - // ignore + logger.warn("Failed to close LHA file. tempFile={}", tempFile, e); } } FileUtil.deleteInBackground(tempFile); diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java index 602878e5..0bebefc7 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/TarExtractor.java @@ -123,6 +123,13 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime long totalBytes = 0; int entryCount = 0; while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) { + // PAX extended / global headers are metadata-only pseudo-entries + // that must NOT count toward maxEntries. Commons Compress + // usually absorbs them, but skip defensively for non-standard + // tar dialects that surface them through getNextEntry(). + if (entry.isPaxHeader() || entry.isGlobalPaxHeader()) { + continue; + } entryCount++; if (maxEntries > 0 && entryCount > maxEntries) { throw new MaxLengthExceededException("tar entry count exceeded: count=" + entryCount + " max=" + maxEntries); @@ -132,9 +139,7 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime continue; } if (entry.isSymbolicLink() || entry.isLink()) { - if (logger.isDebugEnabled()) { - logger.debug("tar entry skipped: name={} reason=link link={}", filename, entry.getLinkName()); - } + logger.warn("tar entry skipped: name={} reason=link link={}", filename, entry.getLinkName()); continue; } if (isPathTraversal(filename)) { @@ -158,7 +163,7 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime try { final long totalReadLimit; if (maxBytes > 0) { - totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes)); } else { totalReadLimit = Long.MAX_VALUE; } @@ -167,11 +172,11 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime // can buffer hundreds of MiB into memory. final long contentReadLimit; if (maxContentSize >= 0) { - contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes)); } else { contentReadLimit = Long.MAX_VALUE; } - final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long perEntryReadLimit = maxBytesPerEntry > 0 ? addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE; final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); final ByteArrayOutputStream out = new ByteArrayOutputStream(); actualBytes = copyBounded(ais, out, readLimit); @@ -212,6 +217,10 @@ protected String getTextInternal(final InputStream in, final MimeTypeHelper mime } } } + // Summary warn when the loop completed normally but some entries failed. + if (failedEntries > 0) { + logger.warn("TAR archive partially processed: processed={} failed={}", processedEntries, failedEntries); + } } catch (final MaxLengthExceededException e) { throw e; } catch (final Exception e) { diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java index bfa83264..ad2f8714 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/ZipExtractor.java @@ -142,7 +142,19 @@ public ExtractData getText(final InputStream in, final Map param read += n; } wrapped.reset(); - if (read != 4 || sig[0] != 'P' || sig[1] != 'K' || (sig[2] != 0x03 && sig[2] != 0x05 && sig[2] != 0x07)) { + // ZIP local-file-header: PK\x03\x04 + // ZIP empty-archive EOCD: PK\x05\x06 + // PK\x07\x08 is a data-descriptor signature and must NEVER appear + // at the very start of a valid ZIP; reject it along with anything + // else that is not a recognised opening signature. + if (read < 4 || sig[0] != 'P' || sig[1] != 'K') { + throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature."); + } + if (sig[2] == 0x05 && sig[3] == 0x06) { + // Valid but empty archive — short-circuit immediately. + return new ExtractData(""); + } + if (sig[2] != 0x03 || sig[3] != 0x04) { throw new ExtractException("Failed to extract content from ZIP archive. Not a recognised ZIP signature."); } } catch (final IOException ioe) { @@ -153,7 +165,10 @@ public ExtractData getText(final InputStream in, final Map param // signal in streaming mode (ZipArchiveEntry#getCompressedSize() is // often -1 when entries use a data descriptor). final CountingInputStream counter = new CountingInputStream(wrapped); - try (final ZipArchiveInputStream ais = new ZipArchiveInputStream(counter, filenameEncoding, true, true)) { + // allowStoredEntriesWithDataDescriptor=false (the default) to avoid + // widening the attack surface; the pre-PR factory default was also + // false and no test requires the looser mode. + try (final ZipArchiveInputStream ais = new ZipArchiveInputStream(counter, filenameEncoding, true, false)) { ZipArchiveEntry entry; long totalBytes = 0; long lastCompressedBytes = counter.getByteCount(); @@ -196,7 +211,7 @@ public ExtractData getText(final InputStream in, final Map param try { final long totalReadLimit; if (maxBytes > 0) { - totalReadLimit = Math.max(0L, maxBytes - totalBytes) + 1L; + totalReadLimit = addOneSaturating(Math.max(0L, maxBytes - totalBytes)); } else { totalReadLimit = Long.MAX_VALUE; } @@ -205,7 +220,7 @@ public ExtractData getText(final InputStream in, final Map param // can buffer hundreds of MiB into memory. final long contentReadLimit; if (maxContentSize >= 0) { - contentReadLimit = Math.max(0L, maxContentSize - totalBytes) + 1L; + contentReadLimit = addOneSaturating(Math.max(0L, maxContentSize - totalBytes)); } else { contentReadLimit = Long.MAX_VALUE; } @@ -214,7 +229,7 @@ public ExtractData getText(final InputStream in, final Map param // the JVM heap. We read one byte beyond the cap so the // explicit overflow check below can distinguish // "exactly at the cap" from "exceeds the cap". - final long perEntryReadLimit = maxBytesPerEntry > 0 ? maxBytesPerEntry + 1L : Long.MAX_VALUE; + final long perEntryReadLimit = maxBytesPerEntry > 0 ? addOneSaturating(maxBytesPerEntry) : Long.MAX_VALUE; final long readLimit = Math.min(Math.min(totalReadLimit, contentReadLimit), perEntryReadLimit); final ByteArrayOutputStream out = new ByteArrayOutputStream(); actualBytes = copyBounded(ais, out, readLimit); @@ -242,21 +257,26 @@ public ExtractData getText(final InputStream in, final Map param } // Compression-ratio check (only meaningful for non-tiny entries). - // Prefer the entry header's compressed size when present; - // otherwise fall back to the bytes actually consumed from the - // underlying stream during this entry's read. - long compressed = entry.getCompressedSize(); - if (compressed <= 0) { - final long now = counter.getByteCount(); - compressed = Math.max(0L, now - lastCompressedBytes); - lastCompressedBytes = now; - } else { - lastCompressedBytes = counter.getByteCount(); - } - if (maxCompressionRatio > 0 && compressed > 0 && actualBytes > COMPRESSION_RATIO_MIN_BYTES - && actualBytes / compressed > maxCompressionRatio) { - throw new MaxLengthExceededException("zip compression ratio exceeded: name=" + filename + " ratio=" - + (actualBytes / compressed) + " max=" + maxCompressionRatio); + // Always measure the bytes actually consumed from the underlying + // stream during this entry's read (this is the only reliable + // signal in streaming mode where data descriptors are used). + // When the header also reports a positive compressed size, take + // the minimum of both so an attacker who inflates the header + // value cannot suppress the ratio check. + final long now = counter.getByteCount(); + final long measuredCompressed = Math.max(0L, now - lastCompressedBytes); + lastCompressedBytes = now; + final long headerCompressed = entry.getCompressedSize(); + final long compressed = (headerCompressed > 0) ? Math.min(headerCompressed, measuredCompressed) : measuredCompressed; + if (maxCompressionRatio > 0 && actualBytes > COMPRESSION_RATIO_MIN_BYTES) { + if (compressed == 0) { + // Zero compressed bytes with substantial output is suspicious; + // log it but let the per-entry cap enforce the real limit. + logger.warn("zip entry has 0 compressed bytes with {} uncompressed bytes: name={}", actualBytes, filename); + } else if (actualBytes / compressed > maxCompressionRatio) { + throw new MaxLengthExceededException("zip compression ratio exceeded: name=" + filename + " ratio=" + + (actualBytes / compressed) + " max=" + maxCompressionRatio); + } } try { @@ -274,6 +294,10 @@ public ExtractData getText(final InputStream in, final Map param } } } + // Summary warn when the loop completed normally but some entries failed. + if (failedEntries > 0) { + logger.warn("ZIP archive partially processed: processed={} failed={}", processedEntries, failedEntries); + } } catch (final MaxLengthExceededException e) { throw e; } catch (final Exception e) { diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java index 1a51483f..4054092b 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/AbstractExtractorTest.java @@ -79,6 +79,15 @@ public Map testIncrementDepth(final Map params) public void testCheckDepth(final Map params, final int maxDepth) { checkDepth(params, maxDepth); } + + // Expose static helpers for testing. + public boolean testIsPathTraversal(final String name) { + return isPathTraversal(name); + } + + public long testAddOneSaturating(final long value) { + return addOneSaturating(value); + } } private TestExtractor extractor; @@ -339,4 +348,104 @@ public void test_checkDepth_atOrAboveLimit_throws() { // pass } } + + // ----------------------------------------------------------------------- + // isPathTraversal tests (C3 fix validation) + // ----------------------------------------------------------------------- + + /** null and empty are always traversals. */ + @Test + public void test_isPathTraversal_nullAndEmpty() { + assertTrue(extractor.testIsPathTraversal(null)); + assertTrue(extractor.testIsPathTraversal("")); + } + + /** Drive letter prefix is always rejected. */ + @Test + public void test_isPathTraversal_driveLetter() { + assertTrue(extractor.testIsPathTraversal("C:\\foo")); + assertTrue(extractor.testIsPathTraversal("C:foo")); + } + + /** Leading slash (Unix absolute) is rejected. */ + @Test + public void test_isPathTraversal_leadingSlash() { + assertTrue(extractor.testIsPathTraversal("/etc/passwd")); + } + + /** Leading backslash is rejected. */ + @Test + public void test_isPathTraversal_leadingBackslash() { + assertTrue(extractor.testIsPathTraversal("\\foo\\bar")); + } + + /** Lone ".." is rejected. */ + @Test + public void test_isPathTraversal_loneDotDot() { + assertTrue(extractor.testIsPathTraversal("..")); + } + + /** Classic traversal sequences are rejected. */ + @Test + public void test_isPathTraversal_classicTraversal() { + assertTrue(extractor.testIsPathTraversal("../../etc/passwd")); + assertTrue(extractor.testIsPathTraversal("foo/../../etc/passwd")); + } + + /** Safe name that resolves inside the root is allowed. */ + @Test + public void test_isPathTraversal_safeRelativePath() { + assertFalse(extractor.testIsPathTraversal("foo/bar.txt")); + assertFalse(extractor.testIsPathTraversal("foo/../bar.txt")); // normalises to bar.txt + } + + /** + * Single-segment backslash traversal (C3 regression). + * On Linux the path "a\..\..\etc" is a single opaque filename when + * Paths.get() is called without pre-normalisation, so ".." segments are + * not detected. After unifying backslash to forward-slash before + * Paths.get(), "a/../../etc" normalises to "../etc" which starts with + * ".." and is correctly rejected. + * Note: "a\.." unifies to "a/.." which normalises to the empty path + * (current dir, i.e. the archive root) — that is safe and is NOT rejected. + */ + @Test + public void test_isPathTraversal_backslashSingleSegment() { + // "a\..\..\etc" must be caught — escapes the archive root. + assertTrue(extractor.testIsPathTraversal("a\\..\\..\\etc")); + // Three levels up — definitely escapes. + assertTrue(extractor.testIsPathTraversal("a\\..\\..\\..")); // escapes + // "a\.." normalises to the archive root (current dir) — safe. + assertFalse(extractor.testIsPathTraversal("a\\..")); + // A purely safe backslash path: "foo\\bar.txt" → "foo/bar.txt" — safe. + assertFalse(extractor.testIsPathTraversal("foo\\bar.txt")); + } + + /** NUL character in path — should be rejected (InvalidPathException path). */ + @Test + public void test_isPathTraversal_nulCharacter() { + assertTrue(extractor.testIsPathTraversal("a\0b")); + } + + // ----------------------------------------------------------------------- + // addOneSaturating (C2 fix validation) + // ----------------------------------------------------------------------- + + /** addOneSaturating returns value+1 for normal inputs. */ + @Test + public void test_addOneSaturating_normalIncrement() { + assertEquals(1L, extractor.testAddOneSaturating(0L)); + assertEquals(101L, extractor.testAddOneSaturating(100L)); + assertEquals(Long.MAX_VALUE - 1L, extractor.testAddOneSaturating(Long.MAX_VALUE - 2L)); + } + + /** addOneSaturating returns Long.MAX_VALUE when already at max. */ + @Test + public void test_addOneSaturating_saturatesAtMax() { + assertEquals(Long.MAX_VALUE, extractor.testAddOneSaturating(Long.MAX_VALUE)); + // (MAX-1)+1 = MAX naturally — not overflow. + assertEquals(Long.MAX_VALUE, extractor.testAddOneSaturating(Long.MAX_VALUE - 1L)); + // Verify the result is positive (not wrapped to negative). + assertTrue(extractor.testAddOneSaturating(Long.MAX_VALUE) > 0); + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java index 39aeefc4..0bab94c4 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/ArchiveExtractorSecurityTest.java @@ -31,6 +31,7 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.exception.ExtractException; import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; @@ -596,4 +597,370 @@ public void test_lha_maxInputBytes_capsStaging() { fail(); } } + + // --------------------------------------------------------------------- + // ZIP signature checks (M1) + // --------------------------------------------------------------------- + + @Test + public void test_zip_signatureCheck_rejectsDataDescriptorPrefix() throws Exception { + // PK\x07\x08 is a data-descriptor signature and must not appear at + // the start of a valid ZIP; reject it as ExtractException. + final byte[] data = new byte[] { 'P', 'K', 0x07, 0x08, 0, 0, 0, 0 }; + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final ExtractException e) { + assertTrue(e.getMessage().contains("ZIP")); + } + } + + @Test + public void test_zip_signatureCheck_acceptsEmptyArchive() throws Exception { + // PK\x05\x06 is a valid empty-archive EOCD; extractor must return + // empty content rather than throwing. + final byte[] eocd = new byte[22]; + eocd[0] = 'P'; + eocd[1] = 'K'; + eocd[2] = 0x05; + eocd[3] = 0x06; + // Remaining 18 bytes stay 0 (valid minimal EOCD). + try (InputStream in = new ByteArrayInputStream(eocd)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertEquals("", content); + } + } + + @Test + public void test_zip_signatureCheck_rejectsTruncatedStream() throws Exception { + // 2 bytes — not enough for a valid ZIP magic → ExtractException. + final byte[] data = new byte[] { 'P', 'K' }; + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final ExtractException e) { + assertTrue(e.getMessage().contains("ZIP")); + } + } + + @Test + public void test_zip_signatureCheck_rejectsNonZip() throws Exception { + // Completely wrong magic. + final byte[] data = "not a zip file at all".getBytes(StandardCharsets.UTF_8); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final ExtractException e) { + assertTrue(e.getMessage().contains("ZIP")); + } + } + + // --------------------------------------------------------------------- + // Overflow: saturating +1L at Long.MAX_VALUE (C2) + // --------------------------------------------------------------------- + + @Test + public void test_overflow_saturatingAdd_atLongMaxValue() throws Exception { + // With maxBytes=Long.MAX_VALUE a small archive must succeed, not + // silently read 0 bytes due to Long overflow wrapping to negative. + final byte[] payload = "hello world".getBytes(StandardCharsets.UTF_8); + final byte[] data = buildZip(new EntrySpec("ok.txt", payload)); + + zipExtractor.setMaxBytes(Long.MAX_VALUE); + zipExtractor.setMaxBytesPerEntry(Long.MAX_VALUE); + zipExtractor.setMaxContentSize(-1); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("hello world")); + } + } + + // --------------------------------------------------------------------- + // Compression ratio — min(header, measured) (M3) + // --------------------------------------------------------------------- + + @Test + public void test_zip_compressionRatio_usesMinOfHeaderAndMeasured() throws Exception { + // Build a zip where the entry header reports a huge compressedSize. + // The ratio check must use the minimum of header vs. measured bytes + // so a lying header cannot suppress the check. We build a highly + // compressible 2 MiB entry; the measured compressed bytes will be + // small, making the ratio >> 100 regardless of the header claim. + final byte[] payload = new byte[2 * 1024 * 1024]; // all zeros + + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + // Set a deliberately inflated compressedSize in the header so + // that ratio = uncompressed / fakeCompressed would be < threshold. + // If the code uses min(header, measured) the measured value wins + // and ratio >> threshold => MaxLengthExceededException fires. + entry.setCompressedSize(payload.length); // 1:1 — no bomb per header + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + // Ratio threshold: 100 — actual ratio >> 100 using measured bytes. + zipExtractor.setMaxCompressionRatio(100L); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("compression ratio")); + } + } + + // --------------------------------------------------------------------- + // Nested recursion depth (zip-in-zip) (test 6) + // --------------------------------------------------------------------- + + @Test + public void test_zip_nestedRecursionCountsDepth() throws Exception { + // Build inner zip containing a text file. + final byte[] innerPayload = buildZip(new EntrySpec("inner.txt", "hello".getBytes(StandardCharsets.UTF_8))); + // Build outer zip containing the inner zip. + final byte[] data = buildZip(new EntrySpec("nested.zip", innerPayload)); + + // Allow depth=1 only — outer zip processes ok (depth 0→1), + // inner zip invocation is at depth=1 which == maxArchiveDepth → throws. + zipExtractor.setMaxArchiveDepth(1); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + } + + // --------------------------------------------------------------------- + // Per-entry cap fires when total cap disabled (test 7) + // --------------------------------------------------------------------- + + @Test + public void test_zip_perEntryCap_whenMaxBytesDisabled() throws Exception { + final byte[] payload = new byte[2 * 1024 * 1024]; // 2 MiB + final byte[] data = buildZip(new EntrySpec("big.txt", payload)); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytesPerEntry(1024 * 1024); // 1 MiB cap + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("per-entry size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Total cap fires when per-entry cap disabled (test 8) + // --------------------------------------------------------------------- + + @Test + public void test_zip_maxBytes_whenPerEntryDisabled() throws Exception { + final byte[] payload = new byte[64 * 1024]; // 64 KiB each + final byte[] data = buildZip(new EntrySpec("a.txt", payload), new EntrySpec("b.txt", payload)); + + zipExtractor.setMaxBytesPerEntry(-1); // disable per-entry + zipExtractor.setMaxCompressionRatio(-1); + zipExtractor.setMaxBytes(64 * 1024); // exactly one entry → second exceeds + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("zip uncompressed size exceeded")); + } + } + + // --------------------------------------------------------------------- + // CP932 filename (m2: use Assumptions instead of silent return) + // --------------------------------------------------------------------- + + @Test + public void test_cp932Filename_withAssumption() throws Exception { + // If MS932 is unavailable, skip with Assumptions rather than a + // silent return, so the test result is clearly SKIPPED not PASSED. + org.junit.jupiter.api.Assumptions.assumeTrue(Charset.isSupported("MS932"), "MS932 charset not available on this JVM"); + + final Charset cp932 = Charset.forName("MS932"); + final byte[] data = buildZipWithCharset(cp932, new EntrySpec("テスト.txt", "japan".getBytes(StandardCharsets.UTF_8))); + + zipExtractor.setFilenameEncoding("MS932"); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, null).getContent(); + assertTrue(content.contains("japan")); + } + } + + // --------------------------------------------------------------------- + // Compression-ratio message tightened (m3) + // --------------------------------------------------------------------- + + @Test + public void test_compressionRatioExceeded_messageContainsRatio() throws Exception { + // Same high-ratio archive as the existing test; assert the message + // specifically contains "compression ratio" (not just "uncompressed + // size"), because maxBytes=-1 means the total cap is disabled. + final byte[] payload = new byte[2 * 1024 * 1024]; + final java.util.zip.Deflater def = new java.util.zip.Deflater(java.util.zip.Deflater.BEST_COMPRESSION); + def.setInput(payload); + def.finish(); + final ByteArrayOutputStream compBuf = new ByteArrayOutputStream(); + final byte[] tmpBuf = new byte[8192]; + while (!def.finished()) { + final int n = def.deflate(tmpBuf); + compBuf.write(tmpBuf, 0, n); + } + def.end(); + final java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(payload); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(baos)) { + final ZipArchiveEntry entry = new ZipArchiveEntry("zeros.txt"); + entry.setMethod(ZipArchiveEntry.DEFLATED); + entry.setSize(payload.length); + entry.setCompressedSize(compBuf.size()); + entry.setCrc(crc.getValue()); + zos.putArchiveEntry(entry); + zos.write(payload); + zos.closeArchiveEntry(); + zos.finish(); + } + final byte[] data = baos.toByteArray(); + + zipExtractor.setMaxBytes(-1); + zipExtractor.setMaxContentSize(-1); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + // With maxBytes=-1, only the ratio check can fire. + assertTrue(e.getMessage().contains("compression ratio")); + } + } + + // --------------------------------------------------------------------- + // Tar PAX global header does not consume entry count (m4) + // --------------------------------------------------------------------- + + @Test + public void test_tar_paxGlobalHeader_doesNotConsumeEntryCount() throws Exception { + // Build a tar that, when read by TarArchiveInputStream, produces a + // PAX global header entry followed by a real text entry. We use a + // long filename (>100 chars) with LONGFILE_POSIX mode, which causes + // Commons Compress to emit a PAX extended header (type 'x') for the + // long name before the real entry. isPaxHeader() returns true for + // type 'x', so the fix must skip it without incrementing entryCount. + // With maxEntries=1, if the PAX extension header is counted the real + // entry would push the count to 2 and trigger the cap. + final String longName = "a".repeat(110) + ".txt"; // > 100-char POSIX limit + final byte[] content = "hello".getBytes(StandardCharsets.UTF_8); + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (TarArchiveOutputStream tos = new TarArchiveOutputStream(baos)) { + tos.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX); + final TarArchiveEntry real = new TarArchiveEntry(longName); + real.setSize(content.length); + tos.putArchiveEntry(real); + tos.write(content); + tos.closeArchiveEntry(); + tos.finish(); + } + final byte[] data = baos.toByteArray(); + + tarExtractor.setMaxEntries(1); // only 1 real entry allowed + try (InputStream in = new ByteArrayInputStream(data)) { + final String text = tarExtractor.getText(in, null).getContent(); + assertTrue(text.contains("hello")); + } + } + + // --------------------------------------------------------------------- + // Tar per-entry cap enforced (test 13) + // --------------------------------------------------------------------- + + @Test + public void test_tar_perEntryCapEnforced() throws Exception { + final byte[] payload = new byte[2 * 1024 * 1024]; // 2 MiB + final byte[] data = buildTar(new TarEntrySpec("big.txt", payload)); + + tarExtractor.setMaxBytes(-1); + tarExtractor.setMaxContentSize(-1); + tarExtractor.setMaxBytesPerEntry(1024 * 1024); // 1 MiB cap + try (InputStream in = new ByteArrayInputStream(data)) { + tarExtractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("tar per-entry size exceeded")); + } + } + + // --------------------------------------------------------------------- + // Tar symlink skip now at WARN (m9) — verify it does not throw + // --------------------------------------------------------------------- + + @Test + public void test_tar_symlinkSkipped_doesNotThrow() throws Exception { + // Already covered by test_tar_symlinkSkipped; this confirms the + // behaviour is unchanged after upgrading the log level to WARN. + final byte[] data = buildTar(new TarEntrySpec("ok.txt", "regular".getBytes(StandardCharsets.UTF_8)), + new TarEntrySpec("evil.txt", null, TarArchiveEntry.LF_SYMLINK, "/etc/passwd")); + + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = tarExtractor.getText(in, null).getContent(); + assertTrue(content.contains("regular")); + assertFalse(content.contains("passwd")); + } + } + + // --------------------------------------------------------------------- + // setMaxArchiveDepth changes threshold (test 16) + // --------------------------------------------------------------------- + + @Test + public void test_setMaxArchiveDepth_changesThreshold() throws Exception { + final byte[] data = buildZip(new EntrySpec("ok.txt", "hi".getBytes(StandardCharsets.UTF_8))); + + // depth=3 at maxArchiveDepth=3 → throws + zipExtractor.setMaxArchiveDepth(3); + final Map params3 = new HashMap<>(); + params3.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "3"); + try (InputStream in = new ByteArrayInputStream(data)) { + zipExtractor.getText(in, params3); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().toLowerCase().contains("recursion")); + } + + // depth=11 at maxArchiveDepth=20 → passes + zipExtractor.setMaxArchiveDepth(20); + final Map params11 = new HashMap<>(); + params11.put(AbstractExtractor.EXTRACTOR_DEPTH_KEY, "11"); + try (InputStream in = new ByteArrayInputStream(data)) { + final String content = zipExtractor.getText(in, params11).getContent(); + assertTrue(content.contains("hi")); + } + } } From bc2ab790e2d0848c86fde2ccfc78ee9c7431aef0 Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Sat, 16 May 2026 13:24:07 +0900 Subject: [PATCH 6/6] fix(extractor): import CrawlerSystemException in LhaExtractor for javadoc Javadoc tag @throws CrawlerSystemException on getText() referred to a class not on the file's import list, breaking maven-javadoc-plugin with 'reference not found' on CI. --- .../org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java index 9e1b1585..5938f857 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/LhaExtractor.java @@ -29,6 +29,7 @@ import org.codelibs.core.io.CloseableUtil; import org.codelibs.core.io.FileUtil; import org.codelibs.fess.crawler.entity.ExtractData; +import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.ExtractException; import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.Extractor;