diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java
index 363fe063..3bf1ecf0 100644
--- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java
+++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java
@@ -15,14 +15,22 @@
*/
package org.codelibs.fess.crawler.extractor.impl;
+import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
+import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TimeZone;
@@ -33,6 +41,7 @@
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.ExtractException;
+import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.helper.MimeTypeHelper;
@@ -45,6 +54,7 @@
import jakarta.mail.Multipart;
import jakarta.mail.Part;
import jakarta.mail.Session;
+import jakarta.mail.internet.ContentType;
import jakarta.mail.internet.MailDateFormat;
import jakarta.mail.internet.MimeMessage;
import jakarta.mail.internet.MimeUtility;
@@ -52,6 +62,25 @@
/**
* Gets a text from .eml file.
*
+ * <p>
+ * EML content is treated as untrusted. The extractor enforces the following
+ * defensive bounds against malformed or malicious messages:
+ * <ul>
+ * <li>{@link #maxMessageBytes} (default 100 MiB) is the first-line defense:
+ * the raw input stream is capped before {@code MimeMessage} even begins
+ * to parse, preventing memory exhaustion from pathologically large
+ * messages.</li>
+ * <li>{@link #maxRecursionDepth} (default 10) caps how deeply nested
+ * {@code message/rfc822} or {@code multipart/*} parts may be.</li>
+ * <li>{@link #maxParts} (default 1000) caps the total number of MIME parts
+ * traversed across the whole message.</li>
+ * <li>{@link #maxBodyBytes} (default 50 MiB) caps the total UTF-8 byte size
+ * of body text appended to the output.</li>
+ * </ul>
+ * <p>
+ * RFC 2047 encoded-word headers (e.g. {@code Subject},
+ * {@code From}, {@code To}) are decoded via {@link MimeUtility#decodeText}.
+ * The legacy {@code Subject} metadata key is RFC 2047-decoded for
+ * compatibility with older callers.
+ *
* @author shinsuke
*
*/
@@ -65,6 +94,18 @@ public class EmlExtractor extends AbstractExtractor {
/** Properties used for mail processing */
protected Properties mailProperties = new Properties();
+ /** Maximum allowed nesting depth for multipart / message/rfc822 parts. */
+ protected int maxRecursionDepth = 10;
+
+ /** Maximum allowed total number of MIME parts visited per message. */
+ protected int maxParts = 1000;
+
+ /** Maximum total body bytes (UTF-8) appended to the extracted content. */
+ protected long maxBodyBytes = 50L * 1024 * 1024;
+
+ /** Maximum allowed total stream bytes consumed while parsing the EML. */
+ protected long maxMessageBytes = 100L * 1024 * 1024;
+
/**
* Constructs a new EmlExtractor.
*/
@@ -83,11 +124,16 @@ public ExtractData getText(final InputStream in, final Map param
props.put(entry.getKey(), entry.getValue());
}
}
+ if (in == null) {
+ throw new ExtractException("Input stream is null.");
+ }
+ final LimitedInputStream limited = new LimitedInputStream(in, maxMessageBytes);
try {
- final Session mailSession = Session.getDefaultInstance(props, null);
- final MimeMessage message = new MimeMessage(mailSession, in);
- final String content = getBodyText(message);
- final ExtractData data = new ExtractData(content != null ? content : StringUtil.EMPTY);
+ final Session mailSession = Session.getInstance(props, null);
+ final MimeMessage message = new MimeMessage(mailSession, limited);
+ final BodyExtractionContext ctx = new BodyExtractionContext();
+ extractBody(message, ctx, 0);
+ final ExtractData data = new ExtractData(ctx.body.toString());
final Enumeration headers = message.getAllHeaders();
while (headers.hasMoreElements()) {
final Header header = headers.nextElement();
@@ -104,7 +150,8 @@ public ExtractData getText(final InputStream in, final Map param
putValue(data, "Line-Count", message.getLineCount());
putValue(data, "Message-ID", message.getMessageID());
putValue(data, "Message-Number", message.getMessageNumber());
- putValue(data, "Received-Date", getReceivedDate(message));
+ final Date receivedDate = getReceivedDate(message);
+ putValue(data, "Received-Date", receivedDate);
putValue(data, "Reply-To", message.getReplyTo());
putValue(data, "Sender", message.getSender());
putValue(data, "Sent-Date", message.getSentDate());
@@ -114,8 +161,33 @@ public ExtractData getText(final InputStream in, final Map param
putValue(data, "To", message.getRecipients(Message.RecipientType.TO));
putValue(data, "Cc", message.getRecipients(Message.RecipientType.CC));
putValue(data, "Bcc", message.getRecipients(Message.RecipientType.BCC));
+
+ // normalized convenience metadata (always RFC 2047 decoded)
+ putDecodedHeaderValue(data, "subject", message.getSubject());
+ putDecodedAddressValues(data, "from", message.getFrom());
+ putDecodedAddressValues(data, "to", message.getRecipients(Message.RecipientType.TO));
+ putDecodedAddressValues(data, "cc", message.getRecipients(Message.RecipientType.CC));
+ putDecodedAddressValues(data, "bcc", message.getRecipients(Message.RecipientType.BCC));
+ putDecodedAddressValues(data, "replyTo", message.getReplyTo());
+ putDateValue(data, "sentDate", message.getSentDate());
+ putDateValue(data, "receivedDate", receivedDate);
+ if (message.getMessageID() != null) {
+ data.putValue("messageId", message.getMessageID());
+ }
+
+ if (!ctx.attachmentNames.isEmpty()) {
+ data.putValues("attachmentNames", ctx.attachmentNames.toArray(new String[0]));
+ }
return data;
} catch (final MessagingException e) {
+ if (limited.isExceeded()) {
+ throw new MaxLengthExceededException("EML message size exceeded: max=" + maxMessageBytes);
+ }
+ throw new ExtractException(e);
+ } catch (final IOException e) {
+ if (limited.isExceeded()) {
+ throw new MaxLengthExceededException("EML message size exceeded: max=" + maxMessageBytes);
+ }
throw new ExtractException(e);
}
}
@@ -154,18 +226,71 @@ protected void putValue(final ExtractData data, final String key, final Object v
} else if (value != null) {
data.putValue(key, value.toString());
}
- } catch (final Exception e) {
- if (logger.isDebugEnabled()) {
- logger.debug("Failed to put {}:{}", key, value, e);
- }
+ } catch (final RuntimeException e) {
+ logger.warn("Failed to put header value. key={}", key, e);
+ }
+ }
+
+ /**
+ * Decodes a raw header value and stores it under {@code key}. Nothing is
+ * stored when {@code raw} is {@code null} or decodes to an empty string.
+ *
+ * @param data the extract data receiving the value
+ * @param key the metadata key
+ * @param raw the raw (possibly RFC 2047-encoded) header value, may be {@code null}
+ */
+ protected void putDecodedHeaderValue(final ExtractData data, final String key, final String raw) {
+ final String text = raw == null ? null : getDecodeText(raw);
+ if (text != null && !StringUtil.isEmpty(text)) {
+ data.putValue(key, text);
+ }
+ }
+
+ /**
+ * Stores a decoded address array as a multivalue metadata entry.
+ * Each address is rendered with {@code toString()} and passed through
+ * {@link #getDecodeText(String)}; a {@code null} or empty array stores nothing.
+ *
+ * @param data the extract data
+ * @param key the metadata key
+ * @param addresses the address array, may be {@code null}
+ */
+ protected void putDecodedAddressValues(final ExtractData data, final String key, final Address[] addresses) {
+ if (addresses == null || addresses.length == 0) {
+ return;
}
+ final String[] values = new String[addresses.length];
+ for (int i = 0; i < addresses.length; i++) {
+ // toString() may contain RFC 2047 encoded-words (presumably for
+ // InternetAddress personal names), hence the decode step.
+ values[i] = getDecodeText(addresses[i].toString());
+ }
+ data.putValues(key, values);
+ }
+
+ /**
+ * Stores a Date as an ISO-8601 UTC string under the given key.
+ * The pattern is {@code Constants.ISO_DATETIME_FORMAT}, rendered in the
+ * UTC time zone. A {@code null} date stores nothing.
+ *
+ * @param data the extract data
+ * @param key the metadata key
+ * @param date the date, may be {@code null}
+ */
+ protected void putDateValue(final ExtractData data, final String key, final Date date) {
+ if (date == null) {
+ return;
+ }
+ // A formatter is created per call: SimpleDateFormat instances are not
+ // safe to share between threads.
+ final SimpleDateFormat sdf = new SimpleDateFormat(Constants.ISO_DATETIME_FORMAT);
+ sdf.setTimeZone(TimeZone.getTimeZone("UTC"));
+ data.putValue(key, sdf.format(date));
}
/**
* Decodes MIME-encoded text.
*
+ * On {@link UnsupportedEncodingException} (caused by an unrecognised RFC 2047
+ * charset), logs a warning and returns the raw value unchanged so
+ * callers still receive some usable output rather than an empty string.
+ *
* @param value the encoded text to decode
- * @return the decoded text or empty string if decoding fails
+ * @return the decoded text, the raw value on encoding failure, or empty string for null input
*/
protected String getDecodeText(final String value) {
if (value == null) {
@@ -174,8 +299,8 @@ protected String getDecodeText(final String value) {
try {
return MimeUtility.decodeText(value);
} catch (final UnsupportedEncodingException e) {
- logger.warn("Invalid encoding.", e);
- return StringUtil.EMPTY;
+ logger.warn("Invalid RFC 2047 encoding, returning raw value. value={}", value, e);
+ return value;
}
}
@@ -197,52 +322,389 @@ public void setMailProperties(final Properties mailProperties) {
this.mailProperties = mailProperties;
}
+ /**
+ * Gets the deepest nesting level of multipart / {@code message/rfc822}
+ * parts that extraction will descend into.
+ *
+ * @return the configured maximum recursion depth
+ */
+ public int getMaxRecursionDepth() {
+ return this.maxRecursionDepth;
+ }
+
+ /**
+ * Sets the maximum allowed recursion depth for nested multipart /
+ * {@code message/rfc822} parts. A value of {@code 0} means only the root
+ * part is processed (no recursion). Negative values are rejected.
+ *
+ * @param maxRecursionDepth the maximum recursion depth; must be {@code >= 0}
+ * @throws IllegalArgumentException if the value is negative
+ */
+ public void setMaxRecursionDepth(final int maxRecursionDepth) {
+ if (maxRecursionDepth < 0) {
+ // Zero is a legal value, so the message must say "negative", not "positive".
+ throw new IllegalArgumentException("maxRecursionDepth must not be negative: " + maxRecursionDepth);
+ }
+ this.maxRecursionDepth = maxRecursionDepth;
+ }
+
+ /**
+ * Gets the upper bound on the number of MIME parts visited while
+ * extracting a single message.
+ *
+ * @return the configured maximum number of parts
+ */
+ public int getMaxParts() {
+ return this.maxParts;
+ }
+
+ /**
+ * Configures the upper bound on the number of MIME parts visited while
+ * extracting a single message.
+ *
+ * @param maxParts the maximum number of parts; must be {@code > 0}
+ * @throws IllegalArgumentException if the value is {@code <= 0}
+ */
+ public void setMaxParts(final int maxParts) {
+ if (maxParts > 0) {
+ this.maxParts = maxParts;
+ return;
+ }
+ throw new IllegalArgumentException("maxParts must be positive: " + maxParts);
+ }
+
+ /**
+ * Gets the budget, in UTF-8 bytes, for body text appended to the
+ * extracted content.
+ *
+ * @return the configured maximum body bytes
+ */
+ public long getMaxBodyBytes() {
+ return this.maxBodyBytes;
+ }
+
+ /**
+ * Configures the budget, in UTF-8 bytes, for body text appended to the
+ * extracted content.
+ *
+ * @param maxBodyBytes the maximum body bytes; must be {@code > 0}
+ * @throws IllegalArgumentException if the value is {@code <= 0}
+ */
+ public void setMaxBodyBytes(final long maxBodyBytes) {
+ if (maxBodyBytes > 0) {
+ this.maxBodyBytes = maxBodyBytes;
+ return;
+ }
+ throw new IllegalArgumentException("maxBodyBytes must be positive: " + maxBodyBytes);
+ }
+
+ /**
+ * Gets the cap on the number of raw stream bytes that may be consumed
+ * while parsing the EML.
+ *
+ * @return the configured maximum message bytes
+ */
+ public long getMaxMessageBytes() {
+ return this.maxMessageBytes;
+ }
+
+ /**
+ * Configures the cap on the number of raw stream bytes that may be
+ * consumed while parsing the EML. The cap is applied to the input stream
+ * before {@link MimeMessage} parses it.
+ *
+ * @param maxMessageBytes the maximum message bytes; must be {@code > 0}
+ * @throws IllegalArgumentException if the value is {@code <= 0}
+ */
+ public void setMaxMessageBytes(final long maxMessageBytes) {
+ if (maxMessageBytes > 0) {
+ this.maxMessageBytes = maxMessageBytes;
+ return;
+ }
+ throw new IllegalArgumentException("maxMessageBytes must be positive: " + maxMessageBytes);
+ }
+
/**
* Extracts the body text from a MIME message.
*
+ * Retained for backwards compatibility. Internally delegates to
+ * {@link #extractBody(Part, BodyExtractionContext, int)} with a fresh
+ * context.
+ *
* @param message the MIME message to extract text from
* @return the extracted body text
* @throws ExtractException if extraction fails
*/
protected String getBodyText(final MimeMessage message) {
- final StringBuilder buf = new StringBuilder(1000);
try {
- final Object content = message.getContent();
- if (content instanceof final Multipart multipart) {
- final int count = multipart.getCount();
+ final BodyExtractionContext ctx = new BodyExtractionContext();
+ extractBody(message, ctx, 0);
+ return ctx.body.toString();
+ } catch (MessagingException | IOException e) {
+ throw new ExtractException(e);
+ }
+ }
+
+ /**
+ * Recursively extracts text content from a MIME part, enforcing recursion,
+ * part-count, and body-byte bounds.
+ *
+ * @param part the current MIME part
+ * @param ctx the extraction context tracking accumulated state
+ * @param depth the current recursion depth (root = 0)
+ * @throws MessagingException if a JavaMail call fails
+ * @throws IOException if reading part content fails
+ */
+ protected void extractBody(final Part part, final BodyExtractionContext ctx, final int depth) throws MessagingException, IOException {
+ if (depth > maxRecursionDepth) {
+ throw new MaxLengthExceededException("EML recursion too deep: depth=" + depth + " max=" + maxRecursionDepth);
+ }
+ ctx.partCount++;
+ if (ctx.partCount > maxParts) {
+ throw new MaxLengthExceededException("EML part count exceeded: max=" + maxParts);
+ }
+
+ // Treat explicitly-marked attachments as attachments regardless of mime type.
+ if (Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition())) {
+ recordAttachment(ctx, part);
+ return;
+ }
+
+ if (part.isMimeType("text/*")) {
+ appendTextPart(ctx, part);
+ return;
+ }
+
+ if (part.isMimeType("multipart/alternative")) {
+ final Object content = part.getContent();
+ if (content instanceof Multipart) {
+ final Multipart mp = (Multipart) content;
+ final int count = mp.getCount();
+ // Prefer text/plain alternative; fall back to first text/* alternative.
+ BodyPart chosen = null;
for (int i = 0; i < count; i++) {
- final BodyPart bodyPart = multipart.getBodyPart(i);
- if (Part.ATTACHMENT.equalsIgnoreCase(bodyPart.getDisposition())) {
- appendAttachment(buf, bodyPart);
- } else if (bodyPart.isMimeType("text/plain") || bodyPart.isMimeType("text/html")) {
- buf.append(bodyPart.getContent().toString()).append(' ');
- } else if (bodyPart.isMimeType("multipart/alternative") && bodyPart.getContent() instanceof Multipart) {
- final Multipart alternativePart = (Multipart) bodyPart.getContent();
- for (int j = 0; j < alternativePart.getCount(); j++) {
- final BodyPart innerBodyPart = alternativePart.getBodyPart(j);
- if (innerBodyPart.isMimeType("text/plain")) {
- buf.append(innerBodyPart.getContent().toString()).append(' ');
- break;
- }
+ final BodyPart bp = mp.getBodyPart(i);
+ if (bp.isMimeType("text/plain")) {
+ chosen = bp;
+ break;
+ }
+ }
+ if (chosen == null) {
+ for (int i = 0; i < count; i++) {
+ final BodyPart bp = mp.getBodyPart(i);
+ if (bp.isMimeType("text/*")) {
+ chosen = bp;
+ break;
+ }
+ }
+ }
+ if (chosen != null) {
+ // Charge the partCount budget for every alternative — even those we
+ // don't recurse into — so an attacker can't bypass maxParts by
+ // stuffing thousands of unused alternatives. The chosen part is
+ // counted via its own extractBody call below, so charge count - 1.
+ if (count > 1) {
+ ctx.partCount += count - 1;
+ if (ctx.partCount > maxParts) {
+ throw new MaxLengthExceededException("EML part count exceeded: max=" + maxParts);
}
}
+ extractBody(chosen, ctx, depth + 1);
+ } else {
+ // No text alternative; recurse into all parts (each counted normally).
+ for (int i = 0; i < count; i++) {
+ extractBody(mp.getBodyPart(i), ctx, depth + 1);
+ }
}
- } else if (content instanceof String) {
- buf.append(content.toString());
}
- } catch (MessagingException | IOException e) {
- throw new ExtractException(e);
+ return;
+ }
+
+ if (part.isMimeType("multipart/*")) {
+ final Object content = part.getContent();
+ if (content instanceof Multipart) {
+ final Multipart mp = (Multipart) content;
+ for (int i = 0; i < mp.getCount(); i++) {
+ extractBody(mp.getBodyPart(i), ctx, depth + 1);
+ }
+ }
+ return;
+ }
+
+ if (part.isMimeType("message/rfc822")) {
+ final Object content = part.getContent();
+ if (content instanceof Part) {
+ extractBody((Part) content, ctx, depth + 1);
+ }
+ return;
+ }
+
+ // Anything else (neither text, multipart nor message/rfc822) is handled as an attachment-like part, whether or not it has a filename.
+ recordAttachment(ctx, part);
+ }
+
+ /**
+ * Registers the decoded file name of an attachment-like part (when it has
+ * one) and, if the part is a {@link BodyPart}, hands it to
+ * {@link #appendAttachment(BodyExtractionContext, BodyPart)} for text
+ * extraction. A failure to read the file name is logged at debug level and
+ * does not prevent the extraction attempt.
+ *
+ * @param ctx the extraction context
+ * @param part the attachment-like part
+ */
+ protected void recordAttachment(final BodyExtractionContext ctx, final Part part) {
+ try {
+ final String fileName = part.getFileName();
+ final String decodedName = StringUtil.isEmpty(fileName) ? null : getDecodeText(fileName);
+ if (!StringUtil.isEmpty(decodedName)) {
+ ctx.attachmentNames.add(decodedName);
+ }
+ } catch (final MessagingException e) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Failed to read attachment filename.", e);
+ }
+ }
+ if (part instanceof final BodyPart bodyPart) {
+ appendAttachment(ctx, bodyPart);
+ }
+ }
+
+ /**
+ * Adds {@code text} to the accumulated body while keeping the total UTF-8
+ * size within {@link #maxBodyBytes}.
+ *
+ * When the text plus its one-byte separator space fits in the remaining
+ * budget, both are appended. Otherwise the text is cut to the remaining
+ * budget on a code-point boundary, appended without a separator, and the
+ * budget is marked as fully used so later calls append nothing.
+ *
+ * @param ctx the extraction context
+ * @param text the text to append; {@code null} or empty text is ignored
+ */
+ protected void appendBody(final BodyExtractionContext ctx, final String text) {
+ if (text == null || text.isEmpty() || ctx.bodyBytes >= maxBodyBytes) {
+ return;
+ }
+ final byte[] encoded = text.getBytes(StandardCharsets.UTF_8);
+ final long budget = maxBodyBytes - ctx.bodyBytes;
+ // The separator space costs one byte and is charged against the budget too.
+ final long needed = (long) encoded.length + 1L;
+ if (needed <= budget) {
+ ctx.body.append(text).append(' ');
+ ctx.bodyBytes += needed;
+ return;
+ }
+ // Find the cut position: step back while the byte at the cut is a UTF-8
+ // continuation byte (10xxxxxx) so a multi-byte sequence is never split.
+ int end = (int) Math.min(budget, (long) encoded.length);
+ while (end > 0 && end < encoded.length && (encoded[end] & 0xC0) == 0x80) {
+ end--;
+ }
+ if (end > 0) {
+ ctx.body.append(new String(encoded, 0, end, StandardCharsets.UTF_8));
+ }
+ ctx.bodyBytes = maxBodyBytes;
+ if (logger.isDebugEnabled()) {
+ logger.debug("EML body truncated. maxBytes={}", maxBodyBytes);
+ }
+ }
+
+ /**
+ * Returns the content type of a part as a string, or {@code "unknown"} on
+ * {@link MessagingException}.
+ *
+ * @param part the MIME part
+ * @return the content-type string or {@code "unknown"}
+ */
+ private static String safeGetContentType(final Part part) {
+ try {
+ return part.getContentType();
+ } catch (final MessagingException e) {
+ return "unknown";
}
- return buf.toString();
}
/**
- * Appends attachment content to the buffer if it can be extracted.
+ * Streams a text part's content into the extraction buffer, reading
+ * {@code remaining} chars at most via {@link InputStreamReader}, then
+ * delegates to {@link #appendBody} for byte-accurate truncation.
+ *
+ * The charset is resolved from the part's Content-Type header; if absent
+ * or unrecognised, UTF-8 is used as the fallback.
+ *
+ * @param ctx the extraction context
+ * @param part the {@code text/*} part
+ */
+ protected void appendTextPart(final BodyExtractionContext ctx, final Part part) {
+ if (ctx.bodyBytes >= maxBodyBytes) {
+ return;
+ }
+ // Every char encodes to at least one UTF-8 byte, so the remaining byte
+ // budget also bounds how many chars are worth reading.
+ final int maxChars = (int) Math.min(maxBodyBytes - ctx.bodyBytes, (long) Integer.MAX_VALUE / 4);
+ final Charset partCharset = resolveTextCharset(part);
+
+ try (InputStream stream = part.getInputStream(); InputStreamReader in = new InputStreamReader(stream, partCharset)) {
+ final char[] chunk = new char[Math.min(maxChars, 8 * 1024)];
+ final StringBuilder text = new StringBuilder(Math.min(maxChars, 64 * 1024));
+ int consumed = 0;
+ while (consumed < maxChars) {
+ final int read = in.read(chunk, 0, Math.min(chunk.length, maxChars - consumed));
+ if (read <= 0) {
+ break;
+ }
+ text.append(chunk, 0, read);
+ consumed += read;
+ }
+ if (consumed > 0) {
+ appendBody(ctx, text.toString());
+ }
+ } catch (final IOException e) {
+ logger.warn("Failed to read text part content. contentType={}", safeGetContentType(part), e);
+ } catch (final MessagingException e) {
+ logger.warn("Failed to access text part input stream. contentType={}", safeGetContentType(part), e);
+ }
+ }
+
+ /**
+ * Determines the charset used to read a text part: the {@code charset}
+ * parameter of its Content-Type when present and supported, UTF-8 otherwise.
+ *
+ * @param part the {@code text/*} part
+ * @return the charset to decode the part with, never {@code null}
+ */
+ private Charset resolveTextCharset(final Part part) {
+ try {
+ final String contentType = part.getContentType();
+ if (contentType == null) {
+ return StandardCharsets.UTF_8;
+ }
+ final String declared = new ContentType(contentType).getParameter("charset");
+ if (declared == null || declared.isEmpty()) {
+ return StandardCharsets.UTF_8;
+ }
+ try {
+ return Charset.forName(MimeUtility.javaCharset(declared));
+ } catch (final IllegalCharsetNameException | UnsupportedCharsetException e) {
+ logger.warn("Unsupported EML text part charset, fallback=UTF-8. charset={}", declared, e);
+ }
+ } catch (final MessagingException e) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Failed to parse content type of text part.", e);
+ }
+ }
+ return StandardCharsets.UTF_8;
+ }
+
+ /**
+ * Backwards-compatible attachment text extraction. Kept for subclasses that
+ * may have overridden it; new code should prefer
+ * {@link #appendAttachment(BodyExtractionContext, BodyPart)}.
+ *
+ * @deprecated Use {@link #appendAttachment(BodyExtractionContext, BodyPart)} instead.
+ * This shim creates a fresh extraction context with {@code bodyBytes=0}, so
+ * the {@link #maxBodyBytes} cap is enforced per call rather than cumulatively
+ * across a single message. Subclasses overriding this method should migrate to
+ * the context-aware overload.
*
* @param buf the buffer to append content to
* @param bodyPart the body part containing the attachment
*/
+ @Deprecated
protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart) {
+ // Wrap the caller's buffer in a fresh context so extracted text lands in
+ // buf; bodyBytes starts at 0, so the size budget is not shared across calls.
+ final BodyExtractionContext ctx = new BodyExtractionContext();
+ ctx.body = buf;
+ // NOTE(review): the visible extraction path (recordAttachment) calls the
+ // context-aware overload directly, so a subclass override of this method
+ // does not appear to be invoked any more — confirm this is intended.
+ appendAttachment(ctx, bodyPart);
+ }
+
+ /**
+ * Attempts to extract text from an attachment using a registered
+ * {@link Extractor} for its detected MIME type. Failures are silently
+ * swallowed unless they are {@link MaxLengthExceededException}, which is
+ * re-thrown so the caller can enforce overall message size limits.
+ *
+ * @param ctx the extraction context
+ * @param bodyPart the attachment body part
+ * @throws MaxLengthExceededException if the nested extractor signals a size limit violation
+ */
+ protected void appendAttachment(final BodyExtractionContext ctx, final BodyPart bodyPart) {
final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
final ExtractorFactory extractorFactory = getExtractorFactory();
try {
@@ -251,18 +713,34 @@ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart
if (mimeType != null) {
final Extractor extractor = extractorFactory.getExtractor(mimeType);
if (extractor != null) {
- try (final InputStream in = bodyPart.getInputStream()) {
+ if (ctx.bodyBytes >= maxBodyBytes) {
+ return;
+ }
+ final long remaining = maxBodyBytes - ctx.bodyBytes;
+ final long sourceCapL;
+ if (remaining > (Integer.MAX_VALUE - 16L) / 4L) {
+ sourceCapL = Integer.MAX_VALUE - 16L;
+ } else {
+ sourceCapL = remaining * 4L + 16L;
+ }
+ try (final InputStream in = new LimitedInputStream(bodyPart.getInputStream(), sourceCapL)) {
final Map map = new HashMap<>();
map.put(ExtractData.RESOURCE_NAME_KEY, filename);
final String content = extractor.getText(in, map).getContent();
- buf.append(content).append(' ');
+ if (content != null) {
+ appendBody(ctx, content);
+ }
+ } catch (final MaxLengthExceededException e) {
+ throw e;
} catch (final Exception e) {
if (logger.isDebugEnabled()) {
- logger.debug("Exception in an internal extractor.", e);
+ logger.debug("Exception in an internal extractor. filename={}", filename, e);
}
}
}
}
+ } catch (final MaxLengthExceededException e) {
+ throw e;
} catch (final MessagingException e) {
if (logger.isDebugEnabled()) {
logger.debug("Exception in parsing BodyPart.", e);
@@ -272,6 +750,8 @@ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart
/**
* Gets the received date from a message by parsing the received headers.
+ * Caps inspection to the first 100 headers to avoid unbounded work on
+ * messages with pathologically many {@code Received} lines.
*
* @param message the message to get the received date from
* @return the received date or null if not found
@@ -280,17 +760,25 @@ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart
protected static Date getReceivedDate(final Message message) throws MessagingException {
final Date today = new Date();
final String[] received = message.getHeader("received");
- if (received != null) {
- for (final String v : received) {
- String dateStr = null;
- try {
- dateStr = getDateString(v);
- final Date receivedDate = new MailDateFormat().parse(dateStr);
- if (!receivedDate.after(today)) {
- return receivedDate;
- }
- } catch (final ParseException e) {
- // ignore
+ if (received == null) {
+ return null;
+ }
+ final MailDateFormat format = new MailDateFormat();
+ final int limit = Math.min(received.length, 100);
+ for (int i = 0; i < limit; i++) {
+ final String v = received[i];
+ try {
+ final String dateStr = getDateString(v);
+ if (dateStr == null) {
+ continue;
+ }
+ final Date receivedDate = format.parse(dateStr);
+ if (!receivedDate.after(today)) {
+ return receivedDate;
+ }
+ } catch (final ParseException e) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("Failed to parse received header. value={}", v, e);
}
}
}
@@ -300,10 +788,21 @@ protected static Date getReceivedDate(final Message message) throws MessagingExc
/**
* Extracts a date string from the received header text.
*
+ * Per RFC 5322 §3.6.7 the date portion follows the last {@code ;} in
+ * the header. If no {@code ;} is present, falls back to scanning for a
+ * day-of-week abbreviation.
+ *
* @param text the received header text
- * @return the date string starting from the day of week, or null if not found
+ * @return the date string, or null if not found
*/
private static String getDateString(final String text) {
+ if (text == null) {
+ return null;
+ }
+ final int semicolon = text.lastIndexOf(';');
+ if (semicolon != -1 && semicolon + 1 < text.length()) {
+ return text.substring(semicolon + 1).trim();
+ }
for (final String dow : DAY_OF_WEEK) {
final int i = text.lastIndexOf(dow);
if (i != -1) {
@@ -312,4 +811,68 @@ private static String getDateString(final String text) {
}
return null;
}
+
+ /**
+ * Mutable state shared across recursive body extraction: the text
+ * collected so far, the counters checked against the configured limits,
+ * and the attachment file names encountered.
+ */
+ protected static class BodyExtractionContext {
+ /** Accumulated body text. */
+ protected StringBuilder body = new StringBuilder(1000);
+
+ /** Number of MIME parts visited so far. */
+ protected int partCount;
+
+ /** UTF-8 bytes already appended to {@link #body}. */
+ protected long bodyBytes;
+
+ /** Decoded attachment filenames; typed so that {@code toArray(new String[0])} yields a {@code String[]}. */
+ protected List<String> attachmentNames = new ArrayList<>();
+ }
+
+ /**
+ * A {@link FilterInputStream} that throws {@link IOException} once the
+ * number of bytes consumed (read or skipped) exceeds a configured limit.
+ * It caps the raw EML stream before {@link MimeMessage} parses it and the
+ * attachment streams handed to nested extractors.
+ *
+ * The violation is detected after the offending bytes have been pulled
+ * from the wrapped stream, so up to one buffer more than {@code limit}
+ * may be consumed before the exception is raised.
+ */
+ private static final class LimitedInputStream extends FilterInputStream {
+ /** Maximum number of bytes that may be consumed before reads fail. */
+ private final long limit;
+ /** Bytes consumed from the wrapped stream so far. */
+ private long bytesRead;
+ /** Set once the limit has been crossed. */
+ private boolean exceeded;
+
+ LimitedInputStream(final InputStream in, final long limit) {
+ super(in);
+ this.limit = limit;
+ }
+
+ @Override
+ public int read() throws IOException {
+ final int b = super.read();
+ if (b != -1) {
+ consume(1L);
+ }
+ return b;
+ }
+
+ @Override
+ public int read(final byte[] b, final int off, final int len) throws IOException {
+ final int n = super.read(b, off, len);
+ if (n > 0) {
+ consume(n);
+ }
+ return n;
+ }
+
+ /**
+ * Skips bytes while charging them against the limit; without this
+ * override a consumer could bypass the cap by skipping instead of reading.
+ */
+ @Override
+ public long skip(final long n) throws IOException {
+ final long skipped = super.skip(n);
+ if (skipped > 0) {
+ consume(skipped);
+ }
+ return skipped;
+ }
+
+ /** Adds {@code count} consumed bytes and fails once the total passes the limit. */
+ private void consume(final long count) throws IOException {
+ bytesRead += count;
+ if (bytesRead > limit) {
+ exceeded = true;
+ throw new IOException("EML message size exceeded.");
+ }
+ }
+
+ /** Returns {@code true} if the limit was exceeded during reading. */
+ boolean isExceeded() {
+ return exceeded;
+ }
+ }
}
diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java
index c0e39b25..b125d53f 100644
--- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java
+++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java
@@ -15,8 +15,16 @@
*/
package org.codelibs.fess.crawler.extractor.impl;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.text.SimpleDateFormat;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Map;
+import java.util.Properties;
+import java.util.TimeZone;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -24,13 +32,21 @@
import org.codelibs.fess.crawler.container.StandardCrawlerContainer;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
+import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
+import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl;
import org.dbflute.utflute.core.PlainTestCase;
-import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInfo;
+import jakarta.mail.Message;
+import jakarta.mail.Session;
+import jakarta.mail.internet.InternetAddress;
+import jakarta.mail.internet.MimeBodyPart;
+import jakarta.mail.internet.MimeMessage;
+import jakarta.mail.internet.MimeMultipart;
+
/**
* @author shinsuke
*
@@ -115,4 +131,888 @@ public void test_getText_null() {
// NOP
}
}
+
+ // --------------------------------------------------------------------
+ // Programmatically-built fixtures
+ // --------------------------------------------------------------------
+
+ private static Session newSession() { // fresh Session built from empty Properties; second argument is null
+ return Session.getInstance(new Properties(), null);
+ }
+
+ private static InputStream toStream(final MimeMessage msg) throws Exception { // serializes msg fully in memory
+ final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ msg.writeTo(baos); // writes the message into the buffer
+ return new ByteArrayInputStream(baos.toByteArray()); // stream over a copy of the buffered bytes
+ }
+
+ @Test
+ public void test_extractsBody() throws Exception { // simple UTF-8 text message: body text and "subject" value are extracted
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("Hello", "UTF-8");
+ msg.setText("Hello, world!", "UTF-8");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null); // no extraction params
+ assertTrue(data.getContent().contains("Hello, world!"));
+ assertEquals("Hello", data.getValues("subject")[0]);
+ }
+ }
+
+ @Test
+ public void test_decodesRfc2047Subject() throws Exception { // non-ASCII subject must come back decoded under both keys
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ // setSubject(text, charset) auto-encodes as RFC 2047 when the text is non-ASCII
+ msg.setSubject("こんにちは", "UTF-8");
+ msg.setText("body", "UTF-8");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ // The capitalized "Subject" key is expected to hold the RFC 2047-decoded text as well.
+ final String raw = data.getValues("Subject")[0];
+ assertEquals("こんにちは", raw);
+ // The lower-case "subject" key must hold the same decoded text.
+ assertEquals("こんにちは", data.getValues("subject")[0]);
+ }
+ }
+
+ @Test
+ public void test_decodesRfc2047From() throws Exception { // "from" value must contain the decoded personal name and the address
+ final MimeMessage msg = new MimeMessage(newSession());
+ // A non-ASCII personal name triggers RFC 2047 encoding when the message is serialized
+ final InternetAddress from = new InternetAddress("sender@example.com", "山田 太郎", "UTF-8");
+ msg.setFrom(from);
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("test", "UTF-8");
+ msg.setText("body", "UTF-8");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ final String[] fromValues = data.getValues("from");
+ assertNotNull(fromValues);
+ assertTrue(fromValues.length >= 1);
+ final String decoded = fromValues[0]; // only the first value is inspected
+ assertTrue(decoded.contains("山田 太郎"));
+ assertTrue(decoded.contains("sender@example.com"));
+ }
+ }
+
+ @Test
+ public void test_extractsAttachmentFilenames() throws Exception { // attachment file name must appear under "attachmentNames"
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("with attachment", "UTF-8");
+
+ final MimeMultipart mp = new MimeMultipart();
+ final MimeBodyPart textPart = new MimeBodyPart();
+ textPart.setText("see attached", "UTF-8");
+ mp.addBodyPart(textPart);
+
+ final MimeBodyPart attachment = new MimeBodyPart();
+ // 4-byte "%PDF" payload; only the file name is asserted below, not the attachment content
+ attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf");
+ attachment.setFileName("report.pdf");
+ attachment.setDisposition(jakarta.mail.Part.ATTACHMENT);
+ mp.addBodyPart(attachment);
+
+ msg.setContent(mp);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ final String[] names = data.getValues("attachmentNames");
+ assertNotNull(names);
+ assertTrue(Arrays.stream(names).anyMatch(n -> n.contains("report.pdf"))); // substring match on any recorded name
+ }
+ }
+
+ @Test
+ public void test_recursionBomb_throwsException() throws Exception {
+ // Build a chain of nested message/rfc822 parts deeper than the configured limit (3).
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxRecursionDepth(3);
+ // NOTE(review): this instance is constructed directly rather than taken from the test container;
+ // presumably acceptable because no attachment extraction is needed for this message — confirm.
+
+ final Session session = newSession();
+ // innermost message: a plain text leaf
+ MimeMessage current = new MimeMessage(session);
+ current.setFrom(new InternetAddress("inner@example.com"));
+ current.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ current.setSubject("inner", "UTF-8");
+ current.setText("innermost body", "UTF-8");
+ current.saveChanges();
+
+ // Wrap in 8 layers, each a multipart holding the previous message as message/rfc822
+ final int wrapCount = 8;
+ for (int i = 0; i < wrapCount; i++) {
+ final MimeMessage outer = new MimeMessage(session);
+ outer.setFrom(new InternetAddress("layer" + i + "@example.com"));
+ outer.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ outer.setSubject("layer " + i, "UTF-8");
+ final MimeMultipart mp = new MimeMultipart();
+ final MimeBodyPart nested = new MimeBodyPart();
+ nested.setContent(current, "message/rfc822");
+ mp.addBodyPart(nested);
+ outer.setContent(mp);
+ outer.saveChanges();
+ current = outer; // the new wrapper becomes the payload of the next layer
+ }
+
+ try (final InputStream in = toStream(current)) {
+ extractor.getText(in, null);
+ fail(); // reaching here means the depth limit was not enforced
+ } catch (final MaxLengthExceededException e) {
+ assertTrue(e.getMessage().contains("recursion"));
+ }
+ }
+
+ @Test
+ public void test_maxParts_throwsException() throws Exception { // 50 text parts against a limit of 5 must be rejected
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxParts(5);
+
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("many parts", "UTF-8");
+
+ final MimeMultipart mp = new MimeMultipart();
+ for (int i = 0; i < 50; i++) { // ten times the configured limit
+ final MimeBodyPart p = new MimeBodyPart();
+ p.setText("part " + i, "UTF-8");
+ mp.addBodyPart(p);
+ }
+ msg.setContent(mp);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ extractor.getText(in, null);
+ fail(); // reaching here means the part limit was not enforced
+ } catch (final MaxLengthExceededException e) {
+ assertTrue(e.getMessage().contains("part count"));
+ }
+ }
+
+ @Test
+ public void test_maxBodyBytes_truncates() throws Exception { // a 200-char ASCII body is cut down by a 32-byte cap
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxBodyBytes(32);
+
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("long body", "UTF-8");
+ // 200 ASCII characters, comfortably more than the 32-byte cap
+ final StringBuilder body = new StringBuilder();
+ for (int i = 0; i < 200; i++) {
+ body.append('a');
+ }
+ msg.setText(body.toString(), "UTF-8");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = extractor.getText(in, null);
+ final String content = data.getContent();
+ // Accepts at most 33 chars (the 32-byte cap plus one char of slack); the second check is implied by the first.
+ assertTrue(content.length() <= 33);
+ assertTrue(content.length() < 200);
+ }
+ }
+
+ @Test
+ public void test_maxBodyBytes_largeInputIsBounded() throws Exception {
+ // Regression: previous binary-search truncation called text.substring(0, mid).getBytes(UTF_8)
+ // O(log N) times, each allocating up to ~N bytes — catastrophically slow on multi-MiB
+ // text parts. The current path is described as encoding once and walking back over UTF-8
+ // continuation bytes to land on a code-point boundary (implementation not exercised directly here).
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxBodyBytes(1024);
+
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("huge body", "UTF-8");
+
+ // 5 MiB of 'a' characters — well within typical heap, but large enough that the
+ // old O(N log N) truncation would be visibly slow.
+ final int size = 5 * 1024 * 1024;
+ final char[] chars = new char[size];
+ Arrays.fill(chars, 'a');
+ msg.setText(new String(chars), "UTF-8");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final long start = System.nanoTime();
+ final ExtractData data = extractor.getText(in, null);
+ final long elapsedMs = (System.nanoTime() - start) / 1_000_000L;
+ final String content = data.getContent();
+ // Loose bound: up to 2048 chars are accepted against maxBodyBytes=1024 (far below the 5 MiB input).
+ assertTrue(content.length() <= 2048);
+ // Elapsed time is logged for information only; no timing assertion is made.
+ logger.info("test_maxBodyBytes_largeInputIsBounded elapsed={}ms contentLen={}", elapsedMs, content.length());
+ }
+ }
+
+ @Test
+ public void test_maxBodyBytes_truncatesAtUtf8CodePointBoundary() throws Exception {
+ // The body is 10 copies of "あ" (3 bytes each in UTF-8 = 30 bytes total).
+ // With maxBodyBytes=10, the cap falls inside the 4th character. The truncation
+ // is expected to stop on a code-point boundary (byte 9, 3 complete chars); this test
+ // only checks the observable symptom: no U+FFFD replacement character in the output.
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxBodyBytes(10);
+
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("multibyte", "UTF-8");
+ final StringBuilder body = new StringBuilder();
+ for (int i = 0; i < 10; i++) {
+ body.append('あ'); // HIRAGANA LETTER A (U+3042)
+ }
+ msg.setText(body.toString(), "UTF-8");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = extractor.getText(in, null);
+ final String content = data.getContent();
+ // Only the absence of U+FFFD is asserted; the exact truncated length is not checked.
+ assertFalse(content.contains("�"));
+ }
+ }
+
+ @Test
+ public void test_multipartAlternative_partsCountedTowardMaxParts() throws Exception {
+ // Regression: multipart/alternative previously charged only the chosen
+ // part (and the parent multipart node) to ctx.partCount, letting an
+ // attacker bypass maxParts by stuffing thousands of unused
+ // alternatives. The fix charges every alternative to the budget.
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxParts(5);
+
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("alt bomb", "UTF-8");
+
+ final MimeMultipart alt = new MimeMultipart("alternative");
+ // 50 text/html alternatives followed by 1 text/plain part (51 children in total);
+ // per the regression note above, the old code would have left partCount at 2.
+ for (int i = 0; i < 50; i++) {
+ final MimeBodyPart bp = new MimeBodyPart();
+ bp.setContent("HTML " + i + "", "text/html; charset=UTF-8");
+ alt.addBodyPart(bp);
+ }
+ final MimeBodyPart plain = new MimeBodyPart();
+ plain.setText("plain", "UTF-8");
+ alt.addBodyPart(plain);
+
+ msg.setContent(alt);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ extractor.getText(in, null);
+ fail(); // reaching here means the alternatives were not counted toward maxParts
+ } catch (final MaxLengthExceededException e) {
+ assertTrue(e.getMessage().contains("part count"));
+ }
+ }
+
+ @Test
+ public void test_maxBodyBytes_strictCapIncludesTrailingSeparator() throws Exception {
+ // Regression: when the encoded body length exactly equals the
+ // remaining budget, the old code still appended a trailing space,
+ // pushing bodyBytes one byte past maxBodyBytes. The fix reserves the
+ // separator byte before deciding to append the full text.
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxBodyBytes(8);
+
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("exact", "UTF-8");
+ // 8 ASCII bytes — exactly equal to maxBodyBytes; the extracted content must not
+ // grow past the cap because of an appended trailing space.
+ msg.setText("12345678", "UTF-8");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = extractor.getText(in, null);
+ final String content = data.getContent();
+ // Length is logged, then checked against the cap (8 ASCII chars == 8 bytes).
+ logger.info("test_maxBodyBytes_strictCapIncludesTrailingSeparator content.length={}", content.length());
+ assertTrue(content.length() <= 8);
+ }
+ }
+
+ @Test
+ public void test_multipartAlternative_prefersPlainText() throws Exception { // plain part wins over the html alternative
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("alt", "UTF-8");
+
+ final MimeMultipart alt = new MimeMultipart("alternative");
+ final MimeBodyPart textPart = new MimeBodyPart();
+ textPart.setText("PLAIN_BODY", "UTF-8");
+ alt.addBodyPart(textPart);
+ final MimeBodyPart htmlPart = new MimeBodyPart();
+ htmlPart.setContent("HTML_BODY", "text/html; charset=UTF-8");
+ alt.addBodyPart(htmlPart);
+
+ msg.setContent(alt);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ final String content = data.getContent();
+ assertTrue(content.contains("PLAIN_BODY")); // plain alternative is present
+ assertFalse(content.contains("HTML_BODY")); // html alternative is not included
+ }
+ }
+
+ // --------------------------------------------------------------------
+ // Additional coverage: size limits, metadata, charsets, attachments
+ // --------------------------------------------------------------------
+
+ @Test
+ public void test_maxMessageBytes_enforcedBeforeParsing() throws Exception {
+ // Build a small valid EML, then set maxMessageBytes very small (64 bytes) so the serialized
+ // message exceeds it. NOTE(review): the test observes only the exception, not when it is raised.
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("test subject", "UTF-8");
+ msg.setText("Hello, this is a test EML body that is longer than 64 bytes definitely!", "UTF-8");
+ msg.saveChanges();
+
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxMessageBytes(64);
+
+ try (final InputStream in = toStream(msg)) {
+ extractor.getText(in, null);
+ fail(); // reaching here means the message size limit was not enforced
+ } catch (final MaxLengthExceededException e) {
+ assertTrue(e.getMessage().contains("message size"));
+ }
+ }
+
+ @Test
+ public void test_attachment_extractorOutputRespectsMaxBodyBytes() throws Exception {
+ // Stub extractor that ignores its input and always returns 1 MiB of 'x' characters
+ final String largeContent = "x".repeat(1024 * 1024);
+ final Extractor stubExtractor = new Extractor() {
+ @Override
+ public ExtractData getText(final InputStream in, final Map params) {
+ return new ExtractData(largeContent);
+ }
+ };
+
+ // Fresh container in which the stub is registered for the application/pdf MIME type
+ final StandardCrawlerContainer container = new StandardCrawlerContainer().singleton("emlExtractor", EmlExtractor.class);
+ container.singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
+ . singleton("extractorFactory", ExtractorFactory.class, factory -> {
+ factory.addExtractor("application/pdf", stubExtractor);
+ });
+ final EmlExtractor extractor = container.getComponent("emlExtractor");
+ extractor.setMaxBodyBytes(1024);
+
+ // Build an EML with a text body and an application/pdf attachment
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("attachment test", "UTF-8");
+
+ final MimeMultipart mp = new MimeMultipart();
+ final MimeBodyPart textPart = new MimeBodyPart();
+ textPart.setText("body text", "UTF-8");
+ mp.addBodyPart(textPart);
+
+ final MimeBodyPart attachment = new MimeBodyPart();
+ attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf");
+ attachment.setFileName("report.pdf");
+ attachment.setDisposition(jakarta.mail.Part.ATTACHMENT);
+ mp.addBodyPart(attachment);
+
+ msg.setContent(mp);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = extractor.getText(in, null);
+ // Loose bound: up to 2048 chars accepted against maxBodyBytes=1024, far below the stub's 1 MiB output
+ assertTrue(data.getContent().length() <= 2048);
+ }
+ }
+
+ @Test
+ public void test_appendAttachment_propagatesMaxLengthExceededException() throws Exception {
+ // Stub extractor that always throws MaxLengthExceededException, regardless of input
+ final Extractor stubExtractor = new Extractor() {
+ @Override
+ public ExtractData getText(final InputStream in, final Map params) {
+ throw new MaxLengthExceededException("stub size exceeded");
+ }
+ };
+
+ final StandardCrawlerContainer container = new StandardCrawlerContainer().singleton("emlExtractor", EmlExtractor.class);
+ container.singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
+ . singleton("extractorFactory", ExtractorFactory.class, factory -> {
+ factory.addExtractor("application/pdf", stubExtractor); // stub handles the PDF attachment below
+ });
+ final EmlExtractor extractor = container.getComponent("emlExtractor");
+
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") });
+ msg.setSubject("propagation test", "UTF-8");
+
+ final MimeMultipart mp = new MimeMultipart();
+ final MimeBodyPart textPart = new MimeBodyPart();
+ textPart.setText("body", "UTF-8");
+ mp.addBodyPart(textPart);
+
+ final MimeBodyPart attachment = new MimeBodyPart();
+ attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf");
+ attachment.setFileName("big.pdf");
+ attachment.setDisposition(jakarta.mail.Part.ATTACHMENT);
+ mp.addBodyPart(attachment);
+
+ msg.setContent(mp);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ extractor.getText(in, null);
+ fail(); // reaching here means the stub's exception was swallowed
+ } catch (final MaxLengthExceededException e) {
+ // Expected — the exception must propagate to the caller; its message is not checked
+ }
+ }
+
+ @Test
+ public void test_recursion_exactlyAtMaxDepth_succeeds() throws Exception {
+ // Depth accounting as assumed by this test (each wrap contributes 2 depth levels: multipart + rfc822 part):
+ // root message (depth 0) → multipart bp (depth 1) → message/rfc822 content (depth 2) → inner text/* (depth 3)
+ // With maxRecursionDepth=3, depth=3 is allowed (3 <= 3), so 1 wrap must succeed.
+ // With maxRecursionDepth=1, depth=2 > 1 fails, so 1 wrap with max=1 must fail.
+ final Session session = newSession();
+
+ // Build the innermost leaf message with a plain text body
+ final MimeMessage inner = new MimeMessage(session);
+ inner.setFrom(new InternetAddress("inner@example.com"));
+ inner.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ inner.setSubject("inner", "UTF-8");
+ inner.setText("innermost", "UTF-8");
+ inner.saveChanges();
+
+ // Wrap once: root → multipart → rfc822 bodypart → inner (text/plain at depth 3)
+ final MimeMessage outer = new MimeMessage(session);
+ outer.setFrom(new InternetAddress("outer@example.com"));
+ outer.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ outer.setSubject("outer", "UTF-8");
+ final MimeMultipart mp = new MimeMultipart();
+ final MimeBodyPart nested = new MimeBodyPart();
+ nested.setContent(inner, "message/rfc822");
+ mp.addBodyPart(nested);
+ outer.setContent(mp);
+ outer.saveChanges();
+
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxRecursionDepth(3);
+
+ // 1 wrap at maxRecursionDepth=3 must succeed and expose the inner text
+ try (final InputStream in = toStream(outer)) {
+ final ExtractData data = extractor.getText(in, null);
+ assertTrue(data.getContent().contains("innermost"));
+ }
+
+ // Same extractor and message, limit lowered to 1: extraction must now be rejected
+ extractor.setMaxRecursionDepth(1);
+ try (final InputStream in = toStream(outer)) {
+ extractor.getText(in, null);
+ fail(); // reaching here means the lowered depth limit was not enforced
+ } catch (final MaxLengthExceededException e) {
+ assertTrue(e.getMessage().contains("recursion"));
+ }
+ }
+
+ @Test
+ public void test_decodesRfc2047_recipientsAndReplyTo() throws Exception { // to/cc/bcc/replyTo personal names come back decoded
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+
+ final InternetAddress toAddr = new InternetAddress("to@example.com", "田中 一郎", "UTF-8");
+ final InternetAddress ccAddr = new InternetAddress("cc@example.com", "鈴木 花子", "UTF-8");
+ final InternetAddress bccAddr = new InternetAddress("bcc@example.com", "佐藤 次郎", "UTF-8");
+ final InternetAddress replyAddr = new InternetAddress("reply@example.com", "山本 三郎", "UTF-8");
+
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { toAddr });
+ msg.setRecipients(Message.RecipientType.CC, new InternetAddress[] { ccAddr });
+ msg.setRecipients(Message.RecipientType.BCC, new InternetAddress[] { bccAddr });
+ msg.setReplyTo(new InternetAddress[] { replyAddr });
+ msg.setSubject("multi-recipient", "UTF-8");
+ msg.setText("body", "UTF-8");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+
+ final String[] toValues = data.getValues("to");
+ assertNotNull(toValues);
+ assertTrue(toValues[0].contains("田中 一郎")); // first "to" value carries the decoded name
+
+ final String[] ccValues = data.getValues("cc");
+ assertNotNull(ccValues);
+ assertTrue(ccValues[0].contains("鈴木 花子"));
+
+ final String[] bccValues = data.getValues("bcc");
+ assertNotNull(bccValues);
+ assertTrue(bccValues[0].contains("佐藤 次郎"));
+
+ final String[] replyToValues = data.getValues("replyTo");
+ assertNotNull(replyToValues);
+ assertTrue(replyToValues[0].contains("山本 三郎"));
+ }
+ }
+
+ @Test
+ public void test_normalizedDateAndMessageIdMetadata() throws Exception { // checks "sentDate" format and "messageId" presence/absence
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ msg.setSubject("date test", "UTF-8");
+ msg.setText("body", "UTF-8");
+
+ // Set a known sent date, parsed in UTC
+ final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
+ sdf.setTimeZone(TimeZone.getTimeZone("UTC"));
+ final Date sentDate = sdf.parse("2025-01-15T10:30:00.000Z");
+ msg.setSentDate(sentDate);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+
+ // sentDate must be rendered as ISO-8601 in UTC with millisecond precision
+ final String[] sentDateValues = data.getValues("sentDate");
+ assertNotNull(sentDateValues);
+ assertEquals("2025-01-15T10:30:00.000Z", sentDateValues[0]);
+
+ // No Message-ID is set explicitly on this message and none is removed.
+ // The test relies on saveChanges() having generated a Message-ID header,
+ // so the "messageId" value is expected to be present here.
+ final String[] msgIdValues = data.getValues("messageId");
+ // The value must exist and be non-empty; its exact content is not checked.
+ assertNotNull(msgIdValues);
+ assertTrue(msgIdValues[0].length() > 0);
+ }
+
+ // Second case: verify messageId is absent when the message has no Message-ID header.
+ // The header is stripped after saveChanges() so it does not reach the serialized form.
+ final MimeMessage msg2 = new MimeMessage(newSession());
+ msg2.setFrom(new InternetAddress("sender@example.com"));
+ msg2.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ msg2.setSubject("no message id", "UTF-8");
+ msg2.setText("body", "UTF-8");
+ // Message-ID is removed before and again after saveChanges(); NOTE(review): the first removal looks redundant — confirm.
+ msg2.removeHeader("Message-ID");
+ msg2.saveChanges();
+ msg2.removeHeader("Message-ID");
+
+ try (final InputStream in = toStream(msg2)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ // getValues must return null for "messageId" because the Message-ID header was removed
+ assertNull(data.getValues("messageId"));
+ }
+ }
+
+ @Test
+ public void test_textPart_iso2022jp_decodedCorrectly() throws Exception { // ISO-2022-JP body must round-trip to the original text
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ msg.setSubject("iso-2022-jp test", "UTF-8");
+ msg.setText("こんにちは", "ISO-2022-JP"); // body charset differs from the UTF-8 subject
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ assertTrue(data.getContent().contains("こんにちは"));
+ }
+ }
+
+ @Test
+ public void test_textPart_unknownCharset_fallsBackToUtf8() throws Exception {
+ // Build raw EML bytes by hand to avoid JavaMail rejecting the bogus charset during serialization.
+ // The body is pure ASCII ("hello"), so the test shows extraction survives the unknown charset name.
+ final String boundary = "----=_Part_0_12345678.90";
+ final String rawEml = "From: sender@example.com\r\n" + "To: r@example.com\r\n" + "Subject: unknown charset\r\n"
+ + "MIME-Version: 1.0\r\n" + "Content-Type: multipart/mixed; boundary=\"" + boundary + "\"\r\n" + "\r\n" + "--" + boundary
+ + "\r\n" + "Content-Type: text/plain; charset=bogus-cs-9\r\n" + "Content-Transfer-Encoding: 7bit\r\n" + "\r\n" + "hello\r\n"
+ + "--" + boundary + "--\r\n";
+
+ try (final InputStream in = new ByteArrayInputStream(rawEml.getBytes(java.nio.charset.StandardCharsets.US_ASCII))) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ assertTrue(data.getContent().contains("hello")); // text is still extracted despite charset=bogus-cs-9
+ }
+ }
+
+ @Test
+ public void test_textPart_noCharsetParameter_decodesAsUtf8() throws Exception { // text part declared without a charset is still extracted
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ msg.setSubject("no charset", "UTF-8");
+
+ final MimeMultipart mp = new MimeMultipart();
+ final MimeBodyPart textPart = new MimeBodyPart();
+ // Content-Type without charset parameter; the body is ASCII-only, so any ASCII-compatible default would pass
+ textPart.setContent("hello world", "text/plain");
+ mp.addBodyPart(textPart);
+ msg.setContent(mp);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ assertTrue(data.getContent().contains("hello world"));
+ }
+ }
+
+ @Test
+ public void test_multipleAttachments_allRecorded() throws Exception { // all three attachment names must be listed
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ msg.setSubject("multiple attachments", "UTF-8");
+
+ final MimeMultipart mp = new MimeMultipart();
+
+ final MimeBodyPart textPart = new MimeBodyPart();
+ textPart.setText("body", "UTF-8");
+ mp.addBodyPart(textPart);
+
+ final String[] filenames = { "file1.txt", "file2.doc", "file3.xml" };
+ for (final String name : filenames) { // one application/octet-stream attachment per name
+ final MimeBodyPart att = new MimeBodyPart();
+ att.setContent("content of " + name, "application/octet-stream");
+ att.setFileName(name);
+ att.setDisposition(jakarta.mail.Part.ATTACHMENT);
+ mp.addBodyPart(att);
+ }
+
+ msg.setContent(mp);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ final String[] names = data.getValues("attachmentNames");
+ assertNotNull(names);
+ final java.util.List nameList = Arrays.asList(names); // exact-match lookup; order is not checked
+ assertTrue(nameList.contains("file1.txt"));
+ assertTrue(nameList.contains("file2.doc"));
+ assertTrue(nameList.contains("file3.xml"));
+ }
+ }
+
+ @Test
+ public void test_inlineDispositionWithFilename_recordedAsAttachment() throws Exception {
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ msg.setSubject("inline attachment", "UTF-8");
+
+ final MimeMultipart mp = new MimeMultipart("related");
+
+ final MimeBodyPart textPart = new MimeBodyPart();
+ textPart.setText("body with inline", "UTF-8");
+ mp.addBodyPart(textPart);
+
+ // Inline disposition with a file name — its name must still show up under "attachmentNames"
+ final MimeBodyPart inlinePart = new MimeBodyPart();
+ inlinePart.setContent(new byte[] { (byte) 0x89, 0x50, 0x4E, 0x47 }, "image/png");
+ inlinePart.setFileName("logo.png");
+ inlinePart.setDisposition(jakarta.mail.Part.INLINE);
+ mp.addBodyPart(inlinePart);
+
+ msg.setContent(mp);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ final String[] names = data.getValues("attachmentNames");
+ assertNotNull(names);
+ assertTrue(Arrays.stream(names).anyMatch(n -> n.contains("logo.png"))); // substring match on any recorded name
+ }
+ }
+
+ @Test
+ public void test_maxBodyBytes_acrossMultipleParts() throws Exception { // the cap applies to the combined output of both parts
+ final int maxBytes = 50;
+ final EmlExtractor extractor = new EmlExtractor();
+ extractor.setMaxBodyBytes(maxBytes);
+
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ msg.setSubject("two parts", "UTF-8");
+
+ final MimeMultipart mp = new MimeMultipart();
+
+ // First part: 30 ASCII bytes, fits within the 50-byte cap on its own
+ final MimeBodyPart part1 = new MimeBodyPart();
+ part1.setText("a".repeat(30), "UTF-8");
+ mp.addBodyPart(part1);
+
+ // Second part: 30 ASCII bytes — combined 60 bytes exceed maxBytes
+ final MimeBodyPart part2 = new MimeBodyPart();
+ part2.setText("b".repeat(30), "UTF-8");
+ mp.addBodyPart(part2);
+
+ msg.setContent(mp);
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = extractor.getText(in, null);
+ final String content = data.getContent();
+ // Total must not exceed maxBodyBytes (input is ASCII, so char count equals byte count)
+ assertTrue(content.length() <= maxBytes);
+ }
+ }
+
+ @Test
+ public void test_setters_rejectInvalidValues() { // each limit setter must throw IllegalArgumentException on out-of-range input
+ final EmlExtractor extractor = new EmlExtractor();
+
+ try {
+ extractor.setMaxParts(0); // zero parts is rejected
+ fail();
+ } catch (final IllegalArgumentException e) {
+ // expected
+ }
+
+ try {
+ extractor.setMaxParts(-1); // negative part count is rejected
+ fail();
+ } catch (final IllegalArgumentException e) {
+ // expected
+ }
+
+ try {
+ extractor.setMaxBodyBytes(0); // zero body budget is rejected
+ fail();
+ } catch (final IllegalArgumentException e) {
+ // expected
+ }
+
+ try {
+ extractor.setMaxMessageBytes(0); // zero message budget is rejected
+ fail();
+ } catch (final IllegalArgumentException e) {
+ // expected
+ }
+
+ try {
+ extractor.setMaxRecursionDepth(-1); // negative depth is rejected
+ fail();
+ } catch (final IllegalArgumentException e) {
+ // expected
+ }
+
+ // setMaxRecursionDepth(0) must be accepted and be readable back through the getter
+ extractor.setMaxRecursionDepth(0);
+ assertEquals(0, extractor.getMaxRecursionDepth());
+ }
+
+ @Test
+ public void test_getReceivedDate_parsesWithSemicolon() throws Exception {
+ // Build a message whose Received header carries its date after a semicolon
+ final MimeMessage msg = new MimeMessage(newSession());
+ msg.setFrom(new InternetAddress("sender@example.com"));
+ msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+ msg.setSubject("received date test", "UTF-8");
+ msg.setText("body", "UTF-8");
+ // Routing information first, then ";" and the date with a +0000 offset
+ msg.addHeader("Received", "from foo.example.com by bar.example.com; Sun, 11 Nov 2012 02:39:59 +0000");
+ msg.saveChanges();
+
+ try (final InputStream in = toStream(msg)) {
+ final ExtractData data = emlExtractor.getText(in, null);
+ final String[] receivedDate = data.getValues("Received-Date");
+ assertNotNull(receivedDate);
+ assertEquals("2012-11-11T02:39:59.000Z", receivedDate[0]); // same instant, ISO-8601 with milliseconds and "Z"
+ }
+ }
+
+ @Test
+ public void test_getReceivedDate_skipsMalformedDowInComment() throws Exception {
+     final MimeMessage message = new MimeMessage(newSession());
+     message.setFrom(new InternetAddress("sender@example.com"));
+     final InternetAddress[] recipients = { new InternetAddress("r@example.com") };
+     message.setRecipients(Message.RecipientType.TO, recipients);
+     message.setSubject("received comment test", "UTF-8");
+     message.setText("body", "UTF-8");
+
+     // The routing portion contains a parenthesised comment that begins with a
+     // day-of-week abbreviation ("Mon gateway"); it must not be mistaken for the
+     // start of the date. Only the text after ";" is the authoritative date.
+     final String receivedHeader = "from foo (Mon gateway) by bar; Mon, 11 Nov 2013 05:00:00 +0000";
+     message.addHeader("Received", receivedHeader);
+     message.saveChanges();
+
+     try (final InputStream stream = toStream(message)) {
+         final String[] dates = emlExtractor.getText(stream, null).getValues("Received-Date");
+         assertNotNull(dates);
+         assertEquals("2013-11-11T05:00:00.000Z", dates[0]);
+     }
+ }
+
+ @Test
+ public void test_manyReceivedHeaders_bounded() throws Exception {
+     // Verifies that a message with a very large number of Received headers is
+     // processed to completion without an exception.
+     final MimeMessage msg = new MimeMessage(newSession());
+     msg.setFrom(new InternetAddress("sender@example.com"));
+     msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+     msg.setSubject("many received headers", "UTF-8");
+     msg.setText("body", "UTF-8");
+
+     // Jakarta Mail inserts every new Received header BEFORE the existing ones
+     // (newest first), so the serialized order is the reverse of the addHeader()
+     // call order. The single valid header is therefore added first: it ends up
+     // last in the serialized message, behind all 500 garbage entries.
+     msg.addHeader("Received", "from x by y; Mon, 11 Nov 2013 05:00:00 +0000");
+     for (int i = 0; i < 500; i++) {
+         msg.addHeader("Received", "garbage entry number " + i);
+     }
+     msg.saveChanges();
+
+     try (final InputStream in = toStream(msg)) {
+         final ExtractData data = emlExtractor.getText(in, null);
+         // Whether the trailing valid header is reached depends on the extractor's
+         // cap on inspected Received headers (100 according to the original test
+         // notes -- confirm against EmlExtractor), so Received-Date is deliberately
+         // not asserted. Only successful completion and non-null content are checked.
+         assertNotNull(data.getContent());
+     }
+ }
+
+ @Test
+ public void test_getDecodeText_returnsRawOnUnsupportedEncoding() {
+     // A syntactically valid RFC 2047 encoded-word whose charset name is bogus.
+     // MimeUtility.decodeText throws UnsupportedEncodingException when the charset
+     // of an encoded-word cannot be resolved, so this input exercises the failure
+     // path without any mocking.
+     final String raw = "=?bogus-cs-9?B?dGVzdA==?=";
+
+     final String result = emlExtractor.getDecodeText(raw);
+
+     // Contract under test: when decoding fails, the raw header value is returned
+     // unchanged instead of being replaced by an empty string. Asserting equality
+     // (rather than just non-emptiness) also catches a wrong non-empty result.
+     assertEquals(raw, result);
+ }
}