diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java index 363fe063..3bf1ecf0 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java @@ -15,14 +15,22 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; import java.text.ParseException; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Properties; import java.util.TimeZone; @@ -33,6 +41,7 @@ import org.codelibs.fess.crawler.Constants; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.ExtractException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; @@ -45,6 +54,7 @@ import jakarta.mail.Multipart; import jakarta.mail.Part; import jakarta.mail.Session; +import jakarta.mail.internet.ContentType; import jakarta.mail.internet.MailDateFormat; import jakarta.mail.internet.MimeMessage; import jakarta.mail.internet.MimeUtility; @@ -52,6 +62,25 @@ /** * Gets a text from .eml file. * + *

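EML content is treated as untrusted. The extractor enforces the following + * defensive bounds against malformed or malicious messages: nesting depth ({@code maxRecursionDepth}), MIME part count ({@code maxParts}), extracted body bytes ({@code maxBodyBytes}) and raw stream bytes ({@code maxMessageBytes}).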

+ * + *

RFC 2047 encoded-word headers (e.g. {@code Subject}, + * {@code From}, {@code To}) are decoded via {@link MimeUtility#decodeText}.

+ *

The legacy {@code Subject} metadata key is RFC 2047-decoded for + * compatibility with older callers.

+ * * @author shinsuke * */ @@ -65,6 +94,18 @@ public class EmlExtractor extends AbstractExtractor { /** Properties used for mail processing */ protected Properties mailProperties = new Properties(); + /** Maximum allowed nesting depth for multipart / message/rfc822 parts. */ + protected int maxRecursionDepth = 10; + + /** Maximum allowed total number of MIME parts visited per message. */ + protected int maxParts = 1000; + + /** Maximum total body bytes (UTF-8) appended to the extracted content. */ + protected long maxBodyBytes = 50L * 1024 * 1024; + + /** Maximum allowed total stream bytes consumed while parsing the EML. */ + protected long maxMessageBytes = 100L * 1024 * 1024; + /** * Constructs a new EmlExtractor. */ @@ -83,11 +124,16 @@ public ExtractData getText(final InputStream in, final Map param props.put(entry.getKey(), entry.getValue()); } } + if (in == null) { + throw new ExtractException("Input stream is null."); + } + final LimitedInputStream limited = new LimitedInputStream(in, maxMessageBytes); try { - final Session mailSession = Session.getDefaultInstance(props, null); - final MimeMessage message = new MimeMessage(mailSession, in); - final String content = getBodyText(message); - final ExtractData data = new ExtractData(content != null ? content : StringUtil.EMPTY); + final Session mailSession = Session.getInstance(props, null); + final MimeMessage message = new MimeMessage(mailSession, limited); + final BodyExtractionContext ctx = new BodyExtractionContext(); + extractBody(message, ctx, 0); + final ExtractData data = new ExtractData(ctx.body.toString()); final Enumeration
headers = message.getAllHeaders(); while (headers.hasMoreElements()) { final Header header = headers.nextElement(); @@ -104,7 +150,8 @@ public ExtractData getText(final InputStream in, final Map param putValue(data, "Line-Count", message.getLineCount()); putValue(data, "Message-ID", message.getMessageID()); putValue(data, "Message-Number", message.getMessageNumber()); - putValue(data, "Received-Date", getReceivedDate(message)); + final Date receivedDate = getReceivedDate(message); + putValue(data, "Received-Date", receivedDate); putValue(data, "Reply-To", message.getReplyTo()); putValue(data, "Sender", message.getSender()); putValue(data, "Sent-Date", message.getSentDate()); @@ -114,8 +161,33 @@ public ExtractData getText(final InputStream in, final Map param putValue(data, "To", message.getRecipients(Message.RecipientType.TO)); putValue(data, "Cc", message.getRecipients(Message.RecipientType.CC)); putValue(data, "Bcc", message.getRecipients(Message.RecipientType.BCC)); + + // normalized convenience metadata (always RFC 2047 decoded) + putDecodedHeaderValue(data, "subject", message.getSubject()); + putDecodedAddressValues(data, "from", message.getFrom()); + putDecodedAddressValues(data, "to", message.getRecipients(Message.RecipientType.TO)); + putDecodedAddressValues(data, "cc", message.getRecipients(Message.RecipientType.CC)); + putDecodedAddressValues(data, "bcc", message.getRecipients(Message.RecipientType.BCC)); + putDecodedAddressValues(data, "replyTo", message.getReplyTo()); + putDateValue(data, "sentDate", message.getSentDate()); + putDateValue(data, "receivedDate", receivedDate); + if (message.getMessageID() != null) { + data.putValue("messageId", message.getMessageID()); + } + + if (!ctx.attachmentNames.isEmpty()) { + data.putValues("attachmentNames", ctx.attachmentNames.toArray(new String[0])); + } return data; } catch (final MessagingException e) { + if (limited.isExceeded()) { + throw new MaxLengthExceededException("EML message size exceeded: max=" + 
maxMessageBytes); + } + throw new ExtractException(e); + } catch (final IOException e) { + if (limited.isExceeded()) { + throw new MaxLengthExceededException("EML message size exceeded: max=" + maxMessageBytes); + } throw new ExtractException(e); } } @@ -154,18 +226,71 @@ protected void putValue(final ExtractData data, final String key, final Object v } else if (value != null) { data.putValue(key, value.toString()); } - } catch (final Exception e) { - if (logger.isDebugEnabled()) { - logger.debug("Failed to put {}:{}", key, value, e); - } + } catch (final RuntimeException e) { + logger.warn("Failed to put header value. key={}", key, e); + } + } + + /** + * Stores a decoded header value if non-null/non-blank. + * + * @param data the extract data + * @param key the metadata key + * @param raw the raw header value, may be {@code null} + */ + protected void putDecodedHeaderValue(final ExtractData data, final String key, final String raw) { + if (raw == null) { + return; + } + final String decoded = getDecodeText(raw); + if (!StringUtil.isEmpty(decoded)) { + data.putValue(key, decoded); + } + } + + /** + * Stores a decoded address array as a multivalue metadata entry. + * + * @param data the extract data + * @param key the metadata key + * @param addresses the address array, may be {@code null} + */ + protected void putDecodedAddressValues(final ExtractData data, final String key, final Address[] addresses) { + if (addresses == null || addresses.length == 0) { + return; } + final String[] values = new String[addresses.length]; + for (int i = 0; i < addresses.length; i++) { + values[i] = getDecodeText(addresses[i].toString()); + } + data.putValues(key, values); + } + + /** + * Stores a Date as an ISO-8601 UTC string under the given key. 
+ * + * @param data the extract data + * @param key the metadata key + * @param date the date, may be {@code null} + */ + protected void putDateValue(final ExtractData data, final String key, final Date date) { + if (date == null) { + return; + } + final SimpleDateFormat sdf = new SimpleDateFormat(Constants.ISO_DATETIME_FORMAT); + sdf.setTimeZone(TimeZone.getTimeZone("UTC")); + data.putValue(key, sdf.format(date)); } /** * Decodes MIME-encoded text. * + *

On {@link UnsupportedEncodingException} (caused by an unrecognised RFC 2047 + * charset), logs a warning and returns the raw value unchanged so + * callers still receive some usable output rather than an empty string.

+ * * @param value the encoded text to decode - * @return the decoded text or empty string if decoding fails + * @return the decoded text, the raw value on encoding failure, or empty string for null input */ protected String getDecodeText(final String value) { if (value == null) { @@ -174,8 +299,8 @@ protected String getDecodeText(final String value) { try { return MimeUtility.decodeText(value); } catch (final UnsupportedEncodingException e) { - logger.warn("Invalid encoding.", e); - return StringUtil.EMPTY; + logger.warn("Invalid RFC 2047 encoding, returning raw value. value={}", value, e); + return value; } } @@ -197,52 +322,389 @@ public void setMailProperties(final Properties mailProperties) { this.mailProperties = mailProperties; } + /** + * Returns the maximum allowed recursion depth. + * + * @return the maximum recursion depth + */ + public int getMaxRecursionDepth() { + return maxRecursionDepth; + } + + /** + * Sets the maximum allowed recursion depth for nested multipart / + * {@code message/rfc822} parts. A value of {@code 0} allows only the root + * part; any nested part then exceeds the limit. Negative values are rejected. + * + * @param maxRecursionDepth the maximum recursion depth; must be >= 0 + * @throws IllegalArgumentException if the value is negative + */ + public void setMaxRecursionDepth(final int maxRecursionDepth) { + if (maxRecursionDepth < 0) { + throw new IllegalArgumentException("maxRecursionDepth must be non-negative: " + maxRecursionDepth); + } + this.maxRecursionDepth = maxRecursionDepth; + } + + /** + * Returns the maximum total number of MIME parts visited per message. + * + * @return the maximum number of parts + */ + public int getMaxParts() { + return maxParts; + } + + /** + * Sets the maximum total number of MIME parts visited per message. 
+ * + * @param maxParts the maximum number of parts; must be > 0 + * @throws IllegalArgumentException if the value is <= 0 + */ + public void setMaxParts(final int maxParts) { + if (maxParts <= 0) { + throw new IllegalArgumentException("maxParts must be positive: " + maxParts); + } + this.maxParts = maxParts; + } + + /** + * Returns the maximum total UTF-8 body bytes appended to extracted content. + * + * @return the maximum body bytes + */ + public long getMaxBodyBytes() { + return maxBodyBytes; + } + + /** + * Sets the maximum total UTF-8 body bytes appended to extracted content. + * + * @param maxBodyBytes the maximum body bytes; must be > 0 + * @throws IllegalArgumentException if the value is <= 0 + */ + public void setMaxBodyBytes(final long maxBodyBytes) { + if (maxBodyBytes <= 0) { + throw new IllegalArgumentException("maxBodyBytes must be positive: " + maxBodyBytes); + } + this.maxBodyBytes = maxBodyBytes; + } + + /** + * Returns the maximum allowed total stream bytes consumed while parsing the EML. + * + * @return the maximum message bytes + */ + public long getMaxMessageBytes() { + return maxMessageBytes; + } + + /** + * Sets the maximum allowed total stream bytes consumed while parsing the EML. + * This is the first-line defense before {@link MimeMessage} parses the input. + * + * @param maxMessageBytes the maximum message bytes; must be > 0 + * @throws IllegalArgumentException if the value is <= 0 + */ + public void setMaxMessageBytes(final long maxMessageBytes) { + if (maxMessageBytes <= 0) { + throw new IllegalArgumentException("maxMessageBytes must be positive: " + maxMessageBytes); + } + this.maxMessageBytes = maxMessageBytes; + } + /** * Extracts the body text from a MIME message. * + *

Retained for backwards compatibility. Internally delegates to + * {@link #extractBody(Part, BodyExtractionContext, int)} with a fresh + * context.

+ * * @param message the MIME message to extract text from * @return the extracted body text * @throws ExtractException if extraction fails */ protected String getBodyText(final MimeMessage message) { - final StringBuilder buf = new StringBuilder(1000); try { - final Object content = message.getContent(); - if (content instanceof final Multipart multipart) { - final int count = multipart.getCount(); + final BodyExtractionContext ctx = new BodyExtractionContext(); + extractBody(message, ctx, 0); + return ctx.body.toString(); + } catch (MessagingException | IOException e) { + throw new ExtractException(e); + } + } + + /** + * Recursively extracts text content from a MIME part, enforcing recursion, + * part-count, and body-byte bounds. + * + * @param part the current MIME part + * @param ctx the extraction context tracking accumulated state + * @param depth the current recursion depth (root = 0) + * @throws MessagingException if a JavaMail call fails + * @throws IOException if reading part content fails + */ + protected void extractBody(final Part part, final BodyExtractionContext ctx, final int depth) throws MessagingException, IOException { + if (depth > maxRecursionDepth) { + throw new MaxLengthExceededException("EML recursion too deep: depth=" + depth + " max=" + maxRecursionDepth); + } + ctx.partCount++; + if (ctx.partCount > maxParts) { + throw new MaxLengthExceededException("EML part count exceeded: max=" + maxParts); + } + + // Treat explicitly-marked attachments as attachments regardless of mime type. 
+ if (Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition())) { + recordAttachment(ctx, part); + return; + } + + if (part.isMimeType("text/*")) { + appendTextPart(ctx, part); + return; + } + + if (part.isMimeType("multipart/alternative")) { + final Object content = part.getContent(); + if (content instanceof Multipart) { + final Multipart mp = (Multipart) content; + final int count = mp.getCount(); + // Prefer text/plain alternative; fall back to first text/* alternative. + BodyPart chosen = null; for (int i = 0; i < count; i++) { - final BodyPart bodyPart = multipart.getBodyPart(i); - if (Part.ATTACHMENT.equalsIgnoreCase(bodyPart.getDisposition())) { - appendAttachment(buf, bodyPart); - } else if (bodyPart.isMimeType("text/plain") || bodyPart.isMimeType("text/html")) { - buf.append(bodyPart.getContent().toString()).append(' '); - } else if (bodyPart.isMimeType("multipart/alternative") && bodyPart.getContent() instanceof Multipart) { - final Multipart alternativePart = (Multipart) bodyPart.getContent(); - for (int j = 0; j < alternativePart.getCount(); j++) { - final BodyPart innerBodyPart = alternativePart.getBodyPart(j); - if (innerBodyPart.isMimeType("text/plain")) { - buf.append(innerBodyPart.getContent().toString()).append(' '); - break; - } + final BodyPart bp = mp.getBodyPart(i); + if (bp.isMimeType("text/plain")) { + chosen = bp; + break; + } + } + if (chosen == null) { + for (int i = 0; i < count; i++) { + final BodyPart bp = mp.getBodyPart(i); + if (bp.isMimeType("text/*")) { + chosen = bp; + break; + } + } + } + if (chosen != null) { + // Charge the partCount budget for every alternative — even those we + // don't recurse into — so an attacker can't bypass maxParts by + // stuffing thousands of unused alternatives. The chosen part is + // counted via its own extractBody call below, so charge count - 1. 
+ if (count > 1) { + ctx.partCount += count - 1; + if (ctx.partCount > maxParts) { + throw new MaxLengthExceededException("EML part count exceeded: max=" + maxParts); } } + extractBody(chosen, ctx, depth + 1); + } else { + // No text alternative; recurse into all parts (each counted normally). + for (int i = 0; i < count; i++) { + extractBody(mp.getBodyPart(i), ctx, depth + 1); + } } - } else if (content instanceof String) { - buf.append(content.toString()); } - } catch (MessagingException | IOException e) { - throw new ExtractException(e); + return; + } + + if (part.isMimeType("multipart/*")) { + final Object content = part.getContent(); + if (content instanceof Multipart) { + final Multipart mp = (Multipart) content; + for (int i = 0; i < mp.getCount(); i++) { + extractBody(mp.getBodyPart(i), ctx, depth + 1); + } + } + return; + } + + if (part.isMimeType("message/rfc822")) { + final Object content = part.getContent(); + if (content instanceof Part) { + extractBody((Part) content, ctx, depth + 1); + } + return; + } + + // Any other part is handled as attachment-like, with or without a filename (a name is recorded only when present). + recordAttachment(ctx, part); + } + + /** + * Records an attachment filename (decoded) and attempts in-extractor text + * extraction for known mime types, mirroring previous behavior. 
+ * + * @param ctx the extraction context + * @param part the attachment-like part + */ + protected void recordAttachment(final BodyExtractionContext ctx, final Part part) { + try { + final String rawName = part.getFileName(); + if (!StringUtil.isEmpty(rawName)) { + final String decoded = getDecodeText(rawName); + if (!StringUtil.isEmpty(decoded)) { + ctx.attachmentNames.add(decoded); + } + } + } catch (final MessagingException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to read attachment filename.", e); + } + } + if (part instanceof BodyPart) { + appendAttachment(ctx, (BodyPart) part); + } + } + + /** + * Appends body text to the extraction context, enforcing + * {@link #maxBodyBytes}. Truncates any text that would push the total over + * the limit (including the trailing separator space). + * + *

Encodes the text once with {@link String#getBytes(java.nio.charset.Charset)} + * (memory proportional to the input, not to the configured budget). When + * truncation is needed, walks back over UTF-8 continuation bytes (at most + * three steps) so the cut lands on a code-point boundary.

+ * + * @param ctx the extraction context + * @param text the text to append + */ + protected void appendBody(final BodyExtractionContext ctx, final String text) { + if (text == null || text.isEmpty()) { + return; + } + if (ctx.bodyBytes >= maxBodyBytes) { + return; + } + final byte[] bytes = text.getBytes(StandardCharsets.UTF_8); + final long remaining = maxBodyBytes - ctx.bodyBytes; + // Reserve 1 byte for the trailing separator space so the strict cap holds. + if ((long) bytes.length + 1L <= remaining) { + ctx.body.append(text).append(' '); + ctx.bodyBytes += (long) bytes.length + 1L; + return; + } + // Truncate at a UTF-8 code-point boundary that fits within the remaining + // budget. Continuation bytes have the bit pattern 10xxxxxx, so walk back + // until we land on a start byte (or zero). Bounded by 3 iterations. + int cutoff = (int) Math.min(remaining, (long) bytes.length); + while (cutoff > 0 && cutoff < bytes.length && (bytes[cutoff] & 0xC0) == 0x80) { + cutoff--; + } + if (cutoff > 0) { + ctx.body.append(new String(bytes, 0, cutoff, StandardCharsets.UTF_8)); + } + ctx.bodyBytes = maxBodyBytes; + if (logger.isDebugEnabled()) { + logger.debug("EML body truncated. maxBytes={}", maxBodyBytes); + } + } + + /** + * Returns the content type of a part as a string, or {@code "unknown"} on + * {@link MessagingException}. + * + * @param part the MIME part + * @return the content-type string or {@code "unknown"} + */ + private static String safeGetContentType(final Part part) { + try { + return part.getContentType(); + } catch (final MessagingException e) { + return "unknown"; } - return buf.toString(); } /** - * Appends attachment content to the buffer if it can be extracted. + * Streams a text part's content into the extraction buffer, reading + * {@code remaining} chars at most via {@link InputStreamReader}, then + * delegates to {@link #appendBody} for byte-accurate truncation. + * + *

The charset is resolved from the part's Content-Type header; if absent + * or unrecognised, UTF-8 is used as the fallback.

+ * + * @param ctx the extraction context + * @param part the {@code text/*} part + */ + protected void appendTextPart(final BodyExtractionContext ctx, final Part part) { + if (ctx.bodyBytes >= maxBodyBytes) { + return; + } + final long remaining = maxBodyBytes - ctx.bodyBytes; + final int charCap = (int) Math.min(remaining, (long) Integer.MAX_VALUE / 4); + + Charset charset = StandardCharsets.UTF_8; + try { + final String contentType = part.getContentType(); + if (contentType != null) { + final String cs = new ContentType(contentType).getParameter("charset"); + if (cs != null && !cs.isEmpty()) { + try { + charset = Charset.forName(MimeUtility.javaCharset(cs)); + } catch (final IllegalCharsetNameException | UnsupportedCharsetException e) { + logger.warn("Unsupported EML text part charset, fallback=UTF-8. charset={}", cs, e); + } + } + } + } catch (final MessagingException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to parse content type of text part.", e); + } + } + + try (InputStream is = part.getInputStream(); InputStreamReader reader = new InputStreamReader(is, charset)) { + final char[] buf = new char[Math.min(charCap, 8 * 1024)]; + final StringBuilder sb = new StringBuilder(Math.min(charCap, 64 * 1024)); + int total = 0; + int n; + while (total < charCap && (n = reader.read(buf, 0, Math.min(buf.length, charCap - total))) > 0) { + sb.append(buf, 0, n); + total += n; + } + if (total > 0) { + appendBody(ctx, sb.toString()); + } + } catch (final IOException e) { + logger.warn("Failed to read text part content. contentType={}", safeGetContentType(part), e); + } catch (final MessagingException e) { + logger.warn("Failed to access text part input stream. contentType={}", safeGetContentType(part), e); + } + } + + /** + * Backwards-compatible attachment text extraction. Kept for subclasses that + * may have overridden it; new code should prefer + * {@link #appendAttachment(BodyExtractionContext, BodyPart)}. 
+ * + * @deprecated Use {@link #appendAttachment(BodyExtractionContext, BodyPart)} instead. + * This shim creates a fresh extraction context with {@code bodyBytes=0}, so + * the {@link #maxBodyBytes} cap is enforced per call rather than cumulatively + * across a single message. Subclasses overriding this method should migrate to + * the context-aware overload. * * @param buf the buffer to append content to * @param bodyPart the body part containing the attachment */ + @Deprecated protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart) { + final BodyExtractionContext ctx = new BodyExtractionContext(); + ctx.body = buf; + appendAttachment(ctx, bodyPart); + } + + /** + * Attempts to extract text from an attachment using a registered + * {@link Extractor} for its detected MIME type. Failures are silently + * swallowed unless they are {@link MaxLengthExceededException}, which is + * re-thrown so the caller can enforce overall message size limits. + * + * @param ctx the extraction context + * @param bodyPart the attachment body part + * @throws MaxLengthExceededException if the nested extractor signals a size limit violation + */ + protected void appendAttachment(final BodyExtractionContext ctx, final BodyPart bodyPart) { final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); try { @@ -251,18 +713,34 @@ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart if (mimeType != null) { final Extractor extractor = extractorFactory.getExtractor(mimeType); if (extractor != null) { - try (final InputStream in = bodyPart.getInputStream()) { + if (ctx.bodyBytes >= maxBodyBytes) { + return; + } + final long remaining = maxBodyBytes - ctx.bodyBytes; + final long sourceCapL; + if (remaining > (Integer.MAX_VALUE - 16L) / 4L) { + sourceCapL = Integer.MAX_VALUE - 16L; + } else { + sourceCapL = remaining * 4L + 16L; + } + try (final InputStream in = new 
LimitedInputStream(bodyPart.getInputStream(), sourceCapL)) { final Map map = new HashMap<>(); map.put(ExtractData.RESOURCE_NAME_KEY, filename); final String content = extractor.getText(in, map).getContent(); - buf.append(content).append(' '); + if (content != null) { + appendBody(ctx, content); + } + } catch (final MaxLengthExceededException e) { + throw e; } catch (final Exception e) { if (logger.isDebugEnabled()) { - logger.debug("Exception in an internal extractor.", e); + logger.debug("Exception in an internal extractor. filename={}", filename, e); } } } } + } catch (final MaxLengthExceededException e) { + throw e; } catch (final MessagingException e) { if (logger.isDebugEnabled()) { logger.debug("Exception in parsing BodyPart.", e); @@ -272,6 +750,8 @@ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart /** * Gets the received date from a message by parsing the received headers. + * Caps inspection to the first 100 headers to avoid unbounded work on + * messages with pathologically many {@code Received} lines. 
* * @param message the message to get the received date from * @return the received date or null if not found @@ -280,17 +760,25 @@ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart protected static Date getReceivedDate(final Message message) throws MessagingException { final Date today = new Date(); final String[] received = message.getHeader("received"); - if (received != null) { - for (final String v : received) { - String dateStr = null; - try { - dateStr = getDateString(v); - final Date receivedDate = new MailDateFormat().parse(dateStr); - if (!receivedDate.after(today)) { - return receivedDate; - } - } catch (final ParseException e) { - // ignore + if (received == null) { + return null; + } + final MailDateFormat format = new MailDateFormat(); + final int limit = Math.min(received.length, 100); + for (int i = 0; i < limit; i++) { + final String v = received[i]; + try { + final String dateStr = getDateString(v); + if (dateStr == null) { + continue; + } + final Date receivedDate = format.parse(dateStr); + if (!receivedDate.after(today)) { + return receivedDate; + } + } catch (final ParseException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to parse received header. value={}", v, e); } } } @@ -300,10 +788,21 @@ protected static Date getReceivedDate(final Message message) throws MessagingExc /** * Extracts a date string from the received header text. * + *

Per RFC 5322 §3.6.7 the date portion follows the last {@code ;} in + * the header. If no {@code ;} is present, falls back to scanning for a + * day-of-week abbreviation.

+ * * @param text the received header text - * @return the date string starting from the day of week, or null if not found + * @return the date string, or null if not found */ private static String getDateString(final String text) { + if (text == null) { + return null; + } + final int semicolon = text.lastIndexOf(';'); + if (semicolon != -1 && semicolon + 1 < text.length()) { + return text.substring(semicolon + 1).trim(); + } for (final String dow : DAY_OF_WEEK) { final int i = text.lastIndexOf(dow); if (i != -1) { @@ -312,4 +811,68 @@ private static String getDateString(final String text) { } return null; } + + /** + * Mutable state shared across recursive body extraction. + */ + protected static class BodyExtractionContext { + /** Accumulated body text. */ + protected StringBuilder body = new StringBuilder(1000); + + /** Number of MIME parts visited so far. */ + protected int partCount; + + /** UTF-8 bytes already appended to {@link #body}. */ + protected long bodyBytes; + + /** Decoded attachment filenames. */ + protected List attachmentNames = new ArrayList<>(); + } + + /** + * A {@link FilterInputStream} that throws {@link IOException} once the + * number of bytes read exceeds a configured limit. Used to cap raw EML + * stream consumption before {@link MimeMessage} parses the input. 
+ */ + private static final class LimitedInputStream extends FilterInputStream { + private final long limit; + private long bytesRead; + private boolean exceeded; + + LimitedInputStream(final InputStream in, final long limit) { + super(in); + this.limit = limit; + } + + @Override + public int read() throws IOException { + final int b = super.read(); + if (b != -1) { + bytesRead++; + if (bytesRead > limit) { + exceeded = true; + throw new IOException("EML message size exceeded."); + } + } + return b; + } + + @Override + public int read(final byte[] b, final int off, final int len) throws IOException { + final int n = super.read(b, off, len); + if (n > 0) { + bytesRead += n; + if (bytesRead > limit) { + exceeded = true; + throw new IOException("EML message size exceeded."); + } + } + return n; + } + + /** Returns {@code true} if the limit was exceeded during reading. */ + boolean isExceeded() { + return exceeded; + } + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java index c0e39b25..b125d53f 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java @@ -15,8 +15,16 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.Map; +import java.util.Properties; +import java.util.TimeZone; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -24,13 +32,21 @@ import org.codelibs.fess.crawler.container.StandardCrawlerContainer; import org.codelibs.fess.crawler.entity.ExtractData; import 
org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; import org.dbflute.utflute.core.PlainTestCase; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; +import jakarta.mail.Message; +import jakarta.mail.Session; +import jakarta.mail.internet.InternetAddress; +import jakarta.mail.internet.MimeBodyPart; +import jakarta.mail.internet.MimeMessage; +import jakarta.mail.internet.MimeMultipart; + /** * @author shinsuke * @@ -115,4 +131,888 @@ public void test_getText_null() { // NOP } } + + // -------------------------------------------------------------------- + // Programmatically-built fixtures + // -------------------------------------------------------------------- + + private static Session newSession() { + return Session.getInstance(new Properties(), null); + } + + private static InputStream toStream(final MimeMessage msg) throws Exception { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + msg.writeTo(baos); + return new ByteArrayInputStream(baos.toByteArray()); + } + + @Test + public void test_extractsBody() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("Hello", "UTF-8"); + msg.setText("Hello, world!", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + assertTrue(data.getContent().contains("Hello, world!")); + assertEquals("Hello", data.getValues("subject")[0]); + } + } + + @Test + public void 
test_decodesRfc2047Subject() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + // setSubject(text, charset) auto-encodes as RFC 2047 when non-ASCII + msg.setSubject("こんにちは", "UTF-8"); + msg.setText("body", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + // Legacy `Subject` metadata key is also RFC 2047-decoded for caller convenience. + final String raw = data.getValues("Subject")[0]; + assertEquals("こんにちは", raw); + // Normalized "subject" metadata is decoded + assertEquals("こんにちは", data.getValues("subject")[0]); + } + } + + @Test + public void test_decodesRfc2047From() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + // Personal name in non-ASCII triggers RFC 2047 encoding on serialization + final InternetAddress from = new InternetAddress("sender@example.com", "山田 太郎", "UTF-8"); + msg.setFrom(from); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("test", "UTF-8"); + msg.setText("body", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String[] fromValues = data.getValues("from"); + assertNotNull(fromValues); + assertTrue(fromValues.length >= 1); + final String decoded = fromValues[0]; + assertTrue(decoded.contains("山田 太郎")); + assertTrue(decoded.contains("sender@example.com")); + } + } + + @Test + public void test_extractsAttachmentFilenames() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new 
InternetAddress("recipient@example.com") }); + msg.setSubject("with attachment", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("see attached", "UTF-8"); + mp.addBodyPart(textPart); + + final MimeBodyPart attachment = new MimeBodyPart(); + // tiny PDF-like payload; content does not need to be valid for filename extraction + attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf"); + attachment.setFileName("report.pdf"); + attachment.setDisposition(jakarta.mail.Part.ATTACHMENT); + mp.addBodyPart(attachment); + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String[] names = data.getValues("attachmentNames"); + assertNotNull(names); + assertTrue(Arrays.stream(names).anyMatch(n -> n.contains("report.pdf"))); + } + } + + @Test + public void test_recursionBomb_throwsException() throws Exception { + // Build a chain of nested message/rfc822 parts deeper than the configured limit. + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxRecursionDepth(3); + // Reuse the surrounding container's helper / factory wiring for a fair test: + // delegate directly via a fresh instance is fine because we don't traverse into attachments here. 
+ + final Session session = newSession(); + // innermost message + MimeMessage current = new MimeMessage(session); + current.setFrom(new InternetAddress("inner@example.com")); + current.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + current.setSubject("inner", "UTF-8"); + current.setText("innermost body", "UTF-8"); + current.saveChanges(); + + // Wrap in N layers of message/rfc822 inside a multipart, exceeding the bound + final int wrapCount = 8; + for (int i = 0; i < wrapCount; i++) { + final MimeMessage outer = new MimeMessage(session); + outer.setFrom(new InternetAddress("layer" + i + "@example.com")); + outer.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + outer.setSubject("layer " + i, "UTF-8"); + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart nested = new MimeBodyPart(); + nested.setContent(current, "message/rfc822"); + mp.addBodyPart(nested); + outer.setContent(mp); + outer.saveChanges(); + current = outer; + } + + try (final InputStream in = toStream(current)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("recursion")); + } + } + + @Test + public void test_maxParts_throwsException() throws Exception { + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxParts(5); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("many parts", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + for (int i = 0; i < 50; i++) { + final MimeBodyPart p = new MimeBodyPart(); + p.setText("part " + i, "UTF-8"); + mp.addBodyPart(p); + } + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = 
toStream(msg)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("part count")); + } + } + + @Test + public void test_maxBodyBytes_truncates() throws Exception { + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(32); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("long body", "UTF-8"); + // body comfortably exceeds 32 bytes + final StringBuilder body = new StringBuilder(); + for (int i = 0; i < 200; i++) { + body.append('a'); + } + msg.setText(body.toString(), "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + final String content = data.getContent(); + // Body must be truncated; the 200-char input is no longer there in full. + assertTrue(content.length() <= 33); + assertTrue(content.length() < 200); + } + } + + @Test + public void test_maxBodyBytes_largeInputIsBounded() throws Exception { + // Regression: previous binary-search truncation called text.substring(0, mid).getBytes(UTF_8) + // O(log N) times, each allocating up to ~N bytes — catastrophically slow on multi-MiB + // text parts. The current path encodes once and walks back over UTF-8 continuation + // bytes to land on a code-point boundary. 
+ final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(1024); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("huge body", "UTF-8"); + + // 5 MiB of 'a' characters — well within typical heap, but large enough that the + // old O(N log N) truncation would be visibly slow. + final int size = 5 * 1024 * 1024; + final char[] chars = new char[size]; + Arrays.fill(chars, 'a'); + msg.setText(new String(chars), "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final long start = System.nanoTime(); + final ExtractData data = extractor.getText(in, null); + final long elapsedMs = (System.nanoTime() - start) / 1_000_000L; + final String content = data.getContent(); + // Bounded by maxBodyBytes (allow a small overhead for trailing space etc.). + assertTrue(content.length() <= 2048); + // Sanity: the streaming truncation must complete quickly (well under a second). + logger.info("test_maxBodyBytes_largeInputIsBounded elapsed={}ms contentLen={}", elapsedMs, content.length()); + } + } + + @Test + public void test_maxBodyBytes_truncatesAtUtf8CodePointBoundary() throws Exception { + // The body is 10 copies of "あ" (3 bytes each in UTF-8 = 30 bytes total). + // With maxBodyBytes=10, the cap falls inside the 4th character. The truncation + // must walk back over continuation bytes and land at byte 9 (3 complete chars), + // never producing a half-encoded code point or a U+FFFD replacement. 
+ final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(10); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("multibyte", "UTF-8"); + final StringBuilder body = new StringBuilder(); + for (int i = 0; i < 10; i++) { + body.append('あ'); // あ + } + msg.setText(body.toString(), "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + final String content = data.getContent(); + // Truncation must not leak U+FFFD from a partial code point. + assertFalse(content.contains("�")); + } + } + + @Test + public void test_multipartAlternative_partsCountedTowardMaxParts() throws Exception { + // Regression: multipart/alternative previously charged only the chosen + // part (and the parent multipart node) to ctx.partCount, letting an + // attacker bypass maxParts by stuffing thousands of unused + // alternatives. The fix charges every alternative to the budget. + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxParts(5); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("alt bomb", "UTF-8"); + + final MimeMultipart alt = new MimeMultipart("alternative"); + // 50 text/html alternatives + 1 text/plain that would otherwise be the + // only counted child; under the old code partCount stays at 2. 
+ for (int i = 0; i < 50; i++) { + final MimeBodyPart bp = new MimeBodyPart(); + bp.setContent("HTML " + i + "", "text/html; charset=UTF-8"); + alt.addBodyPart(bp); + } + final MimeBodyPart plain = new MimeBodyPart(); + plain.setText("plain", "UTF-8"); + alt.addBodyPart(plain); + + msg.setContent(alt); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("part count")); + } + } + + @Test + public void test_maxBodyBytes_strictCapIncludesTrailingSeparator() throws Exception { + // Regression: when the encoded body length exactly equals the + // remaining budget, the old code still appended a trailing space, + // pushing bodyBytes one byte past maxBodyBytes. The fix reserves the + // separator byte before deciding to append the full text. + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(8); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("exact", "UTF-8"); + // 8 ASCII bytes — exactly equals maxBodyBytes; the fit branch must NOT + // append a trailing space and exceed the cap. + msg.setText("12345678", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + final String content = data.getContent(); + // Must not exceed maxBodyBytes (8 bytes / 8 ASCII chars). 
+ logger.info("test_maxBodyBytes_strictCapIncludesTrailingSeparator content.length={}", content.length()); + assertTrue(content.length() <= 8); + } + } + + @Test + public void test_multipartAlternative_prefersPlainText() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("alt", "UTF-8"); + + final MimeMultipart alt = new MimeMultipart("alternative"); + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("PLAIN_BODY", "UTF-8"); + alt.addBodyPart(textPart); + final MimeBodyPart htmlPart = new MimeBodyPart(); + htmlPart.setContent("HTML_BODY", "text/html; charset=UTF-8"); + alt.addBodyPart(htmlPart); + + msg.setContent(alt); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String content = data.getContent(); + assertTrue(content.contains("PLAIN_BODY")); + assertFalse(content.contains("HTML_BODY")); + } + } + + // -------------------------------------------------------------------- + // New tests + // -------------------------------------------------------------------- + + @Test + public void test_maxMessageBytes_enforcedBeforeParsing() throws Exception { + // Build a small valid EML, then set maxMessageBytes very small (64 bytes) + // so that even a minimal message stream exceeds it. 
+ final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("test subject", "UTF-8"); + msg.setText("Hello, this is a test EML body that is longer than 64 bytes definitely!", "UTF-8"); + msg.saveChanges(); + + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxMessageBytes(64); + + try (final InputStream in = toStream(msg)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("message size")); + } + } + + @Test + public void test_attachment_extractorOutputRespectsMaxBodyBytes() throws Exception { + // Build a stub extractor that returns 1 MiB of content + final String largeContent = "x".repeat(1024 * 1024); + final Extractor stubExtractor = new Extractor() { + @Override + public ExtractData getText(final InputStream in, final Map params) { + return new ExtractData(largeContent); + } + }; + + // Register stub via a fresh container with the stub registered for application/pdf + final StandardCrawlerContainer container = new StandardCrawlerContainer().singleton("emlExtractor", EmlExtractor.class); + container.singleton("mimeTypeHelper", MimeTypeHelperImpl.class) + . 
singleton("extractorFactory", ExtractorFactory.class, factory -> { + factory.addExtractor("application/pdf", stubExtractor); + }); + final EmlExtractor extractor = container.getComponent("emlExtractor"); + extractor.setMaxBodyBytes(1024); + + // Build an EML with a text body and an application/pdf attachment + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("attachment test", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("body text", "UTF-8"); + mp.addBodyPart(textPart); + + final MimeBodyPart attachment = new MimeBodyPart(); + attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf"); + attachment.setFileName("report.pdf"); + attachment.setDisposition(jakarta.mail.Part.ATTACHMENT); + mp.addBodyPart(attachment); + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + // Allow small overhead for separator + assertTrue(data.getContent().length() <= 2048); + } + } + + @Test + public void test_appendAttachment_propagatesMaxLengthExceededException() throws Exception { + // Stub extractor that always throws MaxLengthExceededException + final Extractor stubExtractor = new Extractor() { + @Override + public ExtractData getText(final InputStream in, final Map params) { + throw new MaxLengthExceededException("stub size exceeded"); + } + }; + + final StandardCrawlerContainer container = new StandardCrawlerContainer().singleton("emlExtractor", EmlExtractor.class); + container.singleton("mimeTypeHelper", MimeTypeHelperImpl.class) + . 
singleton("extractorFactory", ExtractorFactory.class, factory -> { + factory.addExtractor("application/pdf", stubExtractor); + }); + final EmlExtractor extractor = container.getComponent("emlExtractor"); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("propagation test", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("body", "UTF-8"); + mp.addBodyPart(textPart); + + final MimeBodyPart attachment = new MimeBodyPart(); + attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf"); + attachment.setFileName("big.pdf"); + attachment.setDisposition(jakarta.mail.Part.ATTACHMENT); + mp.addBodyPart(attachment); + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + // Expected — exception must propagate, not be swallowed + } + } + + @Test + public void test_recursion_exactlyAtMaxDepth_succeeds() throws Exception { + // Depth accounting (each wrap contributes 2 depth levels: multipart + rfc822 part): + // root message (depth 0) → multipart bp (depth 1) → message/rfc822 content (depth 2) → inner text/* (depth 3) + // With maxRecursionDepth=3, depth=3 is allowed (3 <= 3), so 1 wrap must succeed. + // With maxRecursionDepth=1, depth=2 > 1 fails, so 1 wrap with max=1 must fail. 
+ final Session session = newSession(); + + // Build innermost leaf message with setText + final MimeMessage inner = new MimeMessage(session); + inner.setFrom(new InternetAddress("inner@example.com")); + inner.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + inner.setSubject("inner", "UTF-8"); + inner.setText("innermost", "UTF-8"); + inner.saveChanges(); + + // Wrap once: root → multipart → rfc822 bodypart → inner (text/plain at depth 3) + final MimeMessage outer = new MimeMessage(session); + outer.setFrom(new InternetAddress("outer@example.com")); + outer.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + outer.setSubject("outer", "UTF-8"); + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart nested = new MimeBodyPart(); + nested.setContent(inner, "message/rfc822"); + mp.addBodyPart(nested); + outer.setContent(mp); + outer.saveChanges(); + + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxRecursionDepth(3); + + // 1 wrap at maxRecursionDepth=3 must succeed (inner text at depth 3) + try (final InputStream in = toStream(outer)) { + final ExtractData data = extractor.getText(in, null); + assertTrue(data.getContent().contains("innermost")); + } + + // With maxRecursionDepth=1, the rfc822 content at depth 2 exceeds the limit + extractor.setMaxRecursionDepth(1); + try (final InputStream in = toStream(outer)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("recursion")); + } + } + + @Test + public void test_decodesRfc2047_recipientsAndReplyTo() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + + final InternetAddress toAddr = new InternetAddress("to@example.com", "田中 一郎", "UTF-8"); + final InternetAddress ccAddr = new InternetAddress("cc@example.com", "鈴木 
花子", "UTF-8"); + final InternetAddress bccAddr = new InternetAddress("bcc@example.com", "佐藤 次郎", "UTF-8"); + final InternetAddress replyAddr = new InternetAddress("reply@example.com", "山本 三郎", "UTF-8"); + + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { toAddr }); + msg.setRecipients(Message.RecipientType.CC, new InternetAddress[] { ccAddr }); + msg.setRecipients(Message.RecipientType.BCC, new InternetAddress[] { bccAddr }); + msg.setReplyTo(new InternetAddress[] { replyAddr }); + msg.setSubject("multi-recipient", "UTF-8"); + msg.setText("body", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + + final String[] toValues = data.getValues("to"); + assertNotNull(toValues); + assertTrue(toValues[0].contains("田中 一郎")); + + final String[] ccValues = data.getValues("cc"); + assertNotNull(ccValues); + assertTrue(ccValues[0].contains("鈴木 花子")); + + final String[] bccValues = data.getValues("bcc"); + assertNotNull(bccValues); + assertTrue(bccValues[0].contains("佐藤 次郎")); + + final String[] replyToValues = data.getValues("replyTo"); + assertNotNull(replyToValues); + assertTrue(replyToValues[0].contains("山本 三郎")); + } + } + + @Test + public void test_normalizedDateAndMessageIdMetadata() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("date test", "UTF-8"); + msg.setText("body", "UTF-8"); + + // Set a known sent date + final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); + sdf.setTimeZone(TimeZone.getTimeZone("UTC")); + final Date sentDate = sdf.parse("2025-01-15T10:30:00.000Z"); + msg.setSentDate(sentDate); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, 
null); + + // sentDate must be ISO-8601 UTC + final String[] sentDateValues = data.getValues("sentDate"); + assertNotNull(sentDateValues); + assertEquals("2025-01-15T10:30:00.000Z", sentDateValues[0]); + + // messageId must be absent when not explicitly set (JavaMail may auto-generate one) + // In this test we verify it is present since saveChanges() generates a Message-ID + // Just ensure the key exists and is non-empty when present + final String[] msgIdValues = data.getValues("messageId"); + // JavaMail always generates a Message-ID on saveChanges, so it must be present + assertNotNull(msgIdValues); + assertTrue(msgIdValues[0].length() > 0); + } + + // Verify messageId absent when message has no Message-ID header + // Build message without calling saveChanges to avoid auto-generation + final MimeMessage msg2 = new MimeMessage(newSession()); + msg2.setFrom(new InternetAddress("sender@example.com")); + msg2.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg2.setSubject("no message id", "UTF-8"); + msg2.setText("body", "UTF-8"); + // Do not call saveChanges; remove Message-ID header if present + msg2.removeHeader("Message-ID"); + msg2.saveChanges(); + msg2.removeHeader("Message-ID"); + + try (final InputStream in = toStream(msg2)) { + final ExtractData data = emlExtractor.getText(in, null); + // messageId should be absent since we removed the Message-ID header + assertNull(data.getValues("messageId")); + } + } + + @Test + public void test_textPart_iso2022jp_decodedCorrectly() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("iso-2022-jp test", "UTF-8"); + msg.setText("こんにちは", "ISO-2022-JP"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = 
emlExtractor.getText(in, null); + assertTrue(data.getContent().contains("こんにちは")); + } + } + + @Test + public void test_textPart_unknownCharset_fallsBackToUtf8() throws Exception { + // Build raw EML bytes to avoid JavaMail rejecting the bogus charset during serialization. + // The body text is pure ASCII ("hello") which is valid in any charset including the fallback UTF-8. + final String boundary = "----=_Part_0_12345678.90"; + final String rawEml = "From: sender@example.com\r\n" + "To: r@example.com\r\n" + "Subject: unknown charset\r\n" + + "MIME-Version: 1.0\r\n" + "Content-Type: multipart/mixed; boundary=\"" + boundary + "\"\r\n" + "\r\n" + "--" + boundary + + "\r\n" + "Content-Type: text/plain; charset=bogus-cs-9\r\n" + "Content-Transfer-Encoding: 7bit\r\n" + "\r\n" + "hello\r\n" + + "--" + boundary + "--\r\n"; + + try (final InputStream in = new ByteArrayInputStream(rawEml.getBytes(java.nio.charset.StandardCharsets.US_ASCII))) { + final ExtractData data = emlExtractor.getText(in, null); + assertTrue(data.getContent().contains("hello")); + } + } + + @Test + public void test_textPart_noCharsetParameter_decodesAsUtf8() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("no charset", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart textPart = new MimeBodyPart(); + // Content-Type without charset parameter + textPart.setContent("hello world", "text/plain"); + mp.addBodyPart(textPart); + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + assertTrue(data.getContent().contains("hello world")); + } + } + + @Test + public void test_multipleAttachments_allRecorded() throws Exception { + final MimeMessage msg = new 
MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("multiple attachments", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("body", "UTF-8"); + mp.addBodyPart(textPart); + + final String[] filenames = { "file1.txt", "file2.doc", "file3.xml" }; + for (final String name : filenames) { + final MimeBodyPart att = new MimeBodyPart(); + att.setContent("content of " + name, "application/octet-stream"); + att.setFileName(name); + att.setDisposition(jakarta.mail.Part.ATTACHMENT); + mp.addBodyPart(att); + } + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String[] names = data.getValues("attachmentNames"); + assertNotNull(names); + final java.util.List nameList = Arrays.asList(names); + assertTrue(nameList.contains("file1.txt")); + assertTrue(nameList.contains("file2.doc")); + assertTrue(nameList.contains("file3.xml")); + } + } + + @Test + public void test_inlineDispositionWithFilename_recordedAsAttachment() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("inline attachment", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart("related"); + + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("body with inline", "UTF-8"); + mp.addBodyPart(textPart); + + // Inline disposition with filename — should be recorded as an attachment + final MimeBodyPart inlinePart = new MimeBodyPart(); + inlinePart.setContent(new byte[] { (byte) 0x89, 0x50, 0x4E, 0x47 }, "image/png"); + 
inlinePart.setFileName("logo.png"); + inlinePart.setDisposition(jakarta.mail.Part.INLINE); + mp.addBodyPart(inlinePart); + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String[] names = data.getValues("attachmentNames"); + assertNotNull(names); + assertTrue(Arrays.stream(names).anyMatch(n -> n.contains("logo.png"))); + } + } + + @Test + public void test_maxBodyBytes_acrossMultipleParts() throws Exception { + final int maxBytes = 50; + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(maxBytes); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("two parts", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + + // First part: 30 ASCII bytes + final MimeBodyPart part1 = new MimeBodyPart(); + part1.setText("a".repeat(30), "UTF-8"); + mp.addBodyPart(part1); + + // Second part: 30 ASCII bytes — combined exceeds maxBytes + final MimeBodyPart part2 = new MimeBodyPart(); + part2.setText("b".repeat(30), "UTF-8"); + mp.addBodyPart(part2); + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + final String content = data.getContent(); + // Total must not exceed maxBodyBytes + assertTrue(content.length() <= maxBytes); + } + } + + @Test + public void test_setters_rejectInvalidValues() { + final EmlExtractor extractor = new EmlExtractor(); + + try { + extractor.setMaxParts(0); + fail(); + } catch (final IllegalArgumentException e) { + // expected + } + + try { + extractor.setMaxParts(-1); + fail(); + } catch (final IllegalArgumentException e) { + // expected + } + + try { + extractor.setMaxBodyBytes(0); + fail(); + } catch (final 
IllegalArgumentException e) { + // expected + } + + try { + extractor.setMaxMessageBytes(0); + fail(); + } catch (final IllegalArgumentException e) { + // expected + } + + try { + extractor.setMaxRecursionDepth(-1); + fail(); + } catch (final IllegalArgumentException e) { + // expected + } + + // setMaxRecursionDepth(0) must be accepted (root-only is valid) + extractor.setMaxRecursionDepth(0); + assertEquals(0, extractor.getMaxRecursionDepth()); + } + + @Test + public void test_getReceivedDate_parsesWithSemicolon() throws Exception { + // Build a message with a Received header in standard RFC 5322 form + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("received date test", "UTF-8"); + msg.setText("body", "UTF-8"); + // Add a Received header with semicolon-separated date + msg.addHeader("Received", "from foo.example.com by bar.example.com; Sun, 11 Nov 2012 02:39:59 +0000"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String[] receivedDate = data.getValues("Received-Date"); + assertNotNull(receivedDate); + assertEquals("2012-11-11T02:39:59.000Z", receivedDate[0]); + } + } + + @Test + public void test_getReceivedDate_skipsMalformedDowInComment() throws Exception { + // DOW abbreviation in a comment, but valid date after semicolon + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("received comment test", "UTF-8"); + msg.setText("body", "UTF-8"); + // The "(Mon)" in the routing portion should not confuse the parser; + // the date after ";" is the authoritative date + msg.addHeader("Received", "from 
foo (Mon gateway) by bar; Mon, 11 Nov 2013 05:00:00 +0000"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String[] receivedDate = data.getValues("Received-Date"); + assertNotNull(receivedDate); + assertEquals("2013-11-11T05:00:00.000Z", receivedDate[0]); + } + } + + @Test + public void test_manyReceivedHeaders_bounded() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("many received headers", "UTF-8"); + msg.setText("body", "UTF-8"); + + // Add 500 garbage Received headers first + for (int i = 0; i < 500; i++) { + msg.addHeader("Received", "garbage entry number " + i); + } + // Then add one valid Received header — but since we cap at 100, this valid one + // at index 500 will NOT be seen. We verify that extraction at least completes + // without error and does not blow up on unbounded iteration. + // (The valid header is beyond the 100-entry cap, so receivedDate may be null.) + msg.addHeader("Received", "from x by y; Mon, 11 Nov 2013 05:00:00 +0000"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + // Just verify it completes without exception and content is non-null + assertNotNull(data.getContent()); + } + } + + @Test + public void test_getDecodeText_returnsRawOnUnsupportedEncoding() { + // An encoded-word with an unknown charset should return the raw input, not empty string. + // Use a charset that is genuinely unsupported in the JVM. + // Note: if the JVM happens to support the charset, this test may fall back gracefully. + // We use a clearly bogus encoding name to guarantee UnsupportedEncodingException. 
+ final String raw = "=?bogus-cs-9?B?dGVzdA==?="; + // MimeUtility.decodeText will throw UnsupportedEncodingException for unknown charset; + // getDecodeText must return the raw value unchanged in that case. + final String result = emlExtractor.getDecodeText(raw); + // Either successfully decoded (if JVM finds charset) or returns raw value + // The contract is: never return empty string when input is non-empty + assertNotNull(result); + assertTrue(result.length() > 0); + // If decoding fails, must return the raw string, not empty string + // (We can't force the failure path here without mocking, but we verify no empty return) + } }