From 1b27c0b15a9a9fa0c4a0a4026015468a3ac172da Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Tue, 5 May 2026 07:57:29 +0900 Subject: [PATCH 1/5] fix(extractor): decode RFC 2047 headers and bound EML recursion/bytes EML content is untrusted. The new bounds defend against deeply nested or massively multi-part messages that could exhaust memory. - Decode RFC 2047 encoded-word headers (Subject, From, To, Cc, Bcc, Reply-To) via MimeUtility.decodeText for the new normalized metadata keys (subject, from, to, cc, bcc, replyTo). - Add maxRecursionDepth (default 10) for nested message/rfc822 and multipart parts; throw MaxLengthExceededException when exceeded. - Add maxParts (default 1000) and maxBodyBytes (default 50 MiB) DoS guards. - Expose attachmentNames (multivalue metadata) without extracting binary content. - Set common metadata: subject, from, to, cc, bcc, replyTo, sentDate, receivedDate, messageId. - Preserve previous behavior: text alternatives prefer text/plain, legacy headers (Subject, From, To, ...) remain available. Adds tests for body extraction, RFC 2047 decoding (Subject and From display name), attachment filename collection, recursion bomb, max parts, body byte truncation, and multipart/alternative preference. 
--- .../crawler/extractor/impl/EmlExtractor.java | 391 ++++++++++++++++-- .../extractor/impl/EmlExtractorTest.java | 235 ++++++++++- 2 files changed, 598 insertions(+), 28 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java index 363fe063..e57b6c36 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java @@ -18,11 +18,14 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Properties; import java.util.TimeZone; @@ -33,6 +36,7 @@ import org.codelibs.fess.crawler.Constants; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.ExtractException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.MimeTypeHelper; @@ -52,6 +56,19 @@ /** * Gets a text from .eml file. * + *

<p>EML content is treated as untrusted. The extractor enforces the following + * defensive bounds against malformed or malicious messages:</p>

+ * + *

<p>RFC 2047 encoded-word headers (e.g. {@code Subject}, + * {@code From}, {@code To}) are decoded via {@link MimeUtility#decodeText}.</p>

+ * * @author shinsuke * */ @@ -65,6 +82,15 @@ public class EmlExtractor extends AbstractExtractor { /** Properties used for mail processing */ protected Properties mailProperties = new Properties(); + /** Maximum allowed nesting depth for multipart / message/rfc822 parts. */ + protected int maxRecursionDepth = 10; + + /** Maximum allowed total number of MIME parts visited per message. */ + protected int maxParts = 1000; + + /** Maximum total body bytes (UTF-8) appended to the extracted content. */ + protected long maxBodyBytes = 50L * 1024 * 1024; + /** * Constructs a new EmlExtractor. */ @@ -86,8 +112,9 @@ public ExtractData getText(final InputStream in, final Map param try { final Session mailSession = Session.getDefaultInstance(props, null); final MimeMessage message = new MimeMessage(mailSession, in); - final String content = getBodyText(message); - final ExtractData data = new ExtractData(content != null ? content : StringUtil.EMPTY); + final BodyExtractionContext ctx = new BodyExtractionContext(); + extractBody(message, ctx, 0); + final ExtractData data = new ExtractData(ctx.body.toString()); final Enumeration
headers = message.getAllHeaders(); while (headers.hasMoreElements()) { final Header header = headers.nextElement(); @@ -114,9 +141,28 @@ public ExtractData getText(final InputStream in, final Map param putValue(data, "To", message.getRecipients(Message.RecipientType.TO)); putValue(data, "Cc", message.getRecipients(Message.RecipientType.CC)); putValue(data, "Bcc", message.getRecipients(Message.RecipientType.BCC)); + + // normalized convenience metadata (always RFC 2047 decoded) + putDecodedHeaderValue(data, "subject", message.getSubject()); + putDecodedAddressValues(data, "from", message.getFrom()); + putDecodedAddressValues(data, "to", message.getRecipients(Message.RecipientType.TO)); + putDecodedAddressValues(data, "cc", message.getRecipients(Message.RecipientType.CC)); + putDecodedAddressValues(data, "bcc", message.getRecipients(Message.RecipientType.BCC)); + putDecodedAddressValues(data, "replyTo", message.getReplyTo()); + putDateValue(data, "sentDate", message.getSentDate()); + putDateValue(data, "receivedDate", getReceivedDate(message)); + if (message.getMessageID() != null) { + data.putValue("messageId", message.getMessageID()); + } + + if (!ctx.attachmentNames.isEmpty()) { + data.putValues("attachmentNames", ctx.attachmentNames.toArray(new String[0])); + } return data; } catch (final MessagingException e) { throw new ExtractException(e); + } catch (final IOException e) { + throw new ExtractException(e); } } @@ -161,6 +207,57 @@ protected void putValue(final ExtractData data, final String key, final Object v } } + /** + * Stores a decoded header value if non-null/non-blank. 
+ * + * @param data the extract data + * @param key the metadata key + * @param raw the raw header value, may be {@code null} + */ + protected void putDecodedHeaderValue(final ExtractData data, final String key, final String raw) { + if (raw == null) { + return; + } + final String decoded = getDecodeText(raw); + if (!StringUtil.isEmpty(decoded)) { + data.putValue(key, decoded); + } + } + + /** + * Stores a decoded address array as a multivalue metadata entry. + * + * @param data the extract data + * @param key the metadata key + * @param addresses the address array, may be {@code null} + */ + protected void putDecodedAddressValues(final ExtractData data, final String key, final Address[] addresses) { + if (addresses == null || addresses.length == 0) { + return; + } + final String[] values = new String[addresses.length]; + for (int i = 0; i < addresses.length; i++) { + values[i] = getDecodeText(addresses[i].toString()); + } + data.putValues(key, values); + } + + /** + * Stores a Date as an ISO-8601 UTC string under the given key. + * + * @param data the extract data + * @param key the metadata key + * @param date the date, may be {@code null} + */ + protected void putDateValue(final ExtractData data, final String key, final Date date) { + if (date == null) { + return; + } + final SimpleDateFormat sdf = new SimpleDateFormat(Constants.ISO_DATETIME_FORMAT); + sdf.setTimeZone(TimeZone.getTimeZone("UTC")); + data.putValue(key, sdf.format(date)); + } + /** * Decodes MIME-encoded text. * @@ -197,52 +294,273 @@ public void setMailProperties(final Properties mailProperties) { this.mailProperties = mailProperties; } + /** + * Returns the maximum allowed recursion depth. + * + * @return the maximum recursion depth + */ + public int getMaxRecursionDepth() { + return maxRecursionDepth; + } + + /** + * Sets the maximum allowed recursion depth for nested multipart / + * {@code message/rfc822} parts. 
+ * + * @param maxRecursionDepth the maximum recursion depth + */ + public void setMaxRecursionDepth(final int maxRecursionDepth) { + this.maxRecursionDepth = maxRecursionDepth; + } + + /** + * Returns the maximum total number of MIME parts visited per message. + * + * @return the maximum number of parts + */ + public int getMaxParts() { + return maxParts; + } + + /** + * Sets the maximum total number of MIME parts visited per message. + * + * @param maxParts the maximum number of parts + */ + public void setMaxParts(final int maxParts) { + this.maxParts = maxParts; + } + + /** + * Returns the maximum total UTF-8 body bytes appended to extracted content. + * + * @return the maximum body bytes + */ + public long getMaxBodyBytes() { + return maxBodyBytes; + } + + /** + * Sets the maximum total UTF-8 body bytes appended to extracted content. + * + * @param maxBodyBytes the maximum body bytes + */ + public void setMaxBodyBytes(final long maxBodyBytes) { + this.maxBodyBytes = maxBodyBytes; + } + /** * Extracts the body text from a MIME message. * + *

<p>Retained for backwards compatibility. Internally delegates to + * {@link #extractBody(Part, BodyExtractionContext, int)} with a fresh + * context.</p>

+ * * @param message the MIME message to extract text from * @return the extracted body text * @throws ExtractException if extraction fails */ protected String getBodyText(final MimeMessage message) { - final StringBuilder buf = new StringBuilder(1000); try { - final Object content = message.getContent(); - if (content instanceof final Multipart multipart) { - final int count = multipart.getCount(); - for (int i = 0; i < count; i++) { - final BodyPart bodyPart = multipart.getBodyPart(i); - if (Part.ATTACHMENT.equalsIgnoreCase(bodyPart.getDisposition())) { - appendAttachment(buf, bodyPart); - } else if (bodyPart.isMimeType("text/plain") || bodyPart.isMimeType("text/html")) { - buf.append(bodyPart.getContent().toString()).append(' '); - } else if (bodyPart.isMimeType("multipart/alternative") && bodyPart.getContent() instanceof Multipart) { - final Multipart alternativePart = (Multipart) bodyPart.getContent(); - for (int j = 0; j < alternativePart.getCount(); j++) { - final BodyPart innerBodyPart = alternativePart.getBodyPart(j); - if (innerBodyPart.isMimeType("text/plain")) { - buf.append(innerBodyPart.getContent().toString()).append(' '); - break; - } + final BodyExtractionContext ctx = new BodyExtractionContext(); + extractBody(message, ctx, 0); + return ctx.body.toString(); + } catch (MessagingException | IOException e) { + throw new ExtractException(e); + } + } + + /** + * Recursively extracts text content from a MIME part, enforcing recursion, + * part-count, and body-byte bounds. 
+ * + * @param part the current MIME part + * @param ctx the extraction context tracking accumulated state + * @param depth the current recursion depth (root = 0) + * @throws MessagingException if a JavaMail call fails + * @throws IOException if reading part content fails + */ + protected void extractBody(final Part part, final BodyExtractionContext ctx, final int depth) throws MessagingException, IOException { + if (depth > maxRecursionDepth) { + throw new MaxLengthExceededException("EML recursion too deep: depth=" + depth + " max=" + maxRecursionDepth); + } + ctx.partCount++; + if (ctx.partCount > maxParts) { + throw new MaxLengthExceededException("EML part count exceeded: max=" + maxParts); + } + + // Treat explicitly-marked attachments as attachments regardless of mime type. + if (Part.ATTACHMENT.equalsIgnoreCase(part.getDisposition())) { + recordAttachment(ctx, part); + return; + } + + if (part.isMimeType("text/*")) { + final Object content; + try { + content = part.getContent(); + } catch (final IOException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to read text part content.", e); + } + return; + } + if (content != null) { + appendBody(ctx, content.toString()); + } + return; + } + + if (part.isMimeType("multipart/alternative")) { + final Object content = part.getContent(); + if (content instanceof Multipart) { + final Multipart mp = (Multipart) content; + // Prefer text/plain alternative; fall back to first text/* alternative. 
+ BodyPart chosen = null; + for (int i = 0; i < mp.getCount(); i++) { + final BodyPart bp = mp.getBodyPart(i); + if (bp.isMimeType("text/plain")) { + chosen = bp; + break; + } + } + if (chosen == null) { + for (int i = 0; i < mp.getCount(); i++) { + final BodyPart bp = mp.getBodyPart(i); + if (bp.isMimeType("text/*")) { + chosen = bp; + break; } } } - } else if (content instanceof String) { - buf.append(content.toString()); + if (chosen != null) { + extractBody(chosen, ctx, depth + 1); + } else { + // No text alternative; recurse into all parts (attachments, nested multipart). + for (int i = 0; i < mp.getCount(); i++) { + extractBody(mp.getBodyPart(i), ctx, depth + 1); + } + } + } + return; + } + + if (part.isMimeType("multipart/*")) { + final Object content = part.getContent(); + if (content instanceof Multipart) { + final Multipart mp = (Multipart) content; + for (int i = 0; i < mp.getCount(); i++) { + extractBody(mp.getBodyPart(i), ctx, depth + 1); + } + } + return; + } + + if (part.isMimeType("message/rfc822")) { + final Object content = part.getContent(); + if (content instanceof Part) { + extractBody((Part) content, ctx, depth + 1); + } + return; + } + + // Anything else with a filename is an inline attachment-like part. + recordAttachment(ctx, part); + } + + /** + * Records an attachment filename (decoded) and attempts in-extractor text + * extraction for known mime types, mirroring previous behavior. 
+ * + * @param ctx the extraction context + * @param part the attachment-like part + */ + protected void recordAttachment(final BodyExtractionContext ctx, final Part part) { + try { + final String rawName = part.getFileName(); + if (!StringUtil.isEmpty(rawName)) { + final String decoded = getDecodeText(rawName); + if (!StringUtil.isEmpty(decoded)) { + ctx.attachmentNames.add(decoded); + } + } + } catch (final MessagingException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to read attachment filename.", e); } - } catch (MessagingException | IOException e) { - throw new ExtractException(e); } - return buf.toString(); + if (part instanceof BodyPart) { + appendAttachment(ctx, (BodyPart) part); + } } /** - * Appends attachment content to the buffer if it can be extracted. + * Appends body text to the extraction context, enforcing + * {@link #maxBodyBytes}. Truncates any text that would push the total over + * the limit. + * + * @param ctx the extraction context + * @param text the text to append + */ + protected void appendBody(final BodyExtractionContext ctx, final String text) { + if (text == null || text.isEmpty()) { + return; + } + if (ctx.bodyBytes >= maxBodyBytes) { + return; + } + final byte[] bytes = text.getBytes(StandardCharsets.UTF_8); + final long remaining = maxBodyBytes - ctx.bodyBytes; + if (bytes.length <= remaining) { + ctx.body.append(text).append(' '); + ctx.bodyBytes += bytes.length + 1; + } else { + // Truncate at character boundary that fits within remaining budget. 
+ int lo = 0; + int hi = text.length(); + while (lo < hi) { + final int mid = lo + ((hi - lo + 1) >>> 1); + if (text.substring(0, mid).getBytes(StandardCharsets.UTF_8).length <= remaining) { + lo = mid; + } else { + hi = mid - 1; + } + } + if (lo > 0) { + final String truncated = text.substring(0, lo); + ctx.body.append(truncated); + ctx.bodyBytes += truncated.getBytes(StandardCharsets.UTF_8).length; + } + ctx.bodyBytes = maxBodyBytes; + if (logger.isDebugEnabled()) { + logger.debug("EML body truncated at {} bytes.", maxBodyBytes); + } + } + } + + /** + * Backwards-compatible attachment text extraction. Kept for subclasses that + * may have overridden it; new code should prefer + * {@link #appendAttachment(BodyExtractionContext, BodyPart)}. * * @param buf the buffer to append content to * @param bodyPart the body part containing the attachment */ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart) { + final BodyExtractionContext ctx = new BodyExtractionContext(); + ctx.body = buf; + appendAttachment(ctx, bodyPart); + } + + /** + * Attempts to extract text from an attachment using a registered + * {@link Extractor} for its detected MIME type. Failures are silently + * swallowed. 
+ * + * @param ctx the extraction context + * @param bodyPart the attachment body part + */ + protected void appendAttachment(final BodyExtractionContext ctx, final BodyPart bodyPart) { final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); final ExtractorFactory extractorFactory = getExtractorFactory(); try { @@ -255,7 +573,9 @@ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart final Map map = new HashMap<>(); map.put(ExtractData.RESOURCE_NAME_KEY, filename); final String content = extractor.getText(in, map).getContent(); - buf.append(content).append(' '); + if (content != null) { + appendBody(ctx, content); + } } catch (final Exception e) { if (logger.isDebugEnabled()) { logger.debug("Exception in an internal extractor.", e); @@ -312,4 +632,21 @@ private static String getDateString(final String text) { } return null; } + + /** + * Mutable state shared across recursive body extraction. + */ + protected static class BodyExtractionContext { + /** Accumulated body text. */ + protected StringBuilder body = new StringBuilder(1000); + + /** Number of MIME parts visited so far. */ + protected int partCount; + + /** UTF-8 bytes already appended to {@link #body}. */ + protected long bodyBytes; + + /** Decoded attachment filenames. 
*/ + protected List attachmentNames = new ArrayList<>(); + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java index c0e39b25..ebe0d3dd 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java @@ -15,8 +15,12 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.util.Arrays; +import java.util.Properties; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -24,13 +28,20 @@ import org.codelibs.fess.crawler.container.StandardCrawlerContainer; import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; +import org.codelibs.fess.crawler.exception.MaxLengthExceededException; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; import org.dbflute.utflute.core.PlainTestCase; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; +import jakarta.mail.Message; +import jakarta.mail.Session; +import jakarta.mail.internet.InternetAddress; +import jakarta.mail.internet.MimeBodyPart; +import jakarta.mail.internet.MimeMessage; +import jakarta.mail.internet.MimeMultipart; + /** * @author shinsuke * @@ -115,4 +126,226 @@ public void test_getText_null() { // NOP } } + + // -------------------------------------------------------------------- + // Programmatically-built fixtures + // -------------------------------------------------------------------- + + private static Session newSession() { + return Session.getInstance(new 
Properties(), null); + } + + private static InputStream toStream(final MimeMessage msg) throws Exception { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + msg.writeTo(baos); + return new ByteArrayInputStream(baos.toByteArray()); + } + + @Test + public void test_extractsBody() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("Hello", "UTF-8"); + msg.setText("Hello, world!", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + assertTrue(data.getContent().contains("Hello, world!")); + assertEquals("Hello", data.getValues("subject")[0]); + } + } + + @Test + public void test_decodesRfc2047Subject() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + // setSubject(text, charset) auto-encodes as RFC 2047 when non-ASCII + msg.setSubject("こんにちは", "UTF-8"); + msg.setText("body", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + // Raw header preserves RFC 2047 encoded form when present + final String raw = data.getValues("Subject")[0]; + assertTrue(raw.contains("=?") || raw.equals("こんにちは")); + // Normalized "subject" metadata is decoded + assertEquals("こんにちは", data.getValues("subject")[0]); + } + } + + @Test + public void test_decodesRfc2047From() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + // Personal name in non-ASCII triggers RFC 2047 encoding on serialization + final InternetAddress from = new 
InternetAddress("sender@example.com", "山田 太郎", "UTF-8"); + msg.setFrom(from); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("test", "UTF-8"); + msg.setText("body", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String[] fromValues = data.getValues("from"); + assertNotNull(fromValues); + assertTrue(fromValues.length >= 1); + final String decoded = fromValues[0]; + assertTrue(decoded.contains("山田 太郎")); + assertTrue(decoded.contains("sender@example.com")); + } + } + + @Test + public void test_extractsAttachmentFilenames() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("with attachment", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("see attached", "UTF-8"); + mp.addBodyPart(textPart); + + final MimeBodyPart attachment = new MimeBodyPart(); + // tiny PDF-like payload; content does not need to be valid for filename extraction + attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf"); + attachment.setFileName("report.pdf"); + attachment.setDisposition(jakarta.mail.Part.ATTACHMENT); + mp.addBodyPart(attachment); + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String[] names = data.getValues("attachmentNames"); + assertNotNull(names); + assertTrue(Arrays.stream(names).anyMatch(n -> n.contains("report.pdf"))); + } + } + + @Test + public void test_recursionBomb_throwsException() throws Exception { + // Build a chain of nested message/rfc822 parts deeper 
than the configured limit. + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxRecursionDepth(3); + // Reuse the surrounding container's helper / factory wiring for a fair test: + // delegate directly via a fresh instance is fine because we don't traverse into attachments here. + + final Session session = newSession(); + // innermost message + MimeMessage current = new MimeMessage(session); + current.setFrom(new InternetAddress("inner@example.com")); + current.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + current.setSubject("inner", "UTF-8"); + current.setText("innermost body", "UTF-8"); + current.saveChanges(); + + // Wrap in N layers of message/rfc822 inside a multipart, exceeding the bound + final int wrapCount = 8; + for (int i = 0; i < wrapCount; i++) { + final MimeMessage outer = new MimeMessage(session); + outer.setFrom(new InternetAddress("layer" + i + "@example.com")); + outer.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + outer.setSubject("layer " + i, "UTF-8"); + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart nested = new MimeBodyPart(); + nested.setContent(current, "message/rfc822"); + mp.addBodyPart(nested); + outer.setContent(mp); + outer.saveChanges(); + current = outer; + } + + try (final InputStream in = toStream(current)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("recursion")); + } + } + + @Test + public void test_maxParts_throwsException() throws Exception { + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxParts(5); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + 
msg.setSubject("many parts", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + for (int i = 0; i < 50; i++) { + final MimeBodyPart p = new MimeBodyPart(); + p.setText("part " + i, "UTF-8"); + mp.addBodyPart(p); + } + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("part count")); + } + } + + @Test + public void test_maxBodyBytes_truncates() throws Exception { + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(32); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("long body", "UTF-8"); + // body comfortably exceeds 32 bytes + final StringBuilder body = new StringBuilder(); + for (int i = 0; i < 200; i++) { + body.append('a'); + } + msg.setText(body.toString(), "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + final String content = data.getContent(); + // Body must be truncated; the 200-char input is no longer there in full. 
+ assertTrue(content.length() <= 64); + assertTrue(content.length() < 200); + } + } + + @Test + public void test_multipartAlternative_prefersPlainText() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("alt", "UTF-8"); + + final MimeMultipart alt = new MimeMultipart("alternative"); + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("PLAIN_BODY", "UTF-8"); + alt.addBodyPart(textPart); + final MimeBodyPart htmlPart = new MimeBodyPart(); + htmlPart.setContent("HTML_BODY", "text/html; charset=UTF-8"); + alt.addBodyPart(htmlPart); + + msg.setContent(alt); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + final String content = data.getContent(); + assertTrue(content.contains("PLAIN_BODY")); + assertFalse(content.contains("HTML_BODY")); + } + } } From 51a58595e7a114233b18908c44a13fb4528792bd Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Tue, 5 May 2026 12:35:28 +0900 Subject: [PATCH 2/5] fix(extractor): use CharsetEncoder for EML body truncation to avoid O(N log N) OOM --- .../crawler/extractor/impl/EmlExtractor.java | 60 +++++++++++++------ .../extractor/impl/EmlExtractorTest.java | 35 +++++++++++ 2 files changed, 77 insertions(+), 18 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java index e57b6c36..51efab81 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java @@ -18,6 +18,11 @@ import java.io.IOException; import java.io.InputStream; import 
java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.text.SimpleDateFormat; @@ -499,6 +504,12 @@ protected void recordAttachment(final BodyExtractionContext ctx, final Part part * {@link #maxBodyBytes}. Truncates any text that would push the total over * the limit. * + *

<p>Uses a single {@link CharsetEncoder} pass into a fixed-size + * {@link ByteBuffer} sized to the remaining byte budget, so memory usage is + * bounded by the budget rather than the input size. This avoids the + * pathological O(N log N) memory blow-up of repeatedly slicing and + * re-encoding very large strings.</p>

+ * * @param ctx the extraction context * @param text the text to append */ @@ -509,27 +520,40 @@ protected void appendBody(final BodyExtractionContext ctx, final String text) { if (ctx.bodyBytes >= maxBodyBytes) { return; } - final byte[] bytes = text.getBytes(StandardCharsets.UTF_8); - final long remaining = maxBodyBytes - ctx.bodyBytes; - if (bytes.length <= remaining) { + final long remainingLong = maxBodyBytes - ctx.bodyBytes; + // Cap the per-call buffer to Integer.MAX_VALUE; further input is rejected + // once the cumulative limit is reached on subsequent calls. + final int remaining = (int) Math.min(remainingLong, Integer.MAX_VALUE); + + // Allocate a single buffer sized to the remaining budget. This is the + // only large allocation; we never materialize the full input as bytes. + final ByteBuffer out = ByteBuffer.allocate(remaining); + final CharBuffer in = CharBuffer.wrap(text); + final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + final CoderResult cr = encoder.encode(in, out, true); + // Flush any pending state; if it still overflows, we treat as truncated. + CoderResult flush = CoderResult.UNDERFLOW; + if (!cr.isOverflow()) { + flush = encoder.flush(out); + } + final boolean overflow = cr.isOverflow() || flush.isOverflow(); + final int encoded = out.position(); + + if (!overflow) { + // The whole text fit within the remaining budget. ctx.body.append(text).append(' '); - ctx.bodyBytes += bytes.length + 1; + ctx.bodyBytes += encoded + 1; } else { - // Truncate at character boundary that fits within remaining budget. 
- int lo = 0; - int hi = text.length(); - while (lo < hi) { - final int mid = lo + ((hi - lo + 1) >>> 1); - if (text.substring(0, mid).getBytes(StandardCharsets.UTF_8).length <= remaining) { - lo = mid; - } else { - hi = mid - 1; - } - } - if (lo > 0) { - final String truncated = text.substring(0, lo); + // Decode the actually-encoded prefix bytes back to a String. UTF-8 + // decoding handles partial multi-byte sequences at the boundary by + // producing a replacement character or trimming, so the result is + // always a valid String — no manual character-boundary search. + out.flip(); + final String truncated = StandardCharsets.UTF_8.decode(out).toString(); + if (!truncated.isEmpty()) { ctx.body.append(truncated); - ctx.bodyBytes += truncated.getBytes(StandardCharsets.UTF_8).length; } ctx.bodyBytes = maxBodyBytes; if (logger.isDebugEnabled()) { diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java index ebe0d3dd..9966c1fa 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java @@ -323,6 +323,41 @@ public void test_maxBodyBytes_truncates() throws Exception { } } + @Test + public void test_maxBodyBytes_largeInputIsBounded() throws Exception { + // Regression: previous binary-search truncation called text.substring(0, mid).getBytes(UTF_8) + // O(log N) times, each allocating up to ~N bytes. For very large text parts this + // self-OOMs and is also catastrophically slow. The CharsetEncoder-based path + // allocates only ~maxBodyBytes worth of memory once. 
+ final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(1024); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("huge body", "UTF-8"); + + // 5 MiB of 'a' characters — well within typical heap, but large enough that the + // old O(N log N) truncation would be visibly slow. + final int size = 5 * 1024 * 1024; + final char[] chars = new char[size]; + Arrays.fill(chars, 'a'); + msg.setText(new String(chars), "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final long start = System.nanoTime(); + final ExtractData data = extractor.getText(in, null); + final long elapsedMs = (System.nanoTime() - start) / 1_000_000L; + final String content = data.getContent(); + // Bounded by maxBodyBytes (allow a small overhead for trailing space etc.). + assertTrue(content.length() <= 2048); + // Sanity: the streaming truncation must complete quickly (well under a second). + logger.info("test_maxBodyBytes_largeInputIsBounded elapsed={}ms contentLen={}", elapsedMs, content.length()); + assertTrue(elapsedMs < 1000); + } + } + @Test public void test_multipartAlternative_prefersPlainText() throws Exception { final MimeMessage msg = new MimeMessage(newSession()); From d03d9edddf2aa1eb63d6c20c18fbc0faf48ef3a1 Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Tue, 5 May 2026 13:44:50 +0900 Subject: [PATCH 3/5] fix(extractor): replace ByteBuffer truncation with single-pass UTF-8 boundary walk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous CharsetEncoder approach allocated a ByteBuffer sized to the entire remaining maxBodyBytes budget (50 MiB by default) on every appendBody call — even for small text parts. 
Under concurrent multipart EML processing this multiplied to gigabytes of throwaway allocations. Encode the text once with String.getBytes(UTF_8) (memory proportional to input, not budget) and walk back over UTF-8 continuation bytes to land on a code-point boundary when truncation is needed. Adds test_maxBodyBytes_truncatesAtUtf8CodePointBoundary verifying the boundary walk-back never produces a U+FFFD replacement char. --- .../crawler/extractor/impl/EmlExtractor.java | 71 ++++++------------- .../extractor/impl/EmlExtractorTest.java | 34 ++++++++- 2 files changed, 54 insertions(+), 51 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java index 51efab81..293e349e 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java @@ -18,11 +18,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.CharsetEncoder; -import java.nio.charset.CoderResult; -import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.text.SimpleDateFormat; @@ -504,11 +499,10 @@ protected void recordAttachment(final BodyExtractionContext ctx, final Part part * {@link #maxBodyBytes}. Truncates any text that would push the total over * the limit. * - *

Uses a single {@link CharsetEncoder} pass into a fixed-size - * {@link ByteBuffer} sized to the remaining byte budget, so memory usage is - * bounded by the budget rather than the input size. This avoids the - * pathological O(N log N) memory blow-up of repeatedly slicing and - * re-encoding very large strings.

+ *

Encodes the text once with {@link String#getBytes(java.nio.charset.Charset)} + * (memory proportional to the input, not to the configured budget). When + * truncation is needed, walks back over UTF-8 continuation bytes (at most + * three steps) so the cut lands on a code-point boundary.

* * @param ctx the extraction context * @param text the text to append @@ -520,45 +514,26 @@ protected void appendBody(final BodyExtractionContext ctx, final String text) { if (ctx.bodyBytes >= maxBodyBytes) { return; } - final long remainingLong = maxBodyBytes - ctx.bodyBytes; - // Cap the per-call buffer to Integer.MAX_VALUE; further input is rejected - // once the cumulative limit is reached on subsequent calls. - final int remaining = (int) Math.min(remainingLong, Integer.MAX_VALUE); - - // Allocate a single buffer sized to the remaining budget. This is the - // only large allocation; we never materialize the full input as bytes. - final ByteBuffer out = ByteBuffer.allocate(remaining); - final CharBuffer in = CharBuffer.wrap(text); - final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder() - .onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - final CoderResult cr = encoder.encode(in, out, true); - // Flush any pending state; if it still overflows, we treat as truncated. - CoderResult flush = CoderResult.UNDERFLOW; - if (!cr.isOverflow()) { - flush = encoder.flush(out); - } - final boolean overflow = cr.isOverflow() || flush.isOverflow(); - final int encoded = out.position(); - - if (!overflow) { - // The whole text fit within the remaining budget. + final byte[] bytes = text.getBytes(StandardCharsets.UTF_8); + final long remaining = maxBodyBytes - ctx.bodyBytes; + if (bytes.length <= remaining) { ctx.body.append(text).append(' '); - ctx.bodyBytes += encoded + 1; - } else { - // Decode the actually-encoded prefix bytes back to a String. UTF-8 - // decoding handles partial multi-byte sequences at the boundary by - // producing a replacement character or trimming, so the result is - // always a valid String — no manual character-boundary search. 
- out.flip(); - final String truncated = StandardCharsets.UTF_8.decode(out).toString(); - if (!truncated.isEmpty()) { - ctx.body.append(truncated); - } - ctx.bodyBytes = maxBodyBytes; - if (logger.isDebugEnabled()) { - logger.debug("EML body truncated at {} bytes.", maxBodyBytes); - } + ctx.bodyBytes += bytes.length + 1; + return; + } + // Truncate at a UTF-8 code-point boundary that fits within the remaining + // budget. Continuation bytes have the bit pattern 10xxxxxx, so walk back + // until we land on a start byte (or zero). Bounded by 3 iterations. + int cutoff = (int) Math.min(remaining, (long) bytes.length); + while (cutoff > 0 && (bytes[cutoff] & 0xC0) == 0x80) { + cutoff--; + } + if (cutoff > 0) { + ctx.body.append(new String(bytes, 0, cutoff, StandardCharsets.UTF_8)); + } + ctx.bodyBytes = maxBodyBytes; + if (logger.isDebugEnabled()) { + logger.debug("EML body truncated at {} bytes.", maxBodyBytes); } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java index 9966c1fa..d4a8def3 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java @@ -326,9 +326,9 @@ public void test_maxBodyBytes_truncates() throws Exception { @Test public void test_maxBodyBytes_largeInputIsBounded() throws Exception { // Regression: previous binary-search truncation called text.substring(0, mid).getBytes(UTF_8) - // O(log N) times, each allocating up to ~N bytes. For very large text parts this - // self-OOMs and is also catastrophically slow. The CharsetEncoder-based path - // allocates only ~maxBodyBytes worth of memory once. + // O(log N) times, each allocating up to ~N bytes — catastrophically slow on multi-MiB + // text parts. 
The current path encodes once and walks back over UTF-8 continuation + // bytes to land on a code-point boundary. final EmlExtractor extractor = new EmlExtractor(); extractor.setMaxBodyBytes(1024); @@ -358,6 +358,34 @@ public void test_maxBodyBytes_largeInputIsBounded() throws Exception { } } + @Test + public void test_maxBodyBytes_truncatesAtUtf8CodePointBoundary() throws Exception { + // The body is 10 copies of "あ" (3 bytes each in UTF-8 = 30 bytes total). + // With maxBodyBytes=10, the cap falls inside the 4th character. The truncation + // must walk back over continuation bytes and land at byte 9 (3 complete chars), + // never producing a half-encoded code point or a U+FFFD replacement. + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(10); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("multibyte", "UTF-8"); + final StringBuilder body = new StringBuilder(); + for (int i = 0; i < 10; i++) { + body.append('あ'); // あ + } + msg.setText(body.toString(), "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + final String content = data.getContent(); + // Truncation must not leak U+FFFD from a partial code point. 
+ assertFalse(content.contains("�")); + } + } + @Test public void test_multipartAlternative_prefersPlainText() throws Exception { final MimeMessage msg = new MimeMessage(newSession()); From 636e6a0ea03540a8f938bee47fa073a3db3b4691 Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Tue, 5 May 2026 21:17:17 +0900 Subject: [PATCH 4/5] fix(extractor): close maxParts/maxBodyBytes bypasses in EML parsing Three audit findings on PR #166: - multipart/alternative previously charged only the chosen child to ctx.partCount, so an attacker could bypass maxParts by stuffing thousands of unused alternatives. Now charges count - 1 for skipped alternatives (the chosen one is counted via its own extractBody call), re-checking the cap before recursion. - text/* parts were fully decoded into a String via Part.getContent() before any maxBodyBytes check, peaking heap at multiples of the part size. Replaced with a streaming read from Part.getInputStream() capped at 4 * remaining-UTF-8-budget + 16 bytes (enough to fill any UTF-8 cap regardless of source charset, but bounded relative to maxBodyBytes rather than to the part size). - appendBody appended a trailing space even when the encoded text exactly filled the remaining budget, exceeding maxBodyBytes by 1. Reserve the separator byte before taking the fit branch and guard cutoff < bytes.length when walking back continuation bytes. Adds regression tests for the alternative-bypass and the strict cap. 
--- .../crawler/extractor/impl/EmlExtractor.java | 125 +++++++++++++++--- .../extractor/impl/EmlExtractorTest.java | 64 +++++++++ 2 files changed, 169 insertions(+), 20 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java index 293e349e..89471cbc 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java @@ -15,9 +15,11 @@ */ package org.codelibs.fess.crawler.extractor.impl; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.text.SimpleDateFormat; @@ -49,6 +51,7 @@ import jakarta.mail.Multipart; import jakarta.mail.Part; import jakarta.mail.Session; +import jakarta.mail.internet.ContentType; import jakarta.mail.internet.MailDateFormat; import jakarta.mail.internet.MimeMessage; import jakarta.mail.internet.MimeUtility; @@ -396,18 +399,7 @@ protected void extractBody(final Part part, final BodyExtractionContext ctx, fin } if (part.isMimeType("text/*")) { - final Object content; - try { - content = part.getContent(); - } catch (final IOException e) { - if (logger.isDebugEnabled()) { - logger.debug("Failed to read text part content.", e); - } - return; - } - if (content != null) { - appendBody(ctx, content.toString()); - } + appendTextPart(ctx, part); return; } @@ -415,9 +407,10 @@ protected void extractBody(final Part part, final BodyExtractionContext ctx, fin final Object content = part.getContent(); if (content instanceof Multipart) { final Multipart mp = (Multipart) content; + final int count = mp.getCount(); // Prefer text/plain alternative; fall back to first text/* alternative. 
BodyPart chosen = null; - for (int i = 0; i < mp.getCount(); i++) { + for (int i = 0; i < count; i++) { final BodyPart bp = mp.getBodyPart(i); if (bp.isMimeType("text/plain")) { chosen = bp; @@ -425,7 +418,7 @@ protected void extractBody(final Part part, final BodyExtractionContext ctx, fin } } if (chosen == null) { - for (int i = 0; i < mp.getCount(); i++) { + for (int i = 0; i < count; i++) { final BodyPart bp = mp.getBodyPart(i); if (bp.isMimeType("text/*")) { chosen = bp; @@ -434,10 +427,20 @@ protected void extractBody(final Part part, final BodyExtractionContext ctx, fin } } if (chosen != null) { + // Charge the partCount budget for every alternative — even those we + // don't recurse into — so an attacker can't bypass maxParts by + // stuffing thousands of unused alternatives. The chosen part is + // counted via its own extractBody call below, so charge count - 1. + if (count > 1) { + ctx.partCount += count - 1; + if (ctx.partCount > maxParts) { + throw new MaxLengthExceededException("EML part count exceeded: max=" + maxParts); + } + } extractBody(chosen, ctx, depth + 1); } else { - // No text alternative; recurse into all parts (attachments, nested multipart). - for (int i = 0; i < mp.getCount(); i++) { + // No text alternative; recurse into all parts (each counted normally). + for (int i = 0; i < count; i++) { extractBody(mp.getBodyPart(i), ctx, depth + 1); } } @@ -497,7 +500,7 @@ protected void recordAttachment(final BodyExtractionContext ctx, final Part part /** * Appends body text to the extraction context, enforcing * {@link #maxBodyBytes}. Truncates any text that would push the total over - * the limit. + * the limit (including the trailing separator space). * *

Encodes the text once with {@link String#getBytes(java.nio.charset.Charset)} * (memory proportional to the input, not to the configured budget). When @@ -516,16 +519,17 @@ protected void appendBody(final BodyExtractionContext ctx, final String text) { } final byte[] bytes = text.getBytes(StandardCharsets.UTF_8); final long remaining = maxBodyBytes - ctx.bodyBytes; - if (bytes.length <= remaining) { + // Reserve 1 byte for the trailing separator space so the strict cap holds. + if ((long) bytes.length + 1L <= remaining) { ctx.body.append(text).append(' '); - ctx.bodyBytes += bytes.length + 1; + ctx.bodyBytes += (long) bytes.length + 1L; return; } // Truncate at a UTF-8 code-point boundary that fits within the remaining // budget. Continuation bytes have the bit pattern 10xxxxxx, so walk back // until we land on a start byte (or zero). Bounded by 3 iterations. int cutoff = (int) Math.min(remaining, (long) bytes.length); - while (cutoff > 0 && (bytes[cutoff] & 0xC0) == 0x80) { + while (cutoff > 0 && cutoff < bytes.length && (bytes[cutoff] & 0xC0) == 0x80) { cutoff--; } if (cutoff > 0) { @@ -537,6 +541,87 @@ protected void appendBody(final BodyExtractionContext ctx, final String text) { } } + /** + * Streams a text part's content into the extraction buffer with a hard + * memory cap, then delegates to {@link #appendBody} for byte-accurate + * truncation. + * + *

The previous implementation called {@link Part#getContent()}, which + * fully decoded the part into a Java {@code String}. A multi-GB + * {@code text/plain} part would peak heap usage at multiples of its raw + * size before any {@link #maxBodyBytes} check ran, defeating the DoS + * guard at the memory layer.

+ * + *

This implementation reads from {@link Part#getInputStream} (which + * already decodes Content-Transfer-Encoding) capped at {@code 4 * + * remaining UTF-8 budget + 16} bytes — enough to fill any UTF-8 budget + * regardless of source charset (UTF-8 uses at most 4 bytes per code + * point), but bounded relative to {@link #maxBodyBytes} rather than to + * the part size.

+ * + * @param ctx the extraction context + * @param part the {@code text/*} part + */ + protected void appendTextPart(final BodyExtractionContext ctx, final Part part) { + if (ctx.bodyBytes >= maxBodyBytes) { + return; + } + final long remaining = maxBodyBytes - ctx.bodyBytes; + // Cap source reads at 4× the remaining UTF-8 budget plus a small pad, + // clamped to a sane upper bound so we never allocate a buffer larger + // than is needed to fill the UTF-8 cap. + final long sourceCapL; + if (remaining > (Integer.MAX_VALUE - 16L) / 4L) { + sourceCapL = Integer.MAX_VALUE - 16L; + } else { + sourceCapL = remaining * 4L + 16L; + } + final int sourceCap = (int) sourceCapL; + + Charset charset = StandardCharsets.UTF_8; + try { + final String contentType = part.getContentType(); + if (contentType != null) { + final String cs = new ContentType(contentType).getParameter("charset"); + if (cs != null && !cs.isEmpty()) { + try { + charset = Charset.forName(MimeUtility.javaCharset(cs)); + } catch (final Exception ignored) { + // Unknown / unsupported charset → fall back to UTF-8 + } + } + } + } catch (final MessagingException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to parse content type of text part.", e); + } + } + + try (InputStream is = part.getInputStream()) { + // Initial buffer scaled to the source cap, but never larger than 64KiB + // to keep small messages cheap. ByteArrayOutputStream grows on demand. 
+ final ByteArrayOutputStream baos = new ByteArrayOutputStream(Math.min(sourceCap, 64 * 1024)); + final byte[] buf = new byte[Math.min(sourceCap, 8 * 1024)]; + int total = 0; + int n; + while (total < sourceCap && (n = is.read(buf, 0, Math.min(buf.length, sourceCap - total))) > 0) { + baos.write(buf, 0, n); + total += n; + } + if (total > 0) { + appendBody(ctx, new String(baos.toByteArray(), charset)); + } + } catch (final IOException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to read text part content.", e); + } + } catch (final MessagingException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to access text part input stream.", e); + } + } + } + /** * Backwards-compatible attachment text extraction. Kept for subclasses that * may have overridden it; new code should prefer diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java index d4a8def3..cb92064d 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java @@ -386,6 +386,70 @@ public void test_maxBodyBytes_truncatesAtUtf8CodePointBoundary() throws Exceptio } } + @Test + public void test_multipartAlternative_partsCountedTowardMaxParts() throws Exception { + // Regression: multipart/alternative previously charged only the chosen + // part (and the parent multipart node) to ctx.partCount, letting an + // attacker bypass maxParts by stuffing thousands of unused + // alternatives. The fix charges every alternative to the budget. 
+ final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxParts(5); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("alt bomb", "UTF-8"); + + final MimeMultipart alt = new MimeMultipart("alternative"); + // 50 text/html alternatives + 1 text/plain that would otherwise be the + // only counted child; under the old code partCount stays at 2. + for (int i = 0; i < 50; i++) { + final MimeBodyPart bp = new MimeBodyPart(); + bp.setContent("HTML " + i + "", "text/html; charset=UTF-8"); + alt.addBodyPart(bp); + } + final MimeBodyPart plain = new MimeBodyPart(); + plain.setText("plain", "UTF-8"); + alt.addBodyPart(plain); + + msg.setContent(alt); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("part count")); + } + } + + @Test + public void test_maxBodyBytes_strictCapIncludesTrailingSeparator() throws Exception { + // Regression: when the encoded body length exactly equals the + // remaining budget, the old code still appended a trailing space, + // pushing bodyBytes one byte past maxBodyBytes. The fix reserves the + // separator byte before deciding to append the full text. + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxBodyBytes(8); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("exact", "UTF-8"); + // 8 ASCII bytes — exactly equals maxBodyBytes; the fit branch must NOT + // append a trailing space and exceed the cap. 
+ msg.setText("12345678", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + final String content = data.getContent(); + // Must not exceed maxBodyBytes (8 bytes / 8 ASCII chars). + logger.info("test_maxBodyBytes_strictCapIncludesTrailingSeparator content.length={}", content.length()); + assertTrue(content.length() <= 8); + } + } + @Test public void test_multipartAlternative_prefersPlainText() throws Exception { final MimeMessage msg = new MimeMessage(newSession()); From 1b26fec3f017183a9f198bb18225873d1c37541e Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Sat, 16 May 2026 11:00:31 +0900 Subject: [PATCH 5/5] fix(extractor): harden EmlExtractor against malicious EML inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address multi-agent code review findings on PR #166. Tightens defensive bounds, error handling, and input validation in EmlExtractor without weakening any existing happy-path behavior. Security / DoS: - Add maxMessageBytes (default 100 MiB) enforced via LimitedInputStream before MimeMessage parses, closing the parser-stage memory hole. - Wrap attachment streams in LimitedInputStream sized to the remaining UTF-8 budget so nested extractors cannot bypass maxBodyBytes. - Re-throw MaxLengthExceededException from appendAttachment instead of swallowing it, preserving the security signal up the call chain. - Replace ByteArrayOutputStream-based text part decoding with a streaming InputStreamReader capped at remaining chars, eliminating amplification on wide source charsets (UTF-32 etc.). - Cap Received header iteration at 100 entries and hoist MailDateFormat out of the loop. Correctness: - getDateString now anchors on the rightmost ';' per RFC 5322 §3.6.7, falling back to the day-of-week scan only when no ';' is present. Avoids false-positive matches inside Received-header comments. 
- getDecodeText returns the raw value (not empty string) on UnsupportedEncodingException, so malformed headers stay non-empty. - Cache getReceivedDate(message) so legacy and normalized keys reuse the same result. - Replace Session.getDefaultInstance with Session.getInstance so per-call mail properties are honored. Error handling: - Surface text-part read failures at WARN (matching HtmlExtractor / PdfExtractor / TikaExtractor conventions); previously hidden at DEBUG. - Narrow charset-resolution catch to IllegalCharsetNameException / UnsupportedCharsetException and log fallback to UTF-8 at WARN. - Narrow putValue catch from Exception to RuntimeException and log at WARN with key=value format. - Replace silent '// ignore' on ParseException with a DEBUG log. API: - Reject non-positive values in setMaxRecursionDepth (negative only), setMaxParts, setMaxBodyBytes, setMaxMessageBytes via IllegalArgumentException. - @Deprecated appendAttachment(StringBuilder, BodyPart) and document that it does not honor maxBodyBytes cumulatively. Tests: - Add 17 new tests covering maxMessageBytes enforcement, attachment budget propagation, MaxLengthExceededException re-throw, recursion boundary, RFC 2047 decoding for to/cc/bcc/replyTo, normalized date and messageId metadata, ISO-2022-JP / unknown / missing charsets, multiple and inline attachments, cross-part body budget, setter validation, Received-header parsing edge cases, and getDecodeText fallback. - Tighten existing tests: test_decodesRfc2047Subject now asserts equality instead of permitting either form; remove flaky timing assertion from test_maxBodyBytes_largeInputIsBounded; tighten threshold in test_maxBodyBytes_truncates. 
--- .../crawler/extractor/impl/EmlExtractor.java | 286 ++++++--- .../extractor/impl/EmlExtractorTest.java | 548 +++++++++++++++++- 2 files changed, 758 insertions(+), 76 deletions(-) diff --git a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java index 89471cbc..3bf1ecf0 100644 --- a/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java +++ b/fess-crawler/src/main/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractor.java @@ -15,12 +15,15 @@ */ package org.codelibs.fess.crawler.extractor.impl; -import java.io.ByteArrayOutputStream; +import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -62,6 +65,10 @@ *

EML content is treated as untrusted. The extractor enforces the following * defensive bounds against malformed or malicious messages:

*
    + *
  • {@link #maxMessageBytes} (default 100 MiB) is the first-line defense: + * the raw input stream is capped before {@code MimeMessage} even begins + * to parse, preventing memory exhaustion from pathologically large + * messages.
  • *
  • {@link #maxRecursionDepth} (default 10) caps how deeply nested * {@code message/rfc822} or {@code multipart/*} parts may be.
  • *
  • {@link #maxParts} (default 1000) caps the total number of MIME parts @@ -71,6 +78,8 @@ *
*

RFC 2047 encoded-word headers (e.g. {@code Subject}, * {@code From}, {@code To}) are decoded via {@link MimeUtility#decodeText}.

+ *

<p>The legacy {@code Subject} metadata key is RFC 2047-decoded for + * compatibility with older callers.</p>

* * @author shinsuke * @@ -94,6 +103,9 @@ public class EmlExtractor extends AbstractExtractor { /** Maximum total body bytes (UTF-8) appended to the extracted content. */ protected long maxBodyBytes = 50L * 1024 * 1024; + /** Maximum allowed total stream bytes consumed while parsing the EML. */ + protected long maxMessageBytes = 100L * 1024 * 1024; + /** * Constructs a new EmlExtractor. */ @@ -112,9 +124,13 @@ public ExtractData getText(final InputStream in, final Map param props.put(entry.getKey(), entry.getValue()); } } + if (in == null) { + throw new ExtractException("Input stream is null."); + } + final LimitedInputStream limited = new LimitedInputStream(in, maxMessageBytes); try { - final Session mailSession = Session.getDefaultInstance(props, null); - final MimeMessage message = new MimeMessage(mailSession, in); + final Session mailSession = Session.getInstance(props, null); + final MimeMessage message = new MimeMessage(mailSession, limited); final BodyExtractionContext ctx = new BodyExtractionContext(); extractBody(message, ctx, 0); final ExtractData data = new ExtractData(ctx.body.toString()); @@ -134,7 +150,8 @@ public ExtractData getText(final InputStream in, final Map param putValue(data, "Line-Count", message.getLineCount()); putValue(data, "Message-ID", message.getMessageID()); putValue(data, "Message-Number", message.getMessageNumber()); - putValue(data, "Received-Date", getReceivedDate(message)); + final Date receivedDate = getReceivedDate(message); + putValue(data, "Received-Date", receivedDate); putValue(data, "Reply-To", message.getReplyTo()); putValue(data, "Sender", message.getSender()); putValue(data, "Sent-Date", message.getSentDate()); @@ -153,7 +170,7 @@ public ExtractData getText(final InputStream in, final Map param putDecodedAddressValues(data, "bcc", message.getRecipients(Message.RecipientType.BCC)); putDecodedAddressValues(data, "replyTo", message.getReplyTo()); putDateValue(data, "sentDate", message.getSentDate()); - putDateValue(data, 
"receivedDate", getReceivedDate(message)); + putDateValue(data, "receivedDate", receivedDate); if (message.getMessageID() != null) { data.putValue("messageId", message.getMessageID()); } @@ -163,8 +180,14 @@ public ExtractData getText(final InputStream in, final Map param } return data; } catch (final MessagingException e) { + if (limited.isExceeded()) { + throw new MaxLengthExceededException("EML message size exceeded: max=" + maxMessageBytes); + } throw new ExtractException(e); } catch (final IOException e) { + if (limited.isExceeded()) { + throw new MaxLengthExceededException("EML message size exceeded: max=" + maxMessageBytes); + } throw new ExtractException(e); } } @@ -203,10 +226,8 @@ protected void putValue(final ExtractData data, final String key, final Object v } else if (value != null) { data.putValue(key, value.toString()); } - } catch (final Exception e) { - if (logger.isDebugEnabled()) { - logger.debug("Failed to put {}:{}", key, value, e); - } + } catch (final RuntimeException e) { + logger.warn("Failed to put header value. key={}", key, e); } } @@ -264,8 +285,12 @@ protected void putDateValue(final ExtractData data, final String key, final Date /** * Decodes MIME-encoded text. * + *

<p>On {@link UnsupportedEncodingException} (caused by an unrecognised RFC 2047 + * charset), logs a warning and returns the raw value unchanged so + * callers still receive some usable output rather than an empty string.</p>

+ * * @param value the encoded text to decode - * @return the decoded text or empty string if decoding fails + * @return the decoded text, the raw value on encoding failure, or empty string for null input */ protected String getDecodeText(final String value) { if (value == null) { @@ -274,8 +299,8 @@ protected String getDecodeText(final String value) { try { return MimeUtility.decodeText(value); } catch (final UnsupportedEncodingException e) { - logger.warn("Invalid encoding.", e); - return StringUtil.EMPTY; + logger.warn("Invalid RFC 2047 encoding, returning raw value. value={}", value, e); + return value; } } @@ -308,11 +333,16 @@ public int getMaxRecursionDepth() { /** * Sets the maximum allowed recursion depth for nested multipart / - * {@code message/rfc822} parts. + * {@code message/rfc822} parts. A value of {@code 0} means only the root + * part is processed (no recursion). Negative values are rejected. * - * @param maxRecursionDepth the maximum recursion depth + * @param maxRecursionDepth the maximum recursion depth; must be >= 0 + * @throws IllegalArgumentException if the value is negative */ public void setMaxRecursionDepth(final int maxRecursionDepth) { + if (maxRecursionDepth < 0) { + throw new IllegalArgumentException("maxRecursionDepth must be positive: " + maxRecursionDepth); + } this.maxRecursionDepth = maxRecursionDepth; } @@ -328,9 +358,13 @@ public int getMaxParts() { /** * Sets the maximum total number of MIME parts visited per message. * - * @param maxParts the maximum number of parts + * @param maxParts the maximum number of parts; must be > 0 + * @throws IllegalArgumentException if the value is <= 0 */ public void setMaxParts(final int maxParts) { + if (maxParts <= 0) { + throw new IllegalArgumentException("maxParts must be positive: " + maxParts); + } this.maxParts = maxParts; } @@ -346,12 +380,39 @@ public long getMaxBodyBytes() { /** * Sets the maximum total UTF-8 body bytes appended to extracted content. 
* - * @param maxBodyBytes the maximum body bytes + * @param maxBodyBytes the maximum body bytes; must be > 0 + * @throws IllegalArgumentException if the value is <= 0 */ public void setMaxBodyBytes(final long maxBodyBytes) { + if (maxBodyBytes <= 0) { + throw new IllegalArgumentException("maxBodyBytes must be positive: " + maxBodyBytes); + } this.maxBodyBytes = maxBodyBytes; } + /** + * Returns the maximum allowed total stream bytes consumed while parsing the EML. + * + * @return the maximum message bytes + */ + public long getMaxMessageBytes() { + return maxMessageBytes; + } + + /** + * Sets the maximum allowed total stream bytes consumed while parsing the EML. + * This is the first-line defense before {@link MimeMessage} parses the input. + * + * @param maxMessageBytes the maximum message bytes; must be > 0 + * @throws IllegalArgumentException if the value is <= 0 + */ + public void setMaxMessageBytes(final long maxMessageBytes) { + if (maxMessageBytes <= 0) { + throw new IllegalArgumentException("maxMessageBytes must be positive: " + maxMessageBytes); + } + this.maxMessageBytes = maxMessageBytes; + } + /** * Extracts the body text from a MIME message. * @@ -537,27 +598,32 @@ protected void appendBody(final BodyExtractionContext ctx, final String text) { } ctx.bodyBytes = maxBodyBytes; if (logger.isDebugEnabled()) { - logger.debug("EML body truncated at {} bytes.", maxBodyBytes); + logger.debug("EML body truncated. maxBytes={}", maxBodyBytes); } } /** - * Streams a text part's content into the extraction buffer with a hard - * memory cap, then delegates to {@link #appendBody} for byte-accurate - * truncation. + * Returns the content type of a part as a string, or {@code "unknown"} on + * {@link MessagingException}. * - *

The previous implementation called {@link Part#getContent()}, which - * fully decoded the part into a Java {@code String}. A multi-GB - * {@code text/plain} part would peak heap usage at multiples of its raw - * size before any {@link #maxBodyBytes} check ran, defeating the DoS - * guard at the memory layer.

+ * @param part the MIME part + * @return the content-type string or {@code "unknown"} + */ + private static String safeGetContentType(final Part part) { + try { + return part.getContentType(); + } catch (final MessagingException e) { + return "unknown"; + } + } + + /** + * Streams a text part's content into the extraction buffer, reading + * {@code remaining} chars at most via {@link InputStreamReader}, then + * delegates to {@link #appendBody} for byte-accurate truncation. * - *

This implementation reads from {@link Part#getInputStream} (which - * already decodes Content-Transfer-Encoding) capped at {@code 4 * - * remaining UTF-8 budget + 16} bytes — enough to fill any UTF-8 budget - * regardless of source charset (UTF-8 uses at most 4 bytes per code - * point), but bounded relative to {@link #maxBodyBytes} rather than to - * the part size.

+ *

The charset is resolved from the part's Content-Type header; if absent + * or unrecognised, UTF-8 is used as the fallback.

* * @param ctx the extraction context * @param part the {@code text/*} part @@ -567,16 +633,7 @@ protected void appendTextPart(final BodyExtractionContext ctx, final Part part) return; } final long remaining = maxBodyBytes - ctx.bodyBytes; - // Cap source reads at 4× the remaining UTF-8 budget plus a small pad, - // clamped to a sane upper bound so we never allocate a buffer larger - // than is needed to fill the UTF-8 cap. - final long sourceCapL; - if (remaining > (Integer.MAX_VALUE - 16L) / 4L) { - sourceCapL = Integer.MAX_VALUE - 16L; - } else { - sourceCapL = remaining * 4L + 16L; - } - final int sourceCap = (int) sourceCapL; + final int charCap = (int) Math.min(remaining, (long) Integer.MAX_VALUE / 4); Charset charset = StandardCharsets.UTF_8; try { @@ -586,8 +643,8 @@ protected void appendTextPart(final BodyExtractionContext ctx, final Part part) if (cs != null && !cs.isEmpty()) { try { charset = Charset.forName(MimeUtility.javaCharset(cs)); - } catch (final Exception ignored) { - // Unknown / unsupported charset → fall back to UTF-8 + } catch (final IllegalCharsetNameException | UnsupportedCharsetException e) { + logger.warn("Unsupported EML text part charset, fallback=UTF-8. charset={}", cs, e); } } } @@ -597,28 +654,22 @@ protected void appendTextPart(final BodyExtractionContext ctx, final Part part) } } - try (InputStream is = part.getInputStream()) { - // Initial buffer scaled to the source cap, but never larger than 64KiB - // to keep small messages cheap. ByteArrayOutputStream grows on demand. 
- final ByteArrayOutputStream baos = new ByteArrayOutputStream(Math.min(sourceCap, 64 * 1024)); - final byte[] buf = new byte[Math.min(sourceCap, 8 * 1024)]; + try (InputStream is = part.getInputStream(); InputStreamReader reader = new InputStreamReader(is, charset)) { + final char[] buf = new char[Math.min(charCap, 8 * 1024)]; + final StringBuilder sb = new StringBuilder(Math.min(charCap, 64 * 1024)); int total = 0; int n; - while (total < sourceCap && (n = is.read(buf, 0, Math.min(buf.length, sourceCap - total))) > 0) { - baos.write(buf, 0, n); + while (total < charCap && (n = reader.read(buf, 0, Math.min(buf.length, charCap - total))) > 0) { + sb.append(buf, 0, n); total += n; } if (total > 0) { - appendBody(ctx, new String(baos.toByteArray(), charset)); + appendBody(ctx, sb.toString()); } } catch (final IOException e) { - if (logger.isDebugEnabled()) { - logger.debug("Failed to read text part content.", e); - } + logger.warn("Failed to read text part content. contentType={}", safeGetContentType(part), e); } catch (final MessagingException e) { - if (logger.isDebugEnabled()) { - logger.debug("Failed to access text part input stream.", e); - } + logger.warn("Failed to access text part input stream. contentType={}", safeGetContentType(part), e); } } @@ -627,9 +678,16 @@ protected void appendTextPart(final BodyExtractionContext ctx, final Part part) * may have overridden it; new code should prefer * {@link #appendAttachment(BodyExtractionContext, BodyPart)}. * + * @deprecated Use {@link #appendAttachment(BodyExtractionContext, BodyPart)} instead. + * This shim creates a fresh extraction context with {@code bodyBytes=0}, so + * the {@link #maxBodyBytes} cap is enforced per call rather than cumulatively + * across a single message. Subclasses overriding this method should migrate to + * the context-aware overload. 
+ * * @param buf the buffer to append content to * @param bodyPart the body part containing the attachment */ + @Deprecated protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart) { final BodyExtractionContext ctx = new BodyExtractionContext(); ctx.body = buf; @@ -639,10 +697,12 @@ protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart /** * Attempts to extract text from an attachment using a registered * {@link Extractor} for its detected MIME type. Failures are silently - * swallowed. + * swallowed unless they are {@link MaxLengthExceededException}, which is + * re-thrown so the caller can enforce overall message size limits. * * @param ctx the extraction context * @param bodyPart the attachment body part + * @throws MaxLengthExceededException if the nested extractor signals a size limit violation */ protected void appendAttachment(final BodyExtractionContext ctx, final BodyPart bodyPart) { final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper(); @@ -653,20 +713,34 @@ protected void appendAttachment(final BodyExtractionContext ctx, final BodyPart if (mimeType != null) { final Extractor extractor = extractorFactory.getExtractor(mimeType); if (extractor != null) { - try (final InputStream in = bodyPart.getInputStream()) { + if (ctx.bodyBytes >= maxBodyBytes) { + return; + } + final long remaining = maxBodyBytes - ctx.bodyBytes; + final long sourceCapL; + if (remaining > (Integer.MAX_VALUE - 16L) / 4L) { + sourceCapL = Integer.MAX_VALUE - 16L; + } else { + sourceCapL = remaining * 4L + 16L; + } + try (final InputStream in = new LimitedInputStream(bodyPart.getInputStream(), sourceCapL)) { final Map map = new HashMap<>(); map.put(ExtractData.RESOURCE_NAME_KEY, filename); final String content = extractor.getText(in, map).getContent(); if (content != null) { appendBody(ctx, content); } + } catch (final MaxLengthExceededException e) { + throw e; } catch (final Exception e) { if (logger.isDebugEnabled()) { - 
logger.debug("Exception in an internal extractor.", e); + logger.debug("Exception in an internal extractor. filename={}", filename, e); } } } } + } catch (final MaxLengthExceededException e) { + throw e; } catch (final MessagingException e) { if (logger.isDebugEnabled()) { logger.debug("Exception in parsing BodyPart.", e); @@ -676,6 +750,8 @@ protected void appendAttachment(final BodyExtractionContext ctx, final BodyPart /** * Gets the received date from a message by parsing the received headers. + * Caps inspection to the first 100 headers to avoid unbounded work on + * messages with pathologically many {@code Received} lines. * * @param message the message to get the received date from * @return the received date or null if not found @@ -684,17 +760,25 @@ protected void appendAttachment(final BodyExtractionContext ctx, final BodyPart protected static Date getReceivedDate(final Message message) throws MessagingException { final Date today = new Date(); final String[] received = message.getHeader("received"); - if (received != null) { - for (final String v : received) { - String dateStr = null; - try { - dateStr = getDateString(v); - final Date receivedDate = new MailDateFormat().parse(dateStr); - if (!receivedDate.after(today)) { - return receivedDate; - } - } catch (final ParseException e) { - // ignore + if (received == null) { + return null; + } + final MailDateFormat format = new MailDateFormat(); + final int limit = Math.min(received.length, 100); + for (int i = 0; i < limit; i++) { + final String v = received[i]; + try { + final String dateStr = getDateString(v); + if (dateStr == null) { + continue; + } + final Date receivedDate = format.parse(dateStr); + if (!receivedDate.after(today)) { + return receivedDate; + } + } catch (final ParseException e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to parse received header. 
value={}", v, e); } } } @@ -704,10 +788,21 @@ protected static Date getReceivedDate(final Message message) throws MessagingExc /** * Extracts a date string from the received header text. * + *

Per RFC 5322 §3.6.7 the date portion follows the last {@code ;} in + * the header. If no {@code ;} is present, or nothing follows it, falls back to + * scanning for a day-of-week abbreviation.

+ * * @param text the received header text - * @return the date string starting from the day of week, or null if not found + * @return the date string, or null if not found */ private static String getDateString(final String text) { + if (text == null) { + return null; + } + final int semicolon = text.lastIndexOf(';'); + if (semicolon != -1 && semicolon + 1 < text.length()) { + return text.substring(semicolon + 1).trim(); + } for (final String dow : DAY_OF_WEEK) { final int i = text.lastIndexOf(dow); if (i != -1) { @@ -733,4 +828,51 @@ protected static class BodyExtractionContext { /** Decoded attachment filenames. */ protected List attachmentNames = new ArrayList<>(); } + + /** + * A {@link FilterInputStream} that throws {@link IOException} once the + * number of bytes read exceeds a configured limit. Used to cap raw EML + * stream consumption before {@link MimeMessage} parses the input. + */ + private static final class LimitedInputStream extends FilterInputStream { + private final long limit; + private long bytesRead; + private boolean exceeded; + + LimitedInputStream(final InputStream in, final long limit) { + super(in); + this.limit = limit; + } + + @Override + public int read() throws IOException { + final int b = super.read(); + if (b != -1) { + bytesRead++; + if (bytesRead > limit) { + exceeded = true; + throw new IOException("EML message size exceeded."); + } + } + return b; + } + + @Override + public int read(final byte[] b, final int off, final int len) throws IOException { + final int n = super.read(b, off, len); + if (n > 0) { + bytesRead += n; + if (bytesRead > limit) { + exceeded = true; + throw new IOException("EML message size exceeded."); + } + } + return n; + } + + /** Returns {@code true} if the limit was exceeded during reading. 
*/ + boolean isExceeded() { + return exceeded; + } + } } diff --git a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java index cb92064d..b125d53f 100644 --- a/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java +++ b/fess-crawler/src/test/java/org/codelibs/fess/crawler/extractor/impl/EmlExtractorTest.java @@ -19,8 +19,12 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.text.SimpleDateFormat; import java.util.Arrays; +import java.util.Date; +import java.util.Map; import java.util.Properties; +import java.util.TimeZone; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -29,6 +33,7 @@ import org.codelibs.fess.crawler.entity.ExtractData; import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.MaxLengthExceededException; +import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; import org.dbflute.utflute.core.PlainTestCase; @@ -169,9 +174,9 @@ public void test_decodesRfc2047Subject() throws Exception { try (final InputStream in = toStream(msg)) { final ExtractData data = emlExtractor.getText(in, null); - // Raw header preserves RFC 2047 encoded form when present + // Legacy `Subject` metadata key is also RFC 2047-decoded for caller convenience. 
final String raw = data.getValues("Subject")[0]; - assertTrue(raw.contains("=?") || raw.equals("こんにちは")); + assertEquals("こんにちは", raw); // Normalized "subject" metadata is decoded assertEquals("こんにちは", data.getValues("subject")[0]); } @@ -318,7 +323,7 @@ public void test_maxBodyBytes_truncates() throws Exception { final ExtractData data = extractor.getText(in, null); final String content = data.getContent(); // Body must be truncated; the 200-char input is no longer there in full. - assertTrue(content.length() <= 64); + assertTrue(content.length() <= 33); assertTrue(content.length() < 200); } } @@ -354,7 +359,6 @@ public void test_maxBodyBytes_largeInputIsBounded() throws Exception { assertTrue(content.length() <= 2048); // Sanity: the streaming truncation must complete quickly (well under a second). logger.info("test_maxBodyBytes_largeInputIsBounded elapsed={}ms contentLen={}", elapsedMs, content.length()); - assertTrue(elapsedMs < 1000); } } @@ -475,4 +479,540 @@ public void test_multipartAlternative_prefersPlainText() throws Exception { assertFalse(content.contains("HTML_BODY")); } } + + // -------------------------------------------------------------------- + // New tests + // -------------------------------------------------------------------- + + @Test + public void test_maxMessageBytes_enforcedBeforeParsing() throws Exception { + // Build a small valid EML, then set maxMessageBytes very small (64 bytes) + // so that even a minimal message stream exceeds it. 
+ final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("test subject", "UTF-8"); + msg.setText("Hello, this is a test EML body that is longer than 64 bytes definitely!", "UTF-8"); + msg.saveChanges(); + + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxMessageBytes(64); + + try (final InputStream in = toStream(msg)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("message size")); + } + } + + @Test + public void test_attachment_extractorOutputRespectsMaxBodyBytes() throws Exception { + // Build a stub extractor that returns 1 MiB of content + final String largeContent = "x".repeat(1024 * 1024); + final Extractor stubExtractor = new Extractor() { + @Override + public ExtractData getText(final InputStream in, final Map params) { + return new ExtractData(largeContent); + } + }; + + // Register stub via a fresh container with the stub registered for application/pdf + final StandardCrawlerContainer container = new StandardCrawlerContainer().singleton("emlExtractor", EmlExtractor.class); + container.singleton("mimeTypeHelper", MimeTypeHelperImpl.class) + . 
singleton("extractorFactory", ExtractorFactory.class, factory -> { + factory.addExtractor("application/pdf", stubExtractor); + }); + final EmlExtractor extractor = container.getComponent("emlExtractor"); + extractor.setMaxBodyBytes(1024); + + // Build an EML with a text body and an application/pdf attachment + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("attachment test", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("body text", "UTF-8"); + mp.addBodyPart(textPart); + + final MimeBodyPart attachment = new MimeBodyPart(); + attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf"); + attachment.setFileName("report.pdf"); + attachment.setDisposition(jakarta.mail.Part.ATTACHMENT); + mp.addBodyPart(attachment); + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = extractor.getText(in, null); + // Allow small overhead for separator + assertTrue(data.getContent().length() <= 2048); + } + } + + @Test + public void test_appendAttachment_propagatesMaxLengthExceededException() throws Exception { + // Stub extractor that always throws MaxLengthExceededException + final Extractor stubExtractor = new Extractor() { + @Override + public ExtractData getText(final InputStream in, final Map params) { + throw new MaxLengthExceededException("stub size exceeded"); + } + }; + + final StandardCrawlerContainer container = new StandardCrawlerContainer().singleton("emlExtractor", EmlExtractor.class); + container.singleton("mimeTypeHelper", MimeTypeHelperImpl.class) + . 
singleton("extractorFactory", ExtractorFactory.class, factory -> { + factory.addExtractor("application/pdf", stubExtractor); + }); + final EmlExtractor extractor = container.getComponent("emlExtractor"); + + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("recipient@example.com") }); + msg.setSubject("propagation test", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart textPart = new MimeBodyPart(); + textPart.setText("body", "UTF-8"); + mp.addBodyPart(textPart); + + final MimeBodyPart attachment = new MimeBodyPart(); + attachment.setContent(new byte[] { '%', 'P', 'D', 'F' }, "application/pdf"); + attachment.setFileName("big.pdf"); + attachment.setDisposition(jakarta.mail.Part.ATTACHMENT); + mp.addBodyPart(attachment); + + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + // Expected — exception must propagate, not be swallowed + } + } + + @Test + public void test_recursion_exactlyAtMaxDepth_succeeds() throws Exception { + // Depth accounting (each wrap contributes 2 depth levels: multipart + rfc822 part): + // root message (depth 0) → multipart bp (depth 1) → message/rfc822 content (depth 2) → inner text/* (depth 3) + // With maxRecursionDepth=3, depth=3 is allowed (3 <= 3), so 1 wrap must succeed. + // With maxRecursionDepth=1, depth=2 > 1 fails, so 1 wrap with max=1 must fail. 
+ final Session session = newSession(); + + // Build innermost leaf message with setText + final MimeMessage inner = new MimeMessage(session); + inner.setFrom(new InternetAddress("inner@example.com")); + inner.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + inner.setSubject("inner", "UTF-8"); + inner.setText("innermost", "UTF-8"); + inner.saveChanges(); + + // Wrap once: root → multipart → rfc822 bodypart → inner (text/plain at depth 3) + final MimeMessage outer = new MimeMessage(session); + outer.setFrom(new InternetAddress("outer@example.com")); + outer.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + outer.setSubject("outer", "UTF-8"); + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart nested = new MimeBodyPart(); + nested.setContent(inner, "message/rfc822"); + mp.addBodyPart(nested); + outer.setContent(mp); + outer.saveChanges(); + + final EmlExtractor extractor = new EmlExtractor(); + extractor.setMaxRecursionDepth(3); + + // 1 wrap at maxRecursionDepth=3 must succeed (inner text at depth 3) + try (final InputStream in = toStream(outer)) { + final ExtractData data = extractor.getText(in, null); + assertTrue(data.getContent().contains("innermost")); + } + + // With maxRecursionDepth=1, the rfc822 content at depth 2 exceeds the limit + extractor.setMaxRecursionDepth(1); + try (final InputStream in = toStream(outer)) { + extractor.getText(in, null); + fail(); + } catch (final MaxLengthExceededException e) { + assertTrue(e.getMessage().contains("recursion")); + } + } + + @Test + public void test_decodesRfc2047_recipientsAndReplyTo() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + + final InternetAddress toAddr = new InternetAddress("to@example.com", "田中 一郎", "UTF-8"); + final InternetAddress ccAddr = new InternetAddress("cc@example.com", "鈴木 
花子", "UTF-8"); + final InternetAddress bccAddr = new InternetAddress("bcc@example.com", "佐藤 次郎", "UTF-8"); + final InternetAddress replyAddr = new InternetAddress("reply@example.com", "山本 三郎", "UTF-8"); + + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { toAddr }); + msg.setRecipients(Message.RecipientType.CC, new InternetAddress[] { ccAddr }); + msg.setRecipients(Message.RecipientType.BCC, new InternetAddress[] { bccAddr }); + msg.setReplyTo(new InternetAddress[] { replyAddr }); + msg.setSubject("multi-recipient", "UTF-8"); + msg.setText("body", "UTF-8"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + + final String[] toValues = data.getValues("to"); + assertNotNull(toValues); + assertTrue(toValues[0].contains("田中 一郎")); + + final String[] ccValues = data.getValues("cc"); + assertNotNull(ccValues); + assertTrue(ccValues[0].contains("鈴木 花子")); + + final String[] bccValues = data.getValues("bcc"); + assertNotNull(bccValues); + assertTrue(bccValues[0].contains("佐藤 次郎")); + + final String[] replyToValues = data.getValues("replyTo"); + assertNotNull(replyToValues); + assertTrue(replyToValues[0].contains("山本 三郎")); + } + } + + @Test + public void test_normalizedDateAndMessageIdMetadata() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("date test", "UTF-8"); + msg.setText("body", "UTF-8"); + + // Set a known sent date + final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); + sdf.setTimeZone(TimeZone.getTimeZone("UTC")); + final Date sentDate = sdf.parse("2025-01-15T10:30:00.000Z"); + msg.setSentDate(sentDate); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, 
null); + + // sentDate must be ISO-8601 UTC + final String[] sentDateValues = data.getValues("sentDate"); + assertNotNull(sentDateValues); + assertEquals("2025-01-15T10:30:00.000Z", sentDateValues[0]); + + // saveChanges() above makes JavaMail auto-generate a Message-ID header, + // so here we verify that the normalized messageId key is present. + // Only existence and non-emptiness are checked; the generated value is not predictable. + final String[] msgIdValues = data.getValues("messageId"); + // JavaMail always generates a Message-ID on saveChanges, so it must be present + assertNotNull(msgIdValues); + assertTrue(msgIdValues[0].length() > 0); + } + + // Verify messageId absent when message has no Message-ID header + // Build a second message and strip the auto-generated Message-ID before serializing + final MimeMessage msg2 = new MimeMessage(newSession()); + msg2.setFrom(new InternetAddress("sender@example.com")); + msg2.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg2.setSubject("no message id", "UTF-8"); + msg2.setText("body", "UTF-8"); + // saveChanges() generates a Message-ID, so remove the header again after calling it + msg2.removeHeader("Message-ID"); + msg2.saveChanges(); + msg2.removeHeader("Message-ID"); + + try (final InputStream in = toStream(msg2)) { + final ExtractData data = emlExtractor.getText(in, null); + // messageId should be absent since we removed the Message-ID header + assertNull(data.getValues("messageId")); + } + } + + @Test + public void test_textPart_iso2022jp_decodedCorrectly() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("iso-2022-jp test", "UTF-8"); + msg.setText("こんにちは", "ISO-2022-JP"); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = 
emlExtractor.getText(in, null); + assertTrue(data.getContent().contains("こんにちは")); + } + } + + @Test + public void test_textPart_unknownCharset_fallsBackToUtf8() throws Exception { + // Build raw EML bytes to avoid JavaMail rejecting the bogus charset during serialization. + // The body text is pure ASCII ("hello") which is valid in any charset including the fallback UTF-8. + final String boundary = "----=_Part_0_12345678.90"; + final String rawEml = "From: sender@example.com\r\n" + "To: r@example.com\r\n" + "Subject: unknown charset\r\n" + + "MIME-Version: 1.0\r\n" + "Content-Type: multipart/mixed; boundary=\"" + boundary + "\"\r\n" + "\r\n" + "--" + boundary + + "\r\n" + "Content-Type: text/plain; charset=bogus-cs-9\r\n" + "Content-Transfer-Encoding: 7bit\r\n" + "\r\n" + "hello\r\n" + + "--" + boundary + "--\r\n"; + + try (final InputStream in = new ByteArrayInputStream(rawEml.getBytes(java.nio.charset.StandardCharsets.US_ASCII))) { + final ExtractData data = emlExtractor.getText(in, null); + assertTrue(data.getContent().contains("hello")); + } + } + + @Test + public void test_textPart_noCharsetParameter_decodesAsUtf8() throws Exception { + final MimeMessage msg = new MimeMessage(newSession()); + msg.setFrom(new InternetAddress("sender@example.com")); + msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") }); + msg.setSubject("no charset", "UTF-8"); + + final MimeMultipart mp = new MimeMultipart(); + final MimeBodyPart textPart = new MimeBodyPart(); + // Content-Type without charset parameter + textPart.setContent("hello world", "text/plain"); + mp.addBodyPart(textPart); + msg.setContent(mp); + msg.saveChanges(); + + try (final InputStream in = toStream(msg)) { + final ExtractData data = emlExtractor.getText(in, null); + assertTrue(data.getContent().contains("hello world")); + } + } + + @Test + public void test_multipleAttachments_allRecorded() throws Exception { + final MimeMessage msg = new 
MimeMessage(newSession());
+        msg.setFrom(new InternetAddress("sender@example.com"));
+        msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+        msg.setSubject("multiple attachments", "UTF-8");
+
+        final MimeMultipart mp = new MimeMultipart();
+
+        final MimeBodyPart textPart = new MimeBodyPart();
+        textPart.setText("body", "UTF-8");
+        mp.addBodyPart(textPart);
+
+        final String[] filenames = { "file1.txt", "file2.doc", "file3.xml" };
+        for (final String name : filenames) {
+            final MimeBodyPart att = new MimeBodyPart();
+            att.setContent("content of " + name, "application/octet-stream");
+            att.setFileName(name);
+            att.setDisposition(jakarta.mail.Part.ATTACHMENT);
+            mp.addBodyPart(att);
+        }
+
+        msg.setContent(mp);
+        msg.saveChanges();
+
+        try (final InputStream in = toStream(msg)) {
+            final ExtractData data = emlExtractor.getText(in, null);
+            final String[] names = data.getValues("attachmentNames");
+            assertNotNull(names);
+            final java.util.List<String> nameList = Arrays.asList(names);
+            assertTrue(nameList.contains("file1.txt"));
+            assertTrue(nameList.contains("file2.doc"));
+            assertTrue(nameList.contains("file3.xml"));
+        }
+    }
+
+    @Test
+    public void test_inlineDispositionWithFilename_recordedAsAttachment() throws Exception {
+        final MimeMessage msg = new MimeMessage(newSession());
+        msg.setFrom(new InternetAddress("sender@example.com"));
+        msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+        msg.setSubject("inline attachment", "UTF-8");
+
+        final MimeMultipart mp = new MimeMultipart("related");
+
+        final MimeBodyPart textPart = new MimeBodyPart();
+        textPart.setText("body with inline", "UTF-8");
+        mp.addBodyPart(textPart);
+
+        // Inline disposition with filename — should be recorded as an attachment
+        final MimeBodyPart inlinePart = new MimeBodyPart();
+        inlinePart.setContent(new byte[] { (byte) 0x89, 0x50, 0x4E, 0x47 }, "image/png");
+        inlinePart.setFileName("logo.png");
+        inlinePart.setDisposition(jakarta.mail.Part.INLINE);
+        mp.addBodyPart(inlinePart);
+
+        msg.setContent(mp);
+        msg.saveChanges();
+
+        try (final InputStream in = toStream(msg)) {
+            final ExtractData data = emlExtractor.getText(in, null);
+            final String[] names = data.getValues("attachmentNames");
+            assertNotNull(names);
+            assertTrue(Arrays.stream(names).anyMatch(n -> n.contains("logo.png")));
+        }
+    }
+
+    @Test
+    public void test_maxBodyBytes_acrossMultipleParts() throws Exception {
+        final int maxBytes = 50;
+        final EmlExtractor extractor = new EmlExtractor();
+        extractor.setMaxBodyBytes(maxBytes);
+
+        final MimeMessage msg = new MimeMessage(newSession());
+        msg.setFrom(new InternetAddress("sender@example.com"));
+        msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+        msg.setSubject("two parts", "UTF-8");
+
+        final MimeMultipart mp = new MimeMultipart();
+
+        // First part: 30 ASCII bytes
+        final MimeBodyPart part1 = new MimeBodyPart();
+        part1.setText("a".repeat(30), "UTF-8");
+        mp.addBodyPart(part1);
+
+        // Second part: 30 ASCII bytes — combined exceeds maxBytes
+        final MimeBodyPart part2 = new MimeBodyPart();
+        part2.setText("b".repeat(30), "UTF-8");
+        mp.addBodyPart(part2);
+
+        msg.setContent(mp);
+        msg.saveChanges();
+
+        try (final InputStream in = toStream(msg)) {
+            final ExtractData data = extractor.getText(in, null);
+            final String content = data.getContent();
+            // Total must not exceed maxBodyBytes (both parts are ASCII, so one char is one byte)
+            assertTrue(content.length() <= maxBytes);
+        }
+    }
+
+    @Test
+    public void test_setters_rejectInvalidValues() {
+        final EmlExtractor extractor = new EmlExtractor();
+
+        try {
+            extractor.setMaxParts(0);
+            fail();
+        } catch (final IllegalArgumentException e) {
+            // expected
+        }
+
+        try {
+            extractor.setMaxParts(-1);
+            fail();
+        } catch (final IllegalArgumentException e) {
+            // expected
+        }
+
+        try {
+            extractor.setMaxBodyBytes(0);
+            fail();
+        } catch (final IllegalArgumentException e) {
+            // expected
+        }
+
+        try {
+            extractor.setMaxMessageBytes(0);
+            fail();
+        } catch (final IllegalArgumentException e) {
+            // expected
+        }
+
+        try {
+            extractor.setMaxRecursionDepth(-1);
+            fail();
+        } catch (final IllegalArgumentException e) {
+            // expected
+        }
+
+        // setMaxRecursionDepth(0) must be accepted (root-only is valid)
+        extractor.setMaxRecursionDepth(0);
+        assertEquals(0, extractor.getMaxRecursionDepth());
+    }
+
+    @Test
+    public void test_getReceivedDate_parsesWithSemicolon() throws Exception {
+        // Build a message with a Received header in standard RFC 5322 form
+        final MimeMessage msg = new MimeMessage(newSession());
+        msg.setFrom(new InternetAddress("sender@example.com"));
+        msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+        msg.setSubject("received date test", "UTF-8");
+        msg.setText("body", "UTF-8");
+        // Add a Received header with semicolon-separated date
+        msg.addHeader("Received", "from foo.example.com by bar.example.com; Sun, 11 Nov 2012 02:39:59 +0000");
+        msg.saveChanges();
+
+        try (final InputStream in = toStream(msg)) {
+            final ExtractData data = emlExtractor.getText(in, null);
+            final String[] receivedDate = data.getValues("Received-Date");
+            assertNotNull(receivedDate);
+            assertEquals("2012-11-11T02:39:59.000Z", receivedDate[0]);
+        }
+    }
+
+    @Test
+    public void test_getReceivedDate_skipsMalformedDowInComment() throws Exception {
+        // DOW abbreviation in a comment, but valid date after semicolon
+        final MimeMessage msg = new MimeMessage(newSession());
+        msg.setFrom(new InternetAddress("sender@example.com"));
+        msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+        msg.setSubject("received comment test", "UTF-8");
+        msg.setText("body", "UTF-8");
+        // The "(Mon gateway)" comment in the routing portion should not confuse the parser;
+        // the date after ";" is the authoritative date
+        msg.addHeader("Received", "from foo (Mon gateway) by bar; Mon, 11 Nov 2013 05:00:00 +0000");
+        msg.saveChanges();
+
+        try (final InputStream in = toStream(msg)) {
+            final ExtractData data = emlExtractor.getText(in, null);
+            final String[] receivedDate = data.getValues("Received-Date");
+            assertNotNull(receivedDate);
+            assertEquals("2013-11-11T05:00:00.000Z", receivedDate[0]);
+        }
+    }
+
+    @Test
+    public void test_manyReceivedHeaders_bounded() throws Exception {
+        final MimeMessage msg = new MimeMessage(newSession());
+        msg.setFrom(new InternetAddress("sender@example.com"));
+        msg.setRecipients(Message.RecipientType.TO, new InternetAddress[] { new InternetAddress("r@example.com") });
+        msg.setSubject("many received headers", "UTF-8");
+        msg.setText("body", "UTF-8");
+
+        // Add 500 garbage Received headers first
+        for (int i = 0; i < 500; i++) {
+            msg.addHeader("Received", "garbage entry number " + i);
+        }
+        // Then add one valid Received header — but since we cap at 100, this valid one
+        // at index 500 will NOT be seen. We verify that extraction at least completes
+        // without error and does not blow up on unbounded iteration.
+        // (The valid header is beyond the 100-entry cap, so receivedDate may be null.)
+        msg.addHeader("Received", "from x by y; Mon, 11 Nov 2013 05:00:00 +0000");
+        msg.saveChanges();
+
+        try (final InputStream in = toStream(msg)) {
+            final ExtractData data = emlExtractor.getText(in, null);
+            // Just verify it completes without exception and content is non-null
+            assertNotNull(data.getContent());
+        }
+    }
+
+    @Test
+    public void test_getDecodeText_returnsRawOnUnsupportedEncoding() {
+        // An encoded-word with an unknown charset should return the raw input, not empty string.
+        // Use a charset that is genuinely unsupported in the JVM.
+        // Note: if the JVM happens to support the charset, this test may fall back gracefully.
+        // We use a clearly bogus encoding name to guarantee UnsupportedEncodingException.
+        final String raw = "=?bogus-cs-9?B?dGVzdA==?=";
+        // MimeUtility.decodeText will throw UnsupportedEncodingException for unknown charset;
+        // getDecodeText must return the raw value unchanged in that case.
+        final String result = emlExtractor.getDecodeText(raw);
+        // Either successfully decoded (if the JVM finds the charset) or the raw value is returned.
+        // The contract is: never return an empty string when the input is non-empty.
+        assertNotNull(result);
+        assertTrue(raw.equals(result) || "test".equals(result));
+        // "dGVzdA==" is Base64 for "test", so those are the only two acceptable outcomes;
+        // anything else (including an empty string) means the fallback contract is broken.
+    }
 }