diff --git a/lightbug_http/header.mojo b/lightbug_http/header.mojo
index 1c8baa21..6173681e 100644
--- a/lightbug_http/header.mojo
+++ b/lightbug_http/header.mojo
@@ -4,7 +4,7 @@ from lightbug_http.http.parsing import (
     http_parse_request_headers,
     http_parse_response_headers,
 )
-from lightbug_http.io.bytes import ByteReader, Bytes, byte, is_newline, is_space
+from lightbug_http.io.bytes import ByteReader, Bytes, ByteWriter, byte, is_newline, is_space
 from lightbug_http.strings import CR, LF, BytesConstant, lineBreak
 from memory import Span
 from utils import Variant
@@ -329,6 +329,66 @@ fn write_header[T: Writer](mut writer: T, key: String, value: String):
     writer.write(key, ": ", value, lineBreak)
 
 
+fn encode_latin1_header_value(value: String) -> List[UInt8]:
+    """Transcode a header value from UTF-8 to ISO-8859-1 bytes (RFC 7230 §3.2).
+
+    - U+0000–U+007F: single byte, passed through unchanged.
+    - U+0080–U+00FF (canonical 2-byte UTF-8 only): emitted as one ISO-8859-1 byte.
+    - Above U+00FF: raw UTF-8 bytes written as-is (best-effort; prefer RFC 5987).
+    - Invalid or overlong UTF-8 (obs-text from parsing): passed through as-is, so an
+      overlong form such as E0 80 8D can never be collapsed into CR, LF, or NUL.
+    Returns the encoded bytes; `value` itself is not modified.
+    """
+    var utf8 = value.as_bytes()
+    var out = List[UInt8](capacity=len(utf8))
+    var i = 0
+    while i < len(utf8):
+        var b = utf8[i]
+        if b < 0x80:
+            out.append(b)
+            i += 1
+        else:
+            var seq_len = 0
+            var codepoint = 0
+            if b >= 0xC2 and b <= 0xDF and i + 1 < len(utf8):
+                var b2 = utf8[i + 1]
+                if b2 >= 0x80 and b2 <= 0xBF:
+                    seq_len = 2
+                    codepoint = ((Int(b) & 0x1F) << 6) | (Int(b2) & 0x3F)
+            elif b >= 0xE0 and b <= 0xEF and i + 2 < len(utf8):
+                var b2 = utf8[i + 1]
+                var b3 = utf8[i + 2]
+                if b2 >= 0x80 and b2 <= 0xBF and b3 >= 0x80 and b3 <= 0xBF:
+                    seq_len = 3
+                    codepoint = ((Int(b) & 0x0F) << 12) | ((Int(b2) & 0x3F) << 6) | (Int(b3) & 0x3F)
+            elif b >= 0xF0 and b <= 0xF7 and i + 3 < len(utf8):
+                var b2 = utf8[i + 1]
+                var b3 = utf8[i + 2]
+                var b4 = utf8[i + 3]
+                if b2 >= 0x80 and b2 <= 0xBF and b3 >= 0x80 and b3 <= 0xBF and b4 >= 0x80 and b4 <= 0xBF:
+                    seq_len = 4
+                    codepoint = ((Int(b) & 0x07) << 18) | ((Int(b2) & 0x3F) << 12) | ((Int(b3) & 0x3F) << 6) | (Int(b4) & 0x3F)
+            # Only the 2-byte form (lead C2-DF, never overlong) may shrink to one byte; 3/4-byte forms stay raw.
+            if seq_len == 2 and codepoint <= 0xFF:
+                out.append(UInt8(codepoint))
+                i += seq_len
+            elif seq_len > 0:
+                for j in range(seq_len):
+                    out.append(utf8[i + j])
+                i += seq_len
+            else:
+                out.append(b)
+                i += 1
+    return out^
+
+
+fn write_header_latin1(mut writer: ByteWriter, key: String, value: String):
+    """Write a header with the value transcoded to ISO-8859-1."""
+    writer.write(key, ": ")
+    writer.consuming_write(encode_latin1_header_value(value))
+    writer.write(lineBreak)
+
+
 @fieldwise_init
 struct Headers(Copyable, Stringable, Writable):
     """Collection of HTTP headers.
@@ -383,6 +443,11 @@ struct Headers(Copyable, Stringable, Writable): for header in self._inner.items(): write_header(writer, header.key, header.value) + fn write_latin1_to(self, mut writer: ByteWriter): + """Write headers with values transcoded to ISO-8859-1 for HTTP wire format.""" + for header in self._inner.items(): + write_header_latin1(writer, header.key, header.value) + fn __str__(self) -> String: return String.write(self) diff --git a/lightbug_http/http/parsing.mojo b/lightbug_http/http/parsing.mojo index de746e0f..d3700de5 100644 --- a/lightbug_http/http/parsing.mojo +++ b/lightbug_http/http/parsing.mojo @@ -117,9 +117,10 @@ fn get_token_to_eol[ raise IncompleteError() var c = byte.value() - if not is_printable_ascii(c): - if (c < 0x20 and c != 0x09) or c == 0x7F: - break + # RFC 7230 §3.2.6: reject control characters (< 0x20 except HTAB, and DEL). + # Accept SP (0x20), visible ASCII (0x21–0x7E), and obs-text (0x80–0xFF). + if (c < 0x20 and c != 0x09) or c == 0x7F: + break buf.increment() if not buf.available(): diff --git a/lightbug_http/http/request.mojo b/lightbug_http/http/request.mojo index 4b63e73d..c43f7ee2 100644 --- a/lightbug_http/http/request.mojo +++ b/lightbug_http/http/request.mojo @@ -230,10 +230,9 @@ struct HTTPRequest(Copyable, Encodable, Stringable, Writable): whitespace, self.protocol, lineBreak, - self.headers, - self.cookies, - lineBreak, ) + self.headers.write_latin1_to(writer) + writer.write(self.cookies, lineBreak) writer.consuming_write(self.body_raw^) return writer^.consume() diff --git a/lightbug_http/http/response.mojo b/lightbug_http/http/response.mojo index c0c11ed7..bba97b38 100644 --- a/lightbug_http/http/response.mojo +++ b/lightbug_http/http/response.mojo @@ -418,7 +418,8 @@ struct HTTPResponse(Encodable, Movable, Sized, Stringable, Writable): ) if HeaderKey.DATE not in self.headers: write_header(writer, HeaderKey.DATE, http_date_now()) - writer.write(self.headers, self.cookies, lineBreak) + 
self.headers.write_latin1_to(writer) + writer.write(self.cookies, lineBreak) writer.consuming_write(self.body_raw^) return writer^.consume() diff --git a/tests/lightbug_http/http/test_http.mojo b/tests/lightbug_http/http/test_http.mojo index bbbfa8ce..dc75e767 100644 --- a/tests/lightbug_http/http/test_http.mojo +++ b/tests/lightbug_http/http/test_http.mojo @@ -4,7 +4,7 @@ import testing from lightbug_http.header import Header, HeaderKey, Headers from lightbug_http.io.bytes import Bytes from lightbug_http.uri import URI -from testing import assert_equal, assert_true +from testing import assert_equal, assert_false, assert_true from lightbug_http.cookie import Cookie, Duration, RequestCookieJar, ResponseCookieJar, ResponseCookieKey from lightbug_http.http import HTTPRequest, HTTPResponse, encode @@ -123,5 +123,61 @@ def test_decoding_http_response(): # testing.assert_equal(v2._v, 2) +def test_header_iso8859_encoding_regression(): + """Regression: header values must be ISO-8859-1 encoded on the wire, not raw UTF-8. + + Before the fix, a header value containing 'é' (U+00E9), which Mojo stores + internally as the UTF-8 byte sequence [0xC3, 0xA9], would be written to the + wire as those two bytes verbatim. Per RFC 7230, header field values must use + ISO-8859-1, so 'é' must appear on the wire as the single byte 0xE9. + """ + var res = HTTPResponse(Bytes()) + res.headers[HeaderKey.DATE] = "Thu, 01 Jan 2026 00:00:00 GMT" + res.headers["x-test"] = "café" + + var wire = encode(res^) + + # All other headers and the body are ASCII, so the only non-ASCII byte in + # the wire output must come from 'é' in the value of x-test. 
+ var latin1_byte_found = False # 0xE9: correct ISO-8859-1 single byte + var utf8_lead_found = False # 0xC3: buggy raw UTF-8 lead byte + for i in range(len(wire)): + if wire[i] == UInt8(0xE9): + latin1_byte_found = True + if wire[i] == UInt8(0xC3): + utf8_lead_found = True + + assert_true(latin1_byte_found) + assert_false(utf8_lead_found) + + +def test_request_header_iso8859_encoding_regression(): + """Regression: request header values must be ISO-8859-1 encoded on the wire, not raw UTF-8. + + Mirrors test_header_iso8859_encoding_regression but for HTTPRequest.encode(), + verifying the same fix applies to the outgoing request path. + """ + var uri: URI + try: + uri = URI.parse(default_server_conn_string + "/") + except e: + raise Error("Failed to parse URI: ", e) + + var req = HTTPRequest(uri=uri^, headers=Headers(Header("x-test", "café"))) + + var wire = encode(req^) + + var latin1_byte_found = False # 0xE9: correct ISO-8859-1 single byte + var utf8_lead_found = False # 0xC3: buggy raw UTF-8 lead byte + for i in range(len(wire)): + if wire[i] == UInt8(0xE9): + latin1_byte_found = True + if wire[i] == UInt8(0xC3): + utf8_lead_found = True + + assert_true(latin1_byte_found) + assert_false(utf8_lead_found) + + def main(): testing.TestSuite.discover_tests[__functions_in_module()]().run() diff --git a/tests/lightbug_http/http/test_parsing.mojo b/tests/lightbug_http/http/test_parsing.mojo index 65839151..7bd60edf 100644 --- a/tests/lightbug_http/http/test_parsing.mojo +++ b/tests/lightbug_http/http/test_parsing.mojo @@ -296,7 +296,7 @@ fn test_request_invalid_header_name_char() raises: fn test_request_extended_chars() raises: var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader()) - # Accept MSB chars + # obs-text (0x80-0xFF) is explicitly permitted in header values per RFC 7230 §3.2.6 result = parse_request_test( "GET /\xa0 HTTP/1.0\r\nh: c\xa2y\r\n\r\n", 0, headers ) @@ -309,6 +309,33 @@ fn test_request_extended_chars() raises: assert_equal(headers[0].value, 
"c\xa2y") +fn test_request_tab_in_header_value() raises: + var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader()) + # HTAB (0x09) is explicitly permitted inside header field values per RFC 7230 §3.2.6 + result = parse_request_test( + "GET / HTTP/1.0\r\nfoo: bar\tbaz\r\n\r\n", 0, headers + ) + assert_true(result.ret > 0) + assert_equal(result.num_headers, 1) + assert_equal(headers[0].name, "foo") + assert_equal(headers[0].value, "bar\tbaz") + + +fn test_request_control_char_in_header_value() raises: + var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader()) + # Control characters (< 0x20 except HTAB) in a header value must cause a parse error + result = parse_request_test( + "GET / HTTP/1.0\r\nfoo: bar\x01baz\r\n\r\n", 0, headers + ) + assert_equal(result.ret, -1) + + # DEL (0x7F) is also rejected + result = parse_request_test( + "GET / HTTP/1.0\r\nfoo: bar\x7fbaz\r\n\r\n", 0, headers + ) + assert_equal(result.ret, -1) + + fn test_request_allowed_special_header_name_chars() raises: var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader()) # Accept |~ (though forbidden by SSE) diff --git a/tests/lightbug_http/test_header.mojo b/tests/lightbug_http/test_header.mojo index 9b298d55..cb403bb1 100644 --- a/tests/lightbug_http/test_header.mojo +++ b/tests/lightbug_http/test_header.mojo @@ -1,5 +1,5 @@ -from lightbug_http.header import Header, Headers -from lightbug_http.io.bytes import ByteReader, Bytes +from lightbug_http.header import Header, Headers, encode_latin1_header_value, write_header_latin1 +from lightbug_http.io.bytes import ByteReader, Bytes, ByteWriter from testing import TestSuite, assert_equal, assert_true @@ -43,5 +43,74 @@ def test_header_case_insensitive(): # assert_equal(header["Trailer"], "end-of-message") +def test_encode_latin1_ascii(): + """ASCII values are passed through unchanged.""" + var result = encode_latin1_header_value("hello, world") + assert_equal(len(result), 12) + assert_equal(result[0], UInt8(0x68)) + assert_equal(result[5], 
UInt8(0x2C)) + + +def test_encode_latin1_supplement(): + """UTF-8 codepoints U+0080–U+00FF are transcoded to single ISO-8859-1 bytes.""" + # "é" = U+00E9, UTF-8: 0xC3 0xA9 → ISO-8859-1: 0xE9 + var result = encode_latin1_header_value("é") + assert_equal(len(result), 1) + assert_equal(result[0], UInt8(0xE9)) + + # "ä" = U+00E4, UTF-8: 0xC3 0xA4 → ISO-8859-1: 0xE4 + result = encode_latin1_header_value("ä") + assert_equal(len(result), 1) + assert_equal(result[0], UInt8(0xE4)) + + # "café": ASCII 'c','a','f' + U+00E9 → 4 bytes in ISO-8859-1 + result = encode_latin1_header_value("café") + assert_equal(len(result), 4) + assert_equal(result[3], UInt8(0xE9)) + + +def test_encode_latin1_obs_text(): + """Raw obs-text bytes (0x80–0xFF, not part of a valid UTF-8 sequence) pass through as-is.""" + # 0xA2 alone is not a valid UTF-8 lead byte → treated as obs-text + var result = encode_latin1_header_value("c\xa2y") + assert_equal(len(result), 3) + assert_equal(result[0], UInt8(0x63)) + assert_equal(result[1], UInt8(0xA2)) # obs-text byte preserved + assert_equal(result[2], UInt8(0x79)) + + +def test_encode_latin1_above_latin1(): + """Codepoints above U+00FF fall back to raw UTF-8 bytes (best-effort).""" + # "€" = U+20AC, UTF-8: 0xE2 0x82 0xAC — codepoint > 0xFF → raw passthrough + var result = encode_latin1_header_value("€") + assert_equal(len(result), 3) + assert_equal(result[0], UInt8(0xE2)) + assert_equal(result[1], UInt8(0x82)) + assert_equal(result[2], UInt8(0xAC)) + + +def test_write_header_latin1_encodes_value(): + """Values with Latin-1 supplement characters are encoded as single bytes on the wire.""" + var writer = ByteWriter() + write_header_latin1(writer, "x-test", "café") + var bytes = writer^.consume() + # "x-test: caf" = 11 bytes, then 0xE9 = 1 byte, then "\r\n" = 2 bytes → 14 total + assert_equal(len(bytes), 14) + assert_equal(bytes[11], UInt8(0xE9)) # single Latin-1 byte for 'é' + assert_equal(bytes[12], UInt8(0x0D)) + assert_equal(bytes[13], UInt8(0x0A)) + + +def 
test_headers_write_latin1_to(): + """Headers.write_latin1_to transcodes values for HTTP wire format.""" + var headers = Headers(Header("x-lang", "café")) + var writer = ByteWriter() + headers.write_latin1_to(writer) + var bytes = writer^.consume() + # "x-lang: caf" = 11 bytes, then 0xE9 = 1 byte, then "\r\n" = 2 bytes → 14 total + assert_equal(len(bytes), 14) + assert_equal(bytes[11], UInt8(0xE9)) # single Latin-1 byte for 'é' + + def main(): TestSuite.discover_tests[__functions_in_module()]().run()