From c4c31eaa922c7154adfc0fc63df13ec256c6ba43 Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 22 Feb 2026 13:51:28 +0100 Subject: [PATCH 1/2] [BUG] Header is ignoring ISO-8859-1 and using UTF-8 --- lightbug_http/header.mojo | 75 +++++++++++++++++++++- lightbug_http/http/parsing.mojo | 7 +- lightbug_http/http/request.mojo | 5 +- lightbug_http/http/response.mojo | 3 +- tests/lightbug_http/http/test_http.mojo | 60 ++++++++++++++++- tests/lightbug_http/http/test_parsing.mojo | 29 ++++++++- tests/lightbug_http/test_header.mojo | 74 ++++++++++++++++++++- 7 files changed, 241 insertions(+), 12 deletions(-) diff --git a/lightbug_http/header.mojo b/lightbug_http/header.mojo index 1c8baa21..098e13c5 100644 --- a/lightbug_http/header.mojo +++ b/lightbug_http/header.mojo @@ -4,7 +4,7 @@ from lightbug_http.http.parsing import ( http_parse_request_headers, http_parse_response_headers, ) -from lightbug_http.io.bytes import ByteReader, Bytes, byte, is_newline, is_space +from lightbug_http.io.bytes import ByteReader, Bytes, ByteWriter, byte, is_newline, is_space from lightbug_http.strings import CR, LF, BytesConstant, lineBreak from memory import Span from utils import Variant @@ -329,6 +329,74 @@ fn write_header[T: Writer](mut writer: T, key: String, value: String): writer.write(key, ": ", value, lineBreak) +fn encode_latin1_header_value(value: String) -> List[UInt8]: + """Transcode a header value from UTF-8 to ISO-8859-1 bytes. + + HTTP/1.1 header field values must be representable in ISO-8859-1 (RFC 7230 §3.2). + - Codepoints U+0000–U+007F: single byte, passed through unchanged. + - Codepoints U+0080–U+00FF: encoded as their single ISO-8859-1 byte. + - Codepoints above U+00FF: cannot be represented in ISO-8859-1; the raw UTF-8 + bytes are written as-is (best-effort fallback — use RFC 5987 encoding instead). + - Invalid UTF-8 byte sequences (obs-text from parsing): passed through as-is. + """ + var utf8 = value.as_bytes() + var out = List[UInt8](capacity=len(utf8)) + var i = 0 + while i < len(utf8): + var b = utf8[i] + if b < 0x80: + out.append(b) + i += 1 + else: + var seq_len = 0 + var codepoint = 0 + if b >= 0xC2 and b <= 0xDF and i + 1 < len(utf8): + var b2 = utf8[i + 1] + if b2 >= 0x80 and b2 <= 0xBF: + seq_len = 2 + codepoint = ((Int(b) & 0x1F) << 6) | (Int(b2) & 0x3F) + elif b >= 0xE0 and b <= 0xEF and i + 2 < len(utf8): + var b2 = utf8[i + 1] + var b3 = utf8[i + 2] + if b2 >= 0x80 and b2 <= 0xBF and b3 >= 0x80 and b3 <= 0xBF: + seq_len = 3 + codepoint = ((Int(b) & 0x0F) << 12) | ((Int(b2) & 0x3F) << 6) | (Int(b3) & 0x3F) + elif b >= 0xF0 and b <= 0xF7 and i + 3 < len(utf8): + var b2 = utf8[i + 1] + var b3 = utf8[i + 2] + var b4 = utf8[i + 3] + if b2 >= 0x80 and b2 <= 0xBF and b3 >= 0x80 and b3 <= 0xBF and b4 >= 0x80 and b4 <= 0xBF: + seq_len = 4 + codepoint = ((Int(b) & 0x07) << 18) | ((Int(b2) & 0x3F) << 12) | ((Int(b3) & 0x3F) << 6) | (Int(b4) & 0x3F) + + if seq_len > 0 and codepoint <= 0xFF: + # Valid UTF-8, codepoint fits in ISO-8859-1: encode as single byte. + out.append(UInt8(codepoint)) + i += seq_len + elif seq_len > 0: + # Valid UTF-8 but codepoint > U+00FF: pass raw UTF-8 bytes through. + for j in range(seq_len): + out.append(utf8[i + j]) + i += seq_len + else: + # Not valid UTF-8 (obs-text or stray continuation byte): pass through. + out.append(b) + i += 1 + return out^ + + +fn write_header_latin1(mut writer: ByteWriter, key: String, value: String): + """Write a header with the value transcoded to ISO-8859-1. + + Equivalent to write_header but converts multi-byte UTF-8 sequences for + codepoints U+0080–U+00FF to their single ISO-8859-1 byte before writing. + See encode_latin1_header_value for full encoding rules. + """ + writer.write(key, ": ") + writer.consuming_write(encode_latin1_header_value(value)) + writer.write(lineBreak) + + @fieldwise_init struct Headers(Copyable, Stringable, Writable): """Collection of HTTP headers. @@ -383,6 +451,11 @@ struct Headers(Copyable, Stringable, Writable): for header in self._inner.items(): write_header(writer, header.key, header.value) + fn write_latin1_to(self, mut writer: ByteWriter): + """Write headers with values transcoded to ISO-8859-1 for HTTP wire format.""" + for header in self._inner.items(): + write_header_latin1(writer, header.key, header.value) + fn __str__(self) -> String: return String.write(self) diff --git a/lightbug_http/http/parsing.mojo b/lightbug_http/http/parsing.mojo index de746e0f..d3700de5 100644 --- a/lightbug_http/http/parsing.mojo +++ b/lightbug_http/http/parsing.mojo @@ -117,9 +117,10 @@ fn get_token_to_eol[ raise IncompleteError() var c = byte.value() - if not is_printable_ascii(c): - if (c < 0x20 and c != 0x09) or c == 0x7F: - break + # RFC 7230 §3.2.6: reject control characters (< 0x20 except HTAB, and DEL). + # Accept SP (0x20), visible ASCII (0x21–0x7E), and obs-text (0x80–0xFF). + if (c < 0x20 and c != 0x09) or c == 0x7F: + break buf.increment() if not buf.available(): diff --git a/lightbug_http/http/request.mojo b/lightbug_http/http/request.mojo index 4b63e73d..c43f7ee2 100644 --- a/lightbug_http/http/request.mojo +++ b/lightbug_http/http/request.mojo @@ -230,10 +230,9 @@ struct HTTPRequest(Copyable, Encodable, Stringable, Writable): whitespace, self.protocol, lineBreak, - self.headers, - self.cookies, - lineBreak, ) + self.headers.write_latin1_to(writer) + writer.write(self.cookies, lineBreak) writer.consuming_write(self.body_raw^) return writer^.consume() diff --git a/lightbug_http/http/response.mojo b/lightbug_http/http/response.mojo index c0c11ed7..bba97b38 100644 --- a/lightbug_http/http/response.mojo +++ b/lightbug_http/http/response.mojo @@ -418,7 +418,8 @@ struct HTTPResponse(Encodable, Movable, Sized, Stringable, Writable): ) if HeaderKey.DATE not in self.headers: write_header(writer, HeaderKey.DATE, http_date_now()) - writer.write(self.headers, self.cookies, lineBreak) + self.headers.write_latin1_to(writer) + writer.write(self.cookies, lineBreak) writer.consuming_write(self.body_raw^) return writer^.consume() diff --git a/tests/lightbug_http/http/test_http.mojo b/tests/lightbug_http/http/test_http.mojo index bbbfa8ce..722dbfa2 100644 --- a/tests/lightbug_http/http/test_http.mojo +++ b/tests/lightbug_http/http/test_http.mojo @@ -4,7 +4,7 @@ import testing from lightbug_http.header import Header, HeaderKey, Headers from lightbug_http.io.bytes import Bytes from lightbug_http.uri import URI -from testing import assert_equal, assert_true +from testing import assert_equal, assert_false, assert_true from lightbug_http.cookie import Cookie, Duration, RequestCookieJar, ResponseCookieJar, ResponseCookieKey from lightbug_http.http import HTTPRequest, HTTPResponse, encode @@ -123,5 +123,63 @@ def test_decoding_http_response(): # testing.assert_equal(v2._v, 2) +def test_header_iso8859_encoding_regression(): + """Regression: header values must be ISO-8859-1 encoded on the wire, not raw UTF-8. + + Before the fix, a header value containing 'é' (U+00E9), which Mojo stores + internally as the UTF-8 byte sequence [0xC3, 0xA9], would be written to the + wire as those two bytes verbatim. Per RFC 7230, header field values must use + ISO-8859-1, so 'é' must appear on the wire as the single byte 0xE9. + """ + var res = HTTPResponse(Bytes()) + res.headers[HeaderKey.DATE] = "Thu, 01 Jan 2026 00:00:00 GMT" + # "café" contains 'é' = U+00E9, stored in Mojo as UTF-8 [0xC3, 0xA9]. + # On the HTTP wire it must be ISO-8859-1 [0xE9] — one byte, not two. + res.headers["x-test"] = "café" + + var wire = encode(res^) + + # All other headers and the body are ASCII, so the only non-ASCII byte in + # the wire output must come from 'é' in the value of x-test. + var latin1_byte_found = False # 0xE9: correct ISO-8859-1 single byte + var utf8_lead_found = False # 0xC3: buggy raw UTF-8 lead byte + for i in range(len(wire)): + if wire[i] == UInt8(0xE9): + latin1_byte_found = True + if wire[i] == UInt8(0xC3): + utf8_lead_found = True + + assert_true(latin1_byte_found) # 'é' must be encoded as single byte 0xE9 + assert_false(utf8_lead_found) # UTF-8 lead byte 0xC3 must NOT appear + + +def test_request_header_iso8859_encoding_regression(): + """Regression: request header values must be ISO-8859-1 encoded on the wire, not raw UTF-8. + + Mirrors test_header_iso8859_encoding_regression but for HTTPRequest.encode(), + verifying the same fix applies to the outgoing request path. + """ + var uri: URI + try: + uri = URI.parse(default_server_conn_string + "/") + except e: + raise Error("Failed to parse URI: ", e) + + var req = HTTPRequest(uri=uri^, headers=Headers(Header("x-test", "café"))) + + var wire = encode(req^) + + var latin1_byte_found = False # 0xE9: correct ISO-8859-1 single byte + var utf8_lead_found = False # 0xC3: buggy raw UTF-8 lead byte + for i in range(len(wire)): + if wire[i] == UInt8(0xE9): + latin1_byte_found = True + if wire[i] == UInt8(0xC3): + utf8_lead_found = True + + assert_true(latin1_byte_found) # 'é' must be encoded as single byte 0xE9 + assert_false(utf8_lead_found) # UTF-8 lead byte 0xC3 must NOT appear + + def main(): testing.TestSuite.discover_tests[__functions_in_module()]().run() diff --git a/tests/lightbug_http/http/test_parsing.mojo b/tests/lightbug_http/http/test_parsing.mojo index 65839151..7bd60edf 100644 --- a/tests/lightbug_http/http/test_parsing.mojo +++ b/tests/lightbug_http/http/test_parsing.mojo @@ -296,7 +296,7 @@ fn test_request_invalid_header_name_char() raises: fn test_request_extended_chars() raises: var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader()) - # Accept MSB chars + # obs-text (0x80-0xFF) is explicitly permitted in header values per RFC 7230 §3.2.6 result = parse_request_test( "GET /\xa0 HTTP/1.0\r\nh: c\xa2y\r\n\r\n", 0, headers ) @@ -309,6 +309,33 @@ fn test_request_extended_chars() raises: assert_equal(headers[0].value, "c\xa2y") +fn test_request_tab_in_header_value() raises: + var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader()) + # HTAB (0x09) is explicitly permitted inside header field values per RFC 7230 §3.2.6 + result = parse_request_test( + "GET / HTTP/1.0\r\nfoo: bar\tbaz\r\n\r\n", 0, headers + ) + assert_true(result.ret > 0) + assert_equal(result.num_headers, 1) + assert_equal(headers[0].name, "foo") + assert_equal(headers[0].value, "bar\tbaz") + + +fn test_request_control_char_in_header_value() raises: + var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader()) + # Control characters (< 0x20 except HTAB) in a header value must cause a parse error + result = parse_request_test( + "GET / HTTP/1.0\r\nfoo: bar\x01baz\r\n\r\n", 0, headers + ) + assert_equal(result.ret, -1) + + # DEL (0x7F) is also rejected + result = parse_request_test( + "GET / HTTP/1.0\r\nfoo: bar\x7fbaz\r\n\r\n", 0, headers + ) + assert_equal(result.ret, -1) + + fn test_request_allowed_special_header_name_chars() raises: var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader()) # Accept |~ (though forbidden by SSE) diff --git a/tests/lightbug_http/test_header.mojo b/tests/lightbug_http/test_header.mojo index 9b298d55..a9557a06 100644 --- a/tests/lightbug_http/test_header.mojo +++ b/tests/lightbug_http/test_header.mojo @@ -1,5 +1,5 @@ -from lightbug_http.header import Header, Headers -from lightbug_http.io.bytes import ByteReader, Bytes +from lightbug_http.header import Header, Headers, encode_latin1_header_value, write_header_latin1 +from lightbug_http.io.bytes import ByteReader, Bytes, ByteWriter from testing import TestSuite, assert_equal, assert_true @@ -43,5 +43,75 @@ def test_header_case_insensitive(): # assert_equal(header["Trailer"], "end-of-message") +def test_encode_latin1_ascii(): + """ASCII values are passed through unchanged.""" + var result = encode_latin1_header_value("hello, world") + assert_equal(len(result), 12) + assert_equal(result[0], UInt8(0x68)) # 'h' + assert_equal(result[5], UInt8(0x2C)) # ',' + + +def test_encode_latin1_supplement(): + """UTF-8 codepoints U+0080–U+00FF are transcoded to single ISO-8859-1 bytes.""" + # "é" = U+00E9, UTF-8: 0xC3 0xA9 → ISO-8859-1: 0xE9 + var result = encode_latin1_header_value("é") + assert_equal(len(result), 1) + assert_equal(result[0], UInt8(0xE9)) + + # "ä" = U+00E4, UTF-8: 0xC3 0xA4 → ISO-8859-1: 0xE4 + result = encode_latin1_header_value("ä") + assert_equal(len(result), 1) + assert_equal(result[0], UInt8(0xE4)) + + # "café": ASCII 'c','a','f' + U+00E9 → 4 bytes in ISO-8859-1 + result = encode_latin1_header_value("café") + assert_equal(len(result), 4) + assert_equal(result[3], UInt8(0xE9)) + + +def test_encode_latin1_obs_text(): + """Raw obs-text bytes (0x80–0xFF, not part of a valid UTF-8 sequence) pass through as-is.""" + # 0xA2 alone is not a valid UTF-8 lead byte → treated as obs-text + var result = encode_latin1_header_value("c\xa2y") + assert_equal(len(result), 3) + assert_equal(result[0], UInt8(0x63)) # 'c' + assert_equal(result[1], UInt8(0xA2)) # obs-text byte preserved + assert_equal(result[2], UInt8(0x79)) # 'y' + + +def test_encode_latin1_above_latin1(): + """Codepoints above U+00FF fall back to raw UTF-8 bytes (best-effort).""" + # "€" = U+20AC, UTF-8: 0xE2 0x82 0xAC — codepoint > 0xFF → raw passthrough + var result = encode_latin1_header_value("€") + assert_equal(len(result), 3) + assert_equal(result[0], UInt8(0xE2)) + assert_equal(result[1], UInt8(0x82)) + assert_equal(result[2], UInt8(0xAC)) + + +def test_write_header_latin1_encodes_value(): + """Values with Latin-1 supplement characters are encoded as single bytes on the wire.""" + # "é" (U+00E9, UTF-8: 0xC3 0xA9) must appear as single byte 0xE9 + var writer = ByteWriter() + write_header_latin1(writer, "x-test", "café") + var bytes = writer^.consume() + # "x-test: caf" = 11 bytes, then 0xE9 = 1 byte, then "\r\n" = 2 bytes → 14 total + assert_equal(len(bytes), 14) + assert_equal(bytes[11], UInt8(0xE9)) # single Latin-1 byte for 'é' + assert_equal(bytes[12], UInt8(0x0D)) # \r + assert_equal(bytes[13], UInt8(0x0A)) # \n + + +def test_headers_write_latin1_to(): + """Headers.write_latin1_to transcodes values for HTTP wire format.""" + var headers = Headers(Header("x-lang", "café")) + var writer = ByteWriter() + headers.write_latin1_to(writer) + var bytes = writer^.consume() + # "x-lang: caf" = 11 bytes, then 0xE9 = 1 byte, then "\r\n" = 2 bytes → 14 total + assert_equal(len(bytes), 14) + assert_equal(bytes[11], UInt8(0xE9)) # single Latin-1 byte for 'é' + + def main(): TestSuite.discover_tests[__functions_in_module()]().run() From 270a49ccaa4d00ea829d3801907e7cdf8c51ce46 Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 22 Feb 2026 13:59:17 +0100 Subject: [PATCH 2/2] clean up comments --- lightbug_http/header.mojo | 10 +--------- tests/lightbug_http/http/test_http.mojo | 10 ++++------ tests/lightbug_http/test_header.mojo | 13 ++++++------- 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/lightbug_http/header.mojo b/lightbug_http/header.mojo index 098e13c5..6173681e 100644 --- a/lightbug_http/header.mojo +++ b/lightbug_http/header.mojo @@ -370,28 +370,20 @@ fn encode_latin1_header_value(value: String) -> List[UInt8]: codepoint = ((Int(b) & 0x07) << 18) | ((Int(b2) & 0x3F) << 12) | ((Int(b3) & 0x3F) << 6) | (Int(b4) & 0x3F) if seq_len > 0 and codepoint <= 0xFF: - # Valid UTF-8, codepoint fits in ISO-8859-1: encode as single byte. out.append(UInt8(codepoint)) i += seq_len elif seq_len > 0: - # Valid UTF-8 but codepoint > U+00FF: pass raw UTF-8 bytes through. for j in range(seq_len): out.append(utf8[i + j]) i += seq_len else: - # Not valid UTF-8 (obs-text or stray continuation byte): pass through. out.append(b) i += 1 return out^ fn write_header_latin1(mut writer: ByteWriter, key: String, value: String): - """Write a header with the value transcoded to ISO-8859-1. - - Equivalent to write_header but converts multi-byte UTF-8 sequences for - codepoints U+0080–U+00FF to their single ISO-8859-1 byte before writing. - See encode_latin1_header_value for full encoding rules. - """ + """Write a header with the value transcoded to ISO-8859-1.""" writer.write(key, ": ") writer.consuming_write(encode_latin1_header_value(value)) writer.write(lineBreak) diff --git a/tests/lightbug_http/http/test_http.mojo b/tests/lightbug_http/http/test_http.mojo index 722dbfa2..dc75e767 100644 --- a/tests/lightbug_http/http/test_http.mojo +++ b/tests/lightbug_http/http/test_http.mojo @@ -133,8 +133,6 @@ def test_header_iso8859_encoding_regression(): """ var res = HTTPResponse(Bytes()) res.headers[HeaderKey.DATE] = "Thu, 01 Jan 2026 00:00:00 GMT" - # "café" contains 'é' = U+00E9, stored in Mojo as UTF-8 [0xC3, 0xA9]. - # On the HTTP wire it must be ISO-8859-1 [0xE9] — one byte, not two. res.headers["x-test"] = "café" var wire = encode(res^) @@ -149,8 +147,8 @@ def test_header_iso8859_encoding_regression(): if wire[i] == UInt8(0xC3): utf8_lead_found = True - assert_true(latin1_byte_found) # 'é' must be encoded as single byte 0xE9 - assert_false(utf8_lead_found) # UTF-8 lead byte 0xC3 must NOT appear + assert_true(latin1_byte_found) + assert_false(utf8_lead_found) def test_request_header_iso8859_encoding_regression(): @@ -177,8 +175,8 @@ def test_request_header_iso8859_encoding_regression(): if wire[i] == UInt8(0xC3): utf8_lead_found = True - assert_true(latin1_byte_found) # 'é' must be encoded as single byte 0xE9 - assert_false(utf8_lead_found) # UTF-8 lead byte 0xC3 must NOT appear + assert_true(latin1_byte_found) + assert_false(utf8_lead_found) def main(): diff --git a/tests/lightbug_http/test_header.mojo b/tests/lightbug_http/test_header.mojo index a9557a06..cb403bb1 100644 --- a/tests/lightbug_http/test_header.mojo +++ b/tests/lightbug_http/test_header.mojo @@ -47,8 +47,8 @@ def test_encode_latin1_ascii(): """ASCII values are passed through unchanged.""" var result = encode_latin1_header_value("hello, world") assert_equal(len(result), 12) - assert_equal(result[0], UInt8(0x68)) # 'h' - assert_equal(result[5], UInt8(0x2C)) # ',' + assert_equal(result[0], UInt8(0x68)) + assert_equal(result[5], UInt8(0x2C)) def test_encode_latin1_supplement(): @@ -74,9 +74,9 @@ def test_encode_latin1_obs_text(): # 0xA2 alone is not a valid UTF-8 lead byte → treated as obs-text var result = encode_latin1_header_value("c\xa2y") assert_equal(len(result), 3) - assert_equal(result[0], UInt8(0x63)) # 'c' + assert_equal(result[0], UInt8(0x63)) assert_equal(result[1], UInt8(0xA2)) # obs-text byte preserved - assert_equal(result[2], UInt8(0x79)) # 'y' + assert_equal(result[2], UInt8(0x79)) def test_encode_latin1_above_latin1(): @@ -91,15 +91,14 @@ def test_encode_latin1_above_latin1(): def test_write_header_latin1_encodes_value(): """Values with Latin-1 supplement characters are encoded as single bytes on the wire.""" - # "é" (U+00E9, UTF-8: 0xC3 0xA9) must appear as single byte 0xE9 var writer = ByteWriter() write_header_latin1(writer, "x-test", "café") var bytes = writer^.consume() # "x-test: caf" = 11 bytes, then 0xE9 = 1 byte, then "\r\n" = 2 bytes → 14 total assert_equal(len(bytes), 14) assert_equal(bytes[11], UInt8(0xE9)) # single Latin-1 byte for 'é' - assert_equal(bytes[12], UInt8(0x0D)) # \r - assert_equal(bytes[13], UInt8(0x0A)) # \n + assert_equal(bytes[12], UInt8(0x0D)) + assert_equal(bytes[13], UInt8(0x0A)) def test_headers_write_latin1_to():