Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 66 additions & 1 deletion lightbug_http/header.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from lightbug_http.http.parsing import (
http_parse_request_headers,
http_parse_response_headers,
)
from lightbug_http.io.bytes import ByteReader, Bytes, byte, is_newline, is_space
from lightbug_http.io.bytes import ByteReader, Bytes, ByteWriter, byte, is_newline, is_space
from lightbug_http.strings import CR, LF, BytesConstant, lineBreak
from memory import Span
from utils import Variant
Expand Down Expand Up @@ -329,6 +329,66 @@ fn write_header[T: Writer](mut writer: T, key: String, value: String):
writer.write(key, ": ", value, lineBreak)


fn encode_latin1_header_value(value: String) -> List[UInt8]:
    """Transcode a header value from UTF-8 to ISO-8859-1 bytes where possible.

    HTTP/1.1 historically allowed ISO-8859-1 text in field content and treats
    any other high-bit octets (obs-text) as opaque data (RFC 7230 §3.2.4).

    The input bytes are scanned left to right:
    - Bytes below 0x80 (U+0000–U+007F) are copied unchanged.
    - A shortest-form two-byte UTF-8 sequence for U+0080–U+00FF (lead byte
      0xC2 or 0xC3 followed by one continuation byte 0x80–0xBF) is collapsed
      into the single ISO-8859-1 byte for that codepoint.
    - Every other byte is copied unchanged. This covers codepoints above
      U+00FF (their raw UTF-8 bytes reach the wire as a best-effort fallback;
      prefer an RFC 8187 / RFC 5987 extended parameter for such values) and
      byte sequences that are not valid UTF-8 (obs-text kept from parsing).

    Overlong encodings such as 0xE0 0x80 0x8A are deliberately never decoded.
    Decoding them would turn opaque high-bit bytes into a raw control byte
    (CR, LF, NUL, ...) and allow header injection when a received value is
    written back out.

    Args:
        value: The header value as stored in memory (UTF-8, possibly holding
            raw obs-text bytes).

    Returns:
        The bytes to place on the wire for this value.
    """
    var utf8 = value.as_bytes()
    var n = len(utf8)
    var out = List[UInt8](capacity=n)
    var i = 0
    while i < n:
        var b = utf8[i]
        # Only 0xC2 and 0xC3 can start the shortest-form encoding of a
        # codepoint in U+0080–U+00FF; no longer sequence is ever valid for it.
        if (b == 0xC2 or b == 0xC3) and i + 1 < n:
            var b2 = utf8[i + 1]
            if b2 >= 0x80 and b2 <= 0xBF:
                out.append(UInt8(((Int(b) & 0x1F) << 6) | (Int(b2) & 0x3F)))
                i += 2
                continue
        # ASCII, non-Latin-1 UTF-8, truncated sequences and obs-text all pass
        # through byte for byte.
        out.append(b)
        i += 1
    return out^


fn write_header_latin1(mut writer: ByteWriter, key: String, value: String):
    """Emit one `key: value` header line, with the value as ISO-8859-1 bytes.

    The key is written as-is; only the value goes through
    `encode_latin1_header_value`. The line is terminated with `lineBreak`.

    Args:
        writer: Destination byte writer.
        key: Header field name.
        value: Header field value to transcode before writing.
    """
    var wire_value = encode_latin1_header_value(value)
    writer.write(key, ": ")
    writer.consuming_write(wire_value^)
    writer.write(lineBreak)


@fieldwise_init
struct Headers(Copyable, Stringable, Writable):
"""Collection of HTTP headers.
Expand Down Expand Up @@ -383,6 +443,11 @@ struct Headers(Copyable, Stringable, Writable):
for header in self._inner.items():
write_header(writer, header.key, header.value)

fn write_latin1_to(self, mut writer: ByteWriter):
    """Serialize every stored header to `writer`, one line per header.

    Each entry is emitted through `write_header_latin1`, so values are
    transcoded to ISO-8859-1 for the HTTP wire format.

    Args:
        writer: Destination byte writer.
    """
    for entry in self._inner.items():
        write_header_latin1(writer, entry.key, entry.value)

fn __str__(self) -> String:
    """Return the text produced by writing this collection into a String."""
    var rendered = String.write(self)
    return rendered^

Expand Down
7 changes: 4 additions & 3 deletions lightbug_http/http/parsing.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@ fn get_token_to_eol[
raise IncompleteError()

var c = byte.value()
if not is_printable_ascii(c):
if (c < 0x20 and c != 0x09) or c == 0x7F:
break
# RFC 7230 §3.2.6: reject control characters (< 0x20 except HTAB, and DEL).
# Accept SP (0x20), visible ASCII (0x21–0x7E), and obs-text (0x80–0xFF).
if (c < 0x20 and c != 0x09) or c == 0x7F:
break
buf.increment()

if not buf.available():
Expand Down
5 changes: 2 additions & 3 deletions lightbug_http/http/request.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,9 @@ struct HTTPRequest(Copyable, Encodable, Stringable, Writable):
whitespace,
self.protocol,
lineBreak,
self.headers,
self.cookies,
lineBreak,
)
self.headers.write_latin1_to(writer)
writer.write(self.cookies, lineBreak)
writer.consuming_write(self.body_raw^)
return writer^.consume()

Expand Down
3 changes: 2 additions & 1 deletion lightbug_http/http/response.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,8 @@ struct HTTPResponse(Encodable, Movable, Sized, Stringable, Writable):
)
if HeaderKey.DATE not in self.headers:
write_header(writer, HeaderKey.DATE, http_date_now())
writer.write(self.headers, self.cookies, lineBreak)
self.headers.write_latin1_to(writer)
writer.write(self.cookies, lineBreak)
writer.consuming_write(self.body_raw^)
return writer^.consume()

Expand Down
58 changes: 57 additions & 1 deletion tests/lightbug_http/http/test_http.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import testing
from lightbug_http.header import Header, HeaderKey, Headers
from lightbug_http.io.bytes import Bytes
from lightbug_http.uri import URI
from testing import assert_equal, assert_true
from testing import assert_equal, assert_false, assert_true

from lightbug_http.cookie import Cookie, Duration, RequestCookieJar, ResponseCookieJar, ResponseCookieKey
from lightbug_http.http import HTTPRequest, HTTPResponse, encode
Expand Down Expand Up @@ -123,5 +123,61 @@ def test_decoding_http_response():
# testing.assert_equal(v2._v, 2)


def test_header_iso8859_encoding_regression():
    """Regression: response header values go out as ISO-8859-1, not raw UTF-8.

    'é' (U+00E9) is held in memory as the UTF-8 pair [0xC3, 0xA9]. The encoded
    response must carry it as the single ISO-8859-1 byte 0xE9 and must not
    contain the UTF-8 lead byte 0xC3.
    """
    var response = HTTPResponse(Bytes())
    response.headers[HeaderKey.DATE] = "Thu, 01 Jan 2026 00:00:00 GMT"
    response.headers["x-test"] = "café"

    var encoded = encode(response^)

    # Everything else in the message is expected to be ASCII, so any
    # high-bit byte seen below comes from the x-test value.
    var saw_latin1_byte = False  # 0xE9: the expected single-byte form
    var saw_utf8_lead = False  # 0xC3: would indicate raw UTF-8 leaked out
    var pos = 0
    while pos < len(encoded):
        var octet = encoded[pos]
        if octet == UInt8(0xE9):
            saw_latin1_byte = True
        elif octet == UInt8(0xC3):
            saw_utf8_lead = True
        pos += 1

    assert_true(saw_latin1_byte)
    assert_false(saw_utf8_lead)


def test_request_header_iso8859_encoding_regression():
    """Regression: request header values go out as ISO-8859-1, not raw UTF-8.

    Same check as the response-side regression test, applied to the bytes
    produced by encoding an HTTPRequest that carries an "x-test: café" header.
    """
    var target: URI
    try:
        target = URI.parse(default_server_conn_string + "/")
    except e:
        raise Error("Failed to parse URI: ", e)

    var request = HTTPRequest(uri=target^, headers=Headers(Header("x-test", "café")))
    var encoded = encode(request^)

    var saw_latin1_byte = False  # 0xE9: the expected single-byte form
    var saw_utf8_lead = False  # 0xC3: would indicate raw UTF-8 leaked out
    var pos = 0
    while pos < len(encoded):
        var octet = encoded[pos]
        if octet == UInt8(0xE9):
            saw_latin1_byte = True
        elif octet == UInt8(0xC3):
            saw_utf8_lead = True
        pos += 1

    assert_true(saw_latin1_byte)
    assert_false(saw_utf8_lead)


def main():
    # Entry point for this test file: build a test suite from the functions
    # in this module and run it.
    testing.TestSuite.discover_tests[__functions_in_module()]().run()
29 changes: 28 additions & 1 deletion tests/lightbug_http/http/test_parsing.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ fn test_request_invalid_header_name_char() raises:

fn test_request_extended_chars() raises:
var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader())
# Accept MSB chars
# obs-text (0x80-0xFF) is explicitly permitted in header values per RFC 7230 §3.2.6
result = parse_request_test(
"GET /\xa0 HTTP/1.0\r\nh: c\xa2y\r\n\r\n", 0, headers
)
Expand All @@ -309,6 +309,33 @@ fn test_request_extended_chars() raises:
assert_equal(headers[0].value, "c\xa2y")


fn test_request_tab_in_header_value() raises:
    """A horizontal tab (0x09) inside a header value is accepted and preserved."""
    var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader())
    var parsed = parse_request_test(
        "GET / HTTP/1.0\r\nfoo: bar\tbaz\r\n\r\n", 0, headers
    )

    # The request parses successfully and yields exactly one header.
    assert_true(parsed.ret > 0)
    assert_equal(parsed.num_headers, 1)

    # The tab survives in the parsed value.
    assert_equal(headers[0].name, "foo")
    assert_equal(headers[0].value, "bar\tbaz")


fn test_request_control_char_in_header_value() raises:
    """Control bytes in a header value make parsing fail with -1.

    Covers a C0 control byte other than HTAB (0x01) and DEL (0x7F).
    """
    var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader())

    # 0x01 in the middle of the value.
    var ctrl_outcome = parse_request_test(
        "GET / HTTP/1.0\r\nfoo: bar\x01baz\r\n\r\n", 0, headers
    )
    assert_equal(ctrl_outcome.ret, -1)

    # DEL (0x7F) in the middle of the value.
    var del_outcome = parse_request_test(
        "GET / HTTP/1.0\r\nfoo: bar\x7fbaz\r\n\r\n", 0, headers
    )
    assert_equal(del_outcome.ret, -1)


fn test_request_allowed_special_header_name_chars() raises:
var headers = InlineArray[HTTPHeader, 4](fill=HTTPHeader())
# Accept |~ (though forbidden by SSE)
Expand Down
73 changes: 71 additions & 2 deletions tests/lightbug_http/test_header.mojo
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from lightbug_http.header import Header, Headers
from lightbug_http.io.bytes import ByteReader, Bytes
from lightbug_http.header import Header, Headers, encode_latin1_header_value, write_header_latin1
from lightbug_http.io.bytes import ByteReader, Bytes, ByteWriter
from testing import TestSuite, assert_equal, assert_true


Expand Down Expand Up @@ -43,5 +43,74 @@ def test_header_case_insensitive():
# assert_equal(header["Trailer"], "end-of-message")


def test_encode_latin1_ascii():
    """A pure-ASCII value comes back byte-for-byte with the same length."""
    var encoded = encode_latin1_header_value("hello, world")
    assert_equal(len(encoded), 12)
    assert_equal(encoded[0], UInt8(0x68))  # 'h'
    assert_equal(encoded[5], UInt8(0x2C))  # ','


def test_encode_latin1_supplement():
    """Codepoints U+0080–U+00FF collapse from two UTF-8 bytes to one byte."""
    # U+00E9 'é': UTF-8 0xC3 0xA9 becomes 0xE9.
    var e_acute = encode_latin1_header_value("é")
    assert_equal(len(e_acute), 1)
    assert_equal(e_acute[0], UInt8(0xE9))

    # U+00E4 'ä': UTF-8 0xC3 0xA4 becomes 0xE4.
    var a_umlaut = encode_latin1_header_value("ä")
    assert_equal(len(a_umlaut), 1)
    assert_equal(a_umlaut[0], UInt8(0xE4))

    # Mixed value: three ASCII bytes followed by the single byte for 'é'.
    var mixed = encode_latin1_header_value("café")
    assert_equal(len(mixed), 4)
    assert_equal(mixed[3], UInt8(0xE9))


def test_encode_latin1_obs_text():
    """A stray high-bit byte that is not valid UTF-8 is kept unchanged."""
    # 0xA2 on its own cannot start a UTF-8 sequence, so it is treated as obs-text.
    var encoded = encode_latin1_header_value("c\xa2y")
    assert_equal(len(encoded), 3)
    assert_equal(encoded[0], UInt8(0x63))  # 'c'
    assert_equal(encoded[1], UInt8(0xA2))  # raw byte kept
    assert_equal(encoded[2], UInt8(0x79))  # 'y'


def test_encode_latin1_above_latin1():
    """A codepoint beyond U+00FF is emitted as its original UTF-8 bytes."""
    # U+20AC '€' is 0xE2 0x82 0xAC in UTF-8 and has no ISO-8859-1 byte.
    var encoded = encode_latin1_header_value("€")
    assert_equal(len(encoded), 3)
    assert_equal(encoded[0], UInt8(0xE2))
    assert_equal(encoded[1], UInt8(0x82))
    assert_equal(encoded[2], UInt8(0xAC))


def test_write_header_latin1_encodes_value():
    """write_header_latin1 puts a Latin-1 supplement character on the wire as one byte."""
    var sink = ByteWriter()
    write_header_latin1(sink, "x-test", "café")
    var wire = sink^.consume()

    # Layout: "x-test: caf" (11 bytes) + 0xE9 (1 byte) + CR LF (2 bytes) = 14.
    assert_equal(len(wire), 14)
    assert_equal(wire[11], UInt8(0xE9))  # 'é' as a single byte
    assert_equal(wire[12], UInt8(0x0D))  # CR
    assert_equal(wire[13], UInt8(0x0A))  # LF


def test_headers_write_latin1_to():
    """Headers.write_latin1_to emits values transcoded for the HTTP wire format."""
    var collection = Headers(Header("x-lang", "café"))
    var sink = ByteWriter()
    collection.write_latin1_to(sink)
    var wire = sink^.consume()

    # Layout: "x-lang: caf" (11 bytes) + 0xE9 (1 byte) + CR LF (2 bytes) = 14.
    assert_equal(len(wire), 14)
    assert_equal(wire[11], UInt8(0xE9))  # 'é' as a single byte


def main():
    # Entry point for this test file: build a test suite from the functions
    # in this module and run it.
    TestSuite.discover_tests[__functions_in_module()]().run()