Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions multipart/test_multipart_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,3 +283,93 @@ def test_zerodep(self, benchmark):
def test_python_multipart(self, benchmark):
    """Benchmark ``_ref_parse_multipart`` on the "large-binary" fixture."""
    payload, content_type = _get_fixture("large-binary")
    benchmark(_ref_parse_multipart, payload, content_type)


# ── Scale curve: vary number of parts / total payload size geometrically ──


def _make_scale_body(n_parts: int, part_size: int) -> tuple[bytes, str]:
    """Build a multipart body with *n_parts* binary parts of *part_size* bytes each.

    Every part carries the same payload: the byte values 0..255 repeated and
    cut to exactly *part_size* bytes. Part *i* is registered under the field
    name ``file_<i>`` with filename ``data_<i>.bin``.

    Returns:
        Whatever ``encode_multipart`` returns for those files, annotated here
        as a ``(body, content_type)`` pair.
    """
    # Slice the bytes, not the (filename, content) tuple. Slicing the tuple is
    # a no-op for part_size >= 2 and leaves every payload rounded up to the
    # next multiple of 256 bytes, which skews the scale points.
    payload = (bytes(range(256)) * (part_size // 256 + 1))[:part_size]
    files = {f"file_{i}": (f"data_{i}.bin", payload) for i in range(n_parts)}
    return encode_multipart(fields={}, files=files, boundary="scalebench")


# Scale points as (n_parts, part_size_bytes, label) triples. The label names
# the approximate total payload and is used as the parametrize ID.
# Total ≈ n_parts * part_size (plus multipart framing overhead).
_SCALE_PARAMS: list[tuple[int, int, str]] = [
    (1, 100, "100B"),
    (1, 500, "500B"),
    (2, 512, "1KB"),
    (5, 1_024, "5KB"),
    (5, 2_048, "10KB"),
    (10, 5_120, "50KB"),
    (10, 10_240, "100KB"),
    (20, 25_000, "500KB"),
]

# Pre-build all payloads at import time so construction cost is excluded from
# the parse benchmarks. Keyed by the label of each _SCALE_PARAMS entry; each
# value is the result of _make_scale_body() for that entry.
_SCALE_BODIES: dict[str, tuple[bytes, str]] = {
    label: _make_scale_body(n_parts, part_size)
    for n_parts, part_size, label in _SCALE_PARAMS
}


@pytest.mark.parametrize("label", [lbl for _, _, lbl in _SCALE_PARAMS])
class TestScaleCurve:
    """Scale curves: vary total payload size to reveal parse/encode complexity.

    The class-level parametrization runs every test once per label in
    ``_SCALE_PARAMS``. Each label names the approximate total payload size,
    so results can be plotted against input scale.
    """

    @staticmethod
    def _files_for(label):
        """Return the ``files`` mapping for the scale point named *label*.

        The mapping has ``n_parts`` entries ``file_<i> -> (data_<i>.bin,
        payload)``, where the payload is the byte values 0..255 repeated and
        cut to exactly ``part_size`` bytes.

        Raises:
            KeyError: if *label* is not listed in ``_SCALE_PARAMS``.
        """
        shapes = {lbl: (n, size) for n, size, lbl in _SCALE_PARAMS}
        n_parts, part_size = shapes[label]
        payload = (bytes(range(256)) * (part_size // 256 + 1))[:part_size]
        return {f"file_{i}": (f"data_{i}.bin", payload) for i in range(n_parts)}

    def test_parse_zerodep(self, benchmark, label):
        """zerodep parse_multipart: scale with payload size."""
        body, ct = _SCALE_BODIES[label]
        benchmark(parse_multipart, body, ct)

    @pytest.mark.skipif(not _HAS_REF, reason="python-multipart not installed")
    def test_parse_python_multipart(self, benchmark, label):
        """python-multipart parse: scale with payload size."""
        body, ct = _SCALE_BODIES[label]
        benchmark(_ref_parse_multipart, body, ct)

    def test_encode_zerodep(self, benchmark, label):
        """zerodep encode_multipart: scale with payload size."""
        # Build the input outside the benchmarked call so only
        # encode_multipart itself is timed.
        files = self._files_for(label)
        benchmark(encode_multipart, {}, files, boundary="scalebench")

    def test_roundtrip_zerodep(self, benchmark, label):
        """zerodep encode + parse roundtrip: scale with payload size."""
        files = self._files_for(label)

        def roundtrip():
            body, ct = encode_multipart({}, files, boundary="scalebench")
            return parse_multipart(body, ct)

        benchmark(roundtrip)
87 changes: 87 additions & 0 deletions protobuf/test_protobuf_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,3 +514,90 @@ def roundtrip():
return m

benchmark(roundtrip)


# ============================================================================
# Scale curve: vary repeated-field count / message size geometrically
# ============================================================================

# Scale points as (n, label): n is the number of int32 entries placed in the
# repeated `values` field. The values are 0 .. n-1, and protobuf encodes a
# non-negative int32 as a varint (7 payload bits per byte), so each entry here
# takes 1-3 bytes (3 bytes from 16_384 upward). Payload size therefore grows
# roughly linearly with n, but is more than n bytes at the larger points.
_SCALE_PARAMS = [
    (10, "10items"),
    (50, "50items"),
    (100, "100items"),
    (500, "500items"),
    (1_000, "1Kitems"),
    (5_000, "5Kitems"),
    (10_000, "10Kitems"),
    (50_000, "50Kitems"),
]


def _make_repeated_msg(n: int) -> MediumMessage:
    """Return a MediumMessage whose repeated ``values`` field holds 0..n-1.

    All other fields are fixed: id=1, title="scale", score=1.0, tags=["t"].
    """
    field_values = {
        "id": 1,
        "title": "scale",
        "score": 1.0,
        "tags": ["t"],
        "values": [*range(n)],
    }
    return MediumMessage(**field_values)


# Build every message once at import time, keyed by label, so message
# construction is excluded from the encode benchmarks.
# Pre-serialize payloads so encoding cost is excluded from decode benchmarks.
_SCALE_OBJECTS = {label: _make_repeated_msg(n) for n, label in _SCALE_PARAMS}
_SCALE_BYTES = {label: obj.serialize() for label, obj in _SCALE_OBJECTS.items()}


@pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
class TestScaleCurve:
    """Scale curves: vary repeated-field count to reveal encode/decode complexity.

    The class-level parametrization runs every test once per label in
    ``_SCALE_PARAMS``. Each label names the number of repeated int32 items,
    so results can be plotted against message size.
    """

    @staticmethod
    def _google_msg(label):
        """Build the google-protobuf message for the scale point *label*.

        Mirrors ``_make_repeated_msg``: id=1, title="scale", score=1.0,
        tags=["t"], and ``values`` filled with 0..n-1.

        Raises:
            KeyError: if *label* is not listed in ``_SCALE_PARAMS``.
        """
        counts = {lbl: n for n, lbl in _SCALE_PARAMS}
        msg = GMedium(id=1, title="scale", score=1.0)
        msg.tags.extend(["t"])
        # extend() consumes any iterable; no intermediate list is needed.
        msg.values.extend(range(counts[label]))
        return msg

    def test_encode_zerodep(self, benchmark, label):
        """zerodep encode: scale with repeated-field count."""
        obj = _SCALE_OBJECTS[label]
        benchmark(obj.serialize)

    def test_decode_zerodep(self, benchmark, label):
        """zerodep decode: scale with repeated-field count."""
        data = _SCALE_BYTES[label]
        benchmark(MediumMessage.parse, data)

    def test_roundtrip_zerodep(self, benchmark, label):
        """zerodep roundtrip: scale with repeated-field count."""
        obj = _SCALE_OBJECTS[label]

        def roundtrip():
            return MediumMessage.parse(obj.serialize())

        benchmark(roundtrip)

    @pytest.mark.skipif(not HAS_GOOGLE_PB, reason="google-protobuf not installed")
    def test_encode_google(self, benchmark, label):
        """google-protobuf encode: scale with repeated-field count."""
        # The message is built before the benchmark so only serialization is timed.
        msg = self._google_msg(label)
        benchmark(msg.SerializeToString)

    @pytest.mark.skipif(not HAS_GOOGLE_PB, reason="google-protobuf not installed")
    def test_decode_google(self, benchmark, label):
        """google-protobuf decode: scale with repeated-field count."""
        data = self._google_msg(label).SerializeToString()

        def parse():
            m = GMedium()
            m.ParseFromString(data)
            return m

        benchmark(parse)
62 changes: 62 additions & 0 deletions soup/test_soup_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,3 +397,65 @@ def test_beautifulsoup4(self, benchmark):
benchmark(
_bs4_select_fixture, _get_fixture("docs-page"), _FIXTURE_SELECT_QUERIES
)


# ── Scale curve: vary HTML document size (number of nodes) geometrically ──

# Scale points as (n, label): n is the argument passed to _make_html().
# Per the note originally placed here, each generated "item" is a div wrapping
# h3, p and a (4 tags counting the div itself), so a document has ≈ 4*n nodes.
# NOTE(review): the labels reuse n itself ("5nodes" -> n=5), not the ≈ 4*n
# estimate. Confirm against _make_html() before plotting against node count.
_SCALE_PARAMS = [
    (5, "5nodes"),
    (25, "25nodes"),
    (50, "50nodes"),
    (250, "250nodes"),
    (500, "500nodes"),
    (2_500, "2500nodes"),
    (5_000, "5000nodes"),
    (25_000, "25000nodes"),
]

# Generate every document once at import time, keyed by label, so document
# generation is not part of what the benchmarks time.
_SCALE_HTMLS = {label: _make_html(n) for n, label in _SCALE_PARAMS}


@pytest.mark.parametrize("label", [name for _count, name in _SCALE_PARAMS])
class TestScaleCurve:
    """Scale curves over generated HTML documents of increasing size.

    Parametrized at class level: every test runs once per label in
    ``_SCALE_PARAMS`` and benchmarks one helper on the pre-built document
    stored under that label in ``_SCALE_HTMLS``.
    """

    def test_parse_find_zerodep(self, benchmark, label):
        """Benchmark ``_zd_parse_and_find`` on the document for *label*."""
        benchmark(_zd_parse_and_find, _SCALE_HTMLS[label])

    def test_parse_find_bs4(self, benchmark, label):
        """Benchmark ``_bs4_parse_and_find`` on the document for *label*."""
        benchmark(_bs4_parse_and_find, _SCALE_HTMLS[label])

    def test_serialize_zerodep(self, benchmark, label):
        """Benchmark ``_zd_parse_and_serialize`` on the document for *label*."""
        benchmark(_zd_parse_and_serialize, _SCALE_HTMLS[label])

    def test_serialize_bs4(self, benchmark, label):
        """Benchmark ``_bs4_parse_and_serialize`` on the document for *label*."""
        benchmark(_bs4_parse_and_serialize, _SCALE_HTMLS[label])

    def test_css_select_zerodep(self, benchmark, label):
        """Benchmark ``_zd_select`` with ``_SELECT_QUERIES`` on the document for *label*."""
        benchmark(_zd_select, _SCALE_HTMLS[label], _SELECT_QUERIES)

    def test_css_select_bs4(self, benchmark, label):
        """Benchmark ``_bs4_select`` with ``_SELECT_QUERIES`` on the document for *label*."""
        benchmark(_bs4_select, _SCALE_HTMLS[label], _SELECT_QUERIES)
60 changes: 60 additions & 0 deletions yaml/test_yaml_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,63 @@ def test_pyyaml(self, benchmark):
_get_fixture_data("k8s-deployment"),
default_flow_style=False,
)


# ── Scale curve: vary document size geometrically ──


def _make_yaml_doc(n_keys: int) -> str:
"""Generate a flat YAML document with *n_keys* string key-value pairs."""
lines = []
for i in range(n_keys):
lines.append(f"key_{i}: value_string_{i}")
return "\n".join(lines)


# Geometric scale points as (n_keys, label): ~100 B … ~500 KB worth of keys.
# Each key-value line is roughly 25 bytes, so n_keys ≈ target_bytes / 25.
# The label names the approximate document size for that key count.
_SCALE_PARAMS = [
    (4, "100B"),
    (20, "500B"),
    (40, "1KB"),
    (200, "5KB"),
    (400, "10KB"),
    (2_000, "50KB"),
    (4_000, "100KB"),
    (20_000, "500KB"),
]

# Build the YAML text for every scale point once at import time, keyed by label.
_SCALE_DOCS = {label: _make_yaml_doc(n) for n, label in _SCALE_PARAMS}
# Parse each document once with zd_load so the dump benchmarks start from
# ready-made Python data and do not include parsing.
_SCALE_DATA = {label: zd_load(doc) for label, doc in _SCALE_DOCS.items()}


@pytest.mark.parametrize("label", [name for _count, name in _SCALE_PARAMS])
class TestScaleCurve:
    """Scale curves over generated YAML documents of increasing size.

    Parametrized at class level: every test runs once per label in
    ``_SCALE_PARAMS``. Load tests read the text from ``_SCALE_DOCS``; dump
    tests read the pre-parsed data from ``_SCALE_DATA``.
    """

    def test_load_zerodep(self, benchmark, label):
        """Benchmark ``zd_load`` on the document text for *label*."""
        benchmark(zd_load, _SCALE_DOCS[label])

    def test_load_pyyaml(self, benchmark, label):
        """Benchmark ``_pyyaml_safe_load`` on the document text for *label*."""
        benchmark(_pyyaml_safe_load, _SCALE_DOCS[label])

    def test_dump_zerodep(self, benchmark, label):
        """Benchmark ``zd_dump`` on the pre-parsed data for *label*."""
        benchmark(zd_dump, _SCALE_DATA[label])

    def test_dump_pyyaml(self, benchmark, label):
        """Benchmark ``_pyyaml_dump`` (block style) on the pre-parsed data for *label*."""
        benchmark(_pyyaml_dump, _SCALE_DATA[label], default_flow_style=False)
Loading