diff --git a/multipart/test_multipart_benchmark.py b/multipart/test_multipart_benchmark.py
index e137eff..fad1cc0 100644
--- a/multipart/test_multipart_benchmark.py
+++ b/multipart/test_multipart_benchmark.py
@@ -283,3 +283,87 @@ def test_zerodep(self, benchmark):
     def test_python_multipart(self, benchmark):
         body, ct = _get_fixture("large-binary")
         benchmark(_ref_parse_multipart, body, ct)
+
+
+# ── Scale curve: vary number of parts / total payload size geometrically ──
+
+
+def _make_scale_files(n_parts: int, part_size: int) -> dict[str, tuple[str, bytes]]:
+    """Return *n_parts* ``(filename, data)`` entries of exactly *part_size* bytes.
+
+    The data is the 0x00..0xFF byte ramp repeated and cut to *part_size*; bytes
+    are immutable, so every entry can safely share the same payload object.
+    """
+    # The slice must bind to the payload, not to the (filename, data) tuple,
+    # otherwise each part silently grows to the next multiple of 256 bytes.
+    payload = (bytes(range(256)) * (part_size // 256 + 1))[:part_size]
+    return {f"file_{i}": (f"data_{i}.bin", payload) for i in range(n_parts)}
+
+
+def _make_scale_body(n_parts: int, part_size: int) -> tuple[bytes, str]:
+    """Build a multipart body with *n_parts* binary parts of *part_size* bytes each."""
+    files = _make_scale_files(n_parts, part_size)
+    return encode_multipart(fields={}, files=files, boundary="scalebench")
+
+
+# Scale by total payload size: (n_parts, part_size_bytes) → ~target payload.
+# Total ≈ n_parts * part_size (plus multipart framing overhead).
+_SCALE_PARAMS = [
+    (1, 100, "100B"),
+    (1, 500, "500B"),
+    (2, 512, "1KB"),
+    (5, 1_024, "5KB"),
+    (5, 2_048, "10KB"),
+    (10, 5_120, "50KB"),
+    (10, 10_240, "100KB"),
+    (20, 25_000, "500KB"),
+]
+
+# Label → (n_parts, part_size), so a test can rebuild its input from its ID.
+_SCALE_SHAPES: dict[str, tuple[int, int]] = {
+    label: (n_parts, part_size) for n_parts, part_size, label in _SCALE_PARAMS
+}
+
+# Pre-build all payloads so construction cost is excluded from benchmarks.
+_SCALE_BODIES: dict[str, tuple[bytes, str]] = {
+    label: _make_scale_body(*shape) for label, shape in _SCALE_SHAPES.items()
+}
+
+
+class TestScaleCurve:
+    """Scale curves: vary total payload size to reveal parse/encode complexity.
+
+    Each parametrized ID encodes the approximate total payload size so
+    results can be plotted against input scale.
+    """
+
+    @pytest.mark.parametrize("label", [lbl for _, _, lbl in _SCALE_PARAMS])
+    def test_parse_zerodep(self, benchmark, label):
+        """zerodep parse_multipart: scale with payload size."""
+        body, ct = _SCALE_BODIES[label]
+        benchmark(parse_multipart, body, ct)
+
+    @pytest.mark.skipif(not _HAS_REF, reason="python-multipart not installed")
+    @pytest.mark.parametrize("label", [lbl for _, _, lbl in _SCALE_PARAMS])
+    def test_parse_python_multipart(self, benchmark, label):
+        """python-multipart parse: scale with payload size."""
+        body, ct = _SCALE_BODIES[label]
+        benchmark(_ref_parse_multipart, body, ct)
+
+    @pytest.mark.parametrize("label", [lbl for _, _, lbl in _SCALE_PARAMS])
+    def test_encode_zerodep(self, benchmark, label):
+        """zerodep encode_multipart: scale with payload size."""
+        # Build the input outside the timed call so only encoding is measured.
+        files = _make_scale_files(*_SCALE_SHAPES[label])
+        benchmark(encode_multipart, {}, files, boundary="scalebench")
+
+    @pytest.mark.parametrize("label", [lbl for _, _, lbl in _SCALE_PARAMS])
+    def test_roundtrip_zerodep(self, benchmark, label):
+        """zerodep encode + parse roundtrip: scale with payload size."""
+        files = _make_scale_files(*_SCALE_SHAPES[label])
+
+        def roundtrip():
+            body, ct = encode_multipart({}, files, boundary="scalebench")
+            return parse_multipart(body, ct)
+
+        benchmark(roundtrip)
diff --git a/protobuf/test_protobuf_benchmark.py b/protobuf/test_protobuf_benchmark.py
index 97cc86c..8059cf1 100644
--- a/protobuf/test_protobuf_benchmark.py
+++ b/protobuf/test_protobuf_benchmark.py
@@ -514,3 +514,100 @@ def roundtrip():
             return m
 
         benchmark(roundtrip)
+
+
+# ============================================================================
+# Scale curve: vary repeated-field count / message size geometrically
+# ============================================================================
+
+# Scale points: number of int32 entries in the repeated `values` field.
+# Values run 0..n-1 and a varint takes 1 byte below 128, 2 below 16,384 and
+# 3 beyond that, so the payload grows at roughly 1-3 bytes per item (plus a
+# tag per item if the field is not packed).
+_SCALE_PARAMS = [
+    (10, "10items"),
+    (50, "50items"),
+    (100, "100items"),
+    (500, "500items"),
+    (1_000, "1Kitems"),
+    (5_000, "5Kitems"),
+    (10_000, "10Kitems"),
+    (50_000, "50Kitems"),
+]
+
+# Label → item count, so a test can rebuild its message from its ID.
+_SCALE_COUNTS = {label: n for n, label in _SCALE_PARAMS}
+
+
+def _make_repeated_msg(n: int) -> MediumMessage:
+    """Build a MediumMessage with *n* repeated int32 values."""
+    return MediumMessage(
+        id=1,
+        title="scale",
+        score=1.0,
+        tags=["t"],
+        values=list(range(n)),
+    )
+
+
+def _make_google_repeated_msg(n: int):
+    """Build the google-protobuf twin of `_make_repeated_msg(n)`.
+
+    Only call this when HAS_GOOGLE_PB is true; GMedium is assumed to be
+    unavailable otherwise.
+    """
+    msg = GMedium(id=1, title="scale", score=1.0)
+    msg.tags.append("t")
+    msg.values.extend(range(n))
+    return msg
+
+
+# Pre-serialize payloads so encoding cost is excluded from decode benchmarks.
+_SCALE_OBJECTS = {label: _make_repeated_msg(n) for n, label in _SCALE_PARAMS}
+_SCALE_BYTES = {label: obj.serialize() for label, obj in _SCALE_OBJECTS.items()}
+
+
+class TestScaleCurve:
+    """Scale curves: vary repeated-field count to reveal encode/decode complexity.
+
+    Each parametrized ID encodes the number of repeated int32 items so
+    results can be plotted against message size.
+    """
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_encode_zerodep(self, benchmark, label):
+        """zerodep encode: scale with repeated-field count."""
+        obj = _SCALE_OBJECTS[label]
+        benchmark(obj.serialize)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_decode_zerodep(self, benchmark, label):
+        """zerodep decode: scale with repeated-field count."""
+        data = _SCALE_BYTES[label]
+        benchmark(MediumMessage.parse, data)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_roundtrip_zerodep(self, benchmark, label):
+        """zerodep roundtrip: scale with repeated-field count."""
+        obj = _SCALE_OBJECTS[label]
+        benchmark(lambda o=obj: MediumMessage.parse(o.serialize()))
+
+    @pytest.mark.skipif(not HAS_GOOGLE_PB, reason="google-protobuf not installed")
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_encode_google(self, benchmark, label):
+        """google-protobuf encode: scale with repeated-field count."""
+        msg = _make_google_repeated_msg(_SCALE_COUNTS[label])
+        benchmark(msg.SerializeToString)
+
+    @pytest.mark.skipif(not HAS_GOOGLE_PB, reason="google-protobuf not installed")
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_decode_google(self, benchmark, label):
+        """google-protobuf decode: scale with repeated-field count."""
+        data = _make_google_repeated_msg(_SCALE_COUNTS[label]).SerializeToString()
+
+        def parse():
+            m = GMedium()
+            m.ParseFromString(data)
+            return m
+
+        benchmark(parse)
diff --git a/soup/test_soup_benchmark.py b/soup/test_soup_benchmark.py
index b5cd651..b02f2d2 100644
--- a/soup/test_soup_benchmark.py
+++ b/soup/test_soup_benchmark.py
@@ -397,3 +397,67 @@ def test_beautifulsoup4(self, benchmark):
         benchmark(
             _bs4_select_fixture, _get_fixture("docs-page"), _FIXTURE_SELECT_QUERIES
         )
+
+
+# ── Scale curve: vary HTML document size (number of nodes) geometrically ──
+
+# Scale points are the number of "item" blocks requested from _make_html().
+# Each block is a div wrapping three child tags (h3, p, a), i.e. 4 element
+# nodes, so a document holds ≈ 4 * n_items nodes; IDs carry the item count.
+_SCALE_PARAMS = [
+    (5, "5items"),
+    (25, "25items"),
+    (50, "50items"),
+    (250, "250items"),
+    (500, "500items"),
+    (2_500, "2500items"),
+    (5_000, "5000items"),
+    (25_000, "25000items"),
+]
+
+_SCALE_HTMLS = {label: _make_html(n) for n, label in _SCALE_PARAMS}
+
+
+class TestScaleCurve:
+    """Scale curves: vary HTML document size to reveal parse/find complexity.
+
+    Each parametrized ID encodes the number of item blocks in the document
+    (about four element nodes each) so results can be plotted against input
+    scale.
+    """
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_parse_find_zerodep(self, benchmark, label):
+        """zerodep parse + find_all: scale with item count."""
+        html = _SCALE_HTMLS[label]
+        benchmark(_zd_parse_and_find, html)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_parse_find_bs4(self, benchmark, label):
+        """BeautifulSoup4 parse + find_all: scale with item count."""
+        html = _SCALE_HTMLS[label]
+        benchmark(_bs4_parse_and_find, html)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_serialize_zerodep(self, benchmark, label):
+        """zerodep parse + to_html serialize: scale with item count."""
+        html = _SCALE_HTMLS[label]
+        benchmark(_zd_parse_and_serialize, html)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_serialize_bs4(self, benchmark, label):
+        """BeautifulSoup4 parse + str(): scale with item count."""
+        html = _SCALE_HTMLS[label]
+        benchmark(_bs4_parse_and_serialize, html)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_css_select_zerodep(self, benchmark, label):
+        """zerodep CSS select: scale with item count."""
+        html = _SCALE_HTMLS[label]
+        benchmark(_zd_select, html, _SELECT_QUERIES)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_css_select_bs4(self, benchmark, label):
+        """BeautifulSoup4 CSS select: scale with item count."""
+        html = _SCALE_HTMLS[label]
+        benchmark(_bs4_select, html, _SELECT_QUERIES)
diff --git a/yaml/test_yaml_benchmark.py b/yaml/test_yaml_benchmark.py
index 203da7e..d968528 100644
--- a/yaml/test_yaml_benchmark.py
+++ b/yaml/test_yaml_benchmark.py
@@ -261,3 +261,64 @@ def test_pyyaml(self, benchmark):
             _get_fixture_data("k8s-deployment"),
             default_flow_style=False,
         )
+
+
+# ── Scale curve: vary document size geometrically ──
+
+
+def _make_yaml_doc(n_keys: int) -> str:
+    """Generate a flat YAML document with *n_keys* string key-value pairs.
+
+    Lines are separated by single newlines with no trailing newline, so an
+    *n_keys* of zero yields an empty string.
+    """
+    return "\n".join(f"key_{i}: value_string_{i}" for i in range(n_keys))
+
+
+# Geometric scale points: ~100 B … ~500 KB worth of keys.
+# Each key-value line is roughly 25 bytes, so n_keys ≈ target_bytes / 25.
+_SCALE_PARAMS = [
+    (4, "100B"),
+    (20, "500B"),
+    (40, "1KB"),
+    (200, "5KB"),
+    (400, "10KB"),
+    (2_000, "50KB"),
+    (4_000, "100KB"),
+    (20_000, "500KB"),
+]
+
+_SCALE_DOCS = {label: _make_yaml_doc(n) for n, label in _SCALE_PARAMS}
+_SCALE_DATA = {label: zd_load(doc) for label, doc in _SCALE_DOCS.items()}
+
+
+class TestScaleCurve:
+    """Scale curves: vary document size to reveal complexity behaviour.
+
+    Each parametrized ID encodes the approximate byte size of the YAML
+    document so results can be plotted against input size.
+    """
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_load_zerodep(self, benchmark, label):
+        """zerodep load: scale with document size."""
+        doc = _SCALE_DOCS[label]
+        benchmark(zd_load, doc)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_load_pyyaml(self, benchmark, label):
+        """PyYAML safe_load: scale with document size."""
+        doc = _SCALE_DOCS[label]
+        benchmark(_pyyaml_safe_load, doc)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_dump_zerodep(self, benchmark, label):
+        """zerodep dump: scale with data size."""
+        data = _SCALE_DATA[label]
+        benchmark(zd_dump, data)
+
+    @pytest.mark.parametrize("label", [lbl for _, lbl in _SCALE_PARAMS])
+    def test_dump_pyyaml(self, benchmark, label):
+        """PyYAML dump: scale with data size."""
+        data = _SCALE_DATA[label]
+        benchmark(_pyyaml_dump, data, default_flow_style=False)