From 62aec8d5676f81582f623a45409bb16f5d098613 Mon Sep 17 00:00:00 2001 From: Peng Ding Date: Mon, 18 May 2026 07:05:25 -0500 Subject: [PATCH] feat(bench): add tracemalloc memory benchmarks for parser modules (closes #84, closes #85) Add test_<module>_memory.py for soup, yaml, xml, multipart, readability, and protobuf. Each file uses stdlib tracemalloc to record peak heap allocation (reported in KB) for S/M/L inputs, with zerodep-vs-reference comparison tests where the reference library is available. No pytest-benchmark dependency; plain pytest with -s prints results inline. --- multipart/test_multipart_memory.py | 225 ++++++++++++++++++++ protobuf/test_protobuf_memory.py | 272 +++++++++++++++++++++++++ readability/test_readability_memory.py | 225 ++++++++++++++++++++ soup/test_soup_memory.py | 110 ++++++++++ xml/test_xml_memory.py | 200 ++++++++++++++++++ yaml/test_yaml_memory.py | 206 +++++++++++++++++++ 6 files changed, 1238 insertions(+) create mode 100644 multipart/test_multipart_memory.py create mode 100644 protobuf/test_protobuf_memory.py create mode 100644 readability/test_readability_memory.py create mode 100644 soup/test_soup_memory.py create mode 100644 xml/test_xml_memory.py create mode 100644 yaml/test_yaml_memory.py diff --git a/multipart/test_multipart_memory.py b/multipart/test_multipart_memory.py new file mode 100644 index 0000000..e892c20 --- /dev/null +++ b/multipart/test_multipart_memory.py @@ -0,0 +1,225 @@ +"""Memory benchmarks: zerodep multipart vs python-multipart. + +Uses tracemalloc to measure peak heap allocation for parse_multipart +and encode_multipart at three input sizes (S/M/L). Results are +printed in KB so they are visible in plain ``pytest -s`` output. No +pytest-benchmark required. + +The python-multipart reference library is loaded via site-packages path +inspection to avoid the name collision with our local module (same +technique as the time benchmark). 
+""" + +import importlib +import importlib.util +import os +import sys +import tracemalloc + +import pytest + +# ── Import python-multipart reference via direct path loading ── + +_HAS_REF = False +_ref_mod = None +try: + for _p in sys.path: + if "site-packages" not in _p: + continue + _pkg_dir = os.path.join(_p, "multipart") + _ref_file = os.path.join(_pkg_dir, "multipart.py") + if os.path.isfile(_ref_file): + _spec = importlib.util.spec_from_file_location( + "multipart_reference", _ref_file + ) + if _spec and _spec.loader: + _ref_mod = importlib.util.module_from_spec(_spec) + _spec.loader.exec_module(_ref_mod) + _HAS_REF = True + break +except Exception: + pass + +# ── Import our module ── + +sys.path.insert(0, os.path.dirname(__file__)) + +from multipart import encode_multipart, parse_multipart # noqa: E402 + +# ── Reference parser adapter ── + + +def _ref_parse_multipart(body: bytes, content_type: str) -> list: + """Parse multipart body using python-multipart as reference.""" + if not _HAS_REF: + pytest.skip("python-multipart not installed") + + from io import BytesIO + + _multipart_mod = _ref_mod + + content_type_bytes = content_type.encode("latin-1") + _, options = _multipart_mod.parse_options_header(content_type_bytes) + boundary = options.get(b"boundary", b"") + + parts: list[dict] = [] + current_part: dict = {} + current_data = BytesIO() + + def on_part_begin(): + nonlocal current_part, current_data + current_part = {"headers": {}} + current_data = BytesIO() + + def on_part_data(data: bytes, start: int, end: int): + current_data.write(data[start:end]) + + def on_part_end(): + current_part["data"] = current_data.getvalue() + parts.append(current_part) + + def on_header_field(data: bytes, start: int, end: int): + current_part["_header_field"] = data[start:end].decode("latin-1") + + def on_header_value(data: bytes, start: int, end: int): + fld = current_part.pop("_header_field", "") + current_part["headers"][fld.lower()] = data[start:end].decode("latin-1") + 
+ callbacks = { + "on_part_begin": on_part_begin, + "on_part_data": on_part_data, + "on_part_end": on_part_end, + "on_header_field": on_header_field, + "on_header_value": on_header_value, + } + + parser = _multipart_mod.MultipartParser(boundary, callbacks) + parser.write(body) + parser.finalize() + + return parts + + +# ── Test data ── + + +def _make_small_body() -> tuple[bytes, str]: + """Small payload: 3 text fields (~200 bytes).""" + return encode_multipart( + fields={"name": "Alice", "age": "30", "city": "Wonderland"}, + boundary="benchboundary", + ) + + +def _make_medium_body() -> tuple[bytes, str]: + """Medium payload: 5 text fields + 2 small files (~10 KB).""" + fields = {f"field_{i}": f"value_{i} " * 20 for i in range(5)} + files = { + "file1": ("report.txt", b"x" * 2048, "text/plain"), + "file2": ("data.bin", b"\x00\xff" * 2048, "application/octet-stream"), + } + return encode_multipart(fields=fields, files=files, boundary="benchboundary") + + +def _make_large_body() -> tuple[bytes, str]: + """Large payload: 10 text fields + 5 files (~500 KB).""" + fields = {f"field_{i}": f"value_{i} " * 100 for i in range(10)} + files = { + f"file_{i}": ( + f"upload_{i}.bin", + bytes(range(256)) * 313, # ~80 KB deterministic, no os.urandom + "application/octet-stream", + ) + for i in range(5) + } + return encode_multipart(fields=fields, files=files, boundary="benchboundary") + + +# Pre-build payloads so generation cost is excluded from measurements. 
+SMALL_BODY, SMALL_CT = _make_small_body() +MEDIUM_BODY, MEDIUM_CT = _make_medium_body() +LARGE_BODY, LARGE_CT = _make_large_body() + +SMALL_FIELDS = {"name": "Alice", "age": "30", "city": "Wonderland"} +MEDIUM_FIELDS = {f"field_{i}": f"value_{i} " * 20 for i in range(5)} +MEDIUM_FILES = { + "file1": ("report.txt", b"x" * 2048, "text/plain"), + "file2": ("data.bin", b"\x00\xff" * 2048, "application/octet-stream"), +} +LARGE_FIELDS = {f"field_{i}": f"value_{i} " * 100 for i in range(10)} + + +# ── Helpers ── + + +def _measure_peak_kb(fn, *args, **kwargs) -> float: + """Run *fn* with *args*/*kwargs* under tracemalloc and return peak KB.""" + tracemalloc.start() + try: + fn(*args, **kwargs) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + return peak / 1024 + + +_BODY_SIZES = [ + pytest.param("small", SMALL_BODY, SMALL_CT, id="small"), + pytest.param("medium", MEDIUM_BODY, MEDIUM_CT, id="medium"), + pytest.param("large", LARGE_BODY, LARGE_CT, id="large"), +] + + +# ── Parse memory tests ── + + +@pytest.mark.parametrize("label,body,ct", _BODY_SIZES) +def test_parse_memory_zerodep(label: str, body: bytes, ct: str) -> None: + """Measure peak memory for zerodep parse_multipart.""" + peak_kb = _measure_peak_kb(parse_multipart, body, ct) + print(f"\n[multipart parse zerodep {label:6s}] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(not _HAS_REF, reason="python-multipart not installed") +@pytest.mark.parametrize("label,body,ct", _BODY_SIZES) +def test_parse_memory_python_multipart(label: str, body: bytes, ct: str) -> None: + """Measure peak memory for python-multipart parse.""" + peak_kb = _measure_peak_kb(_ref_parse_multipart, body, ct) + print(f"\n[multipart parse reference {label:6s}] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(not _HAS_REF, reason="python-multipart not installed") +@pytest.mark.parametrize("label,body,ct", _BODY_SIZES) +def test_parse_memory_comparison(label: 
str, body: bytes, ct: str) -> None: + """Compare zerodep vs python-multipart peak memory for parse.""" + zd_kb = _measure_peak_kb(parse_multipart, body, ct) + ref_kb = _measure_peak_kb(_ref_parse_multipart, body, ct) + ratio = zd_kb / ref_kb if ref_kb > 0 else float("inf") + print( + f"\n[multipart parse compare {label:6s}] zerodep={zd_kb:.1f} KB " + f"reference={ref_kb:.1f} KB ratio={ratio:.2f}x" + ) + assert zd_kb >= 0 + assert ref_kb >= 0 + + +# ── Encode memory tests ── + + +@pytest.mark.parametrize( + "label,fields,files", + [ + pytest.param("small", SMALL_FIELDS, None, id="small"), + pytest.param("medium", MEDIUM_FIELDS, MEDIUM_FILES, id="medium"), + ], +) +def test_encode_memory_zerodep(label: str, fields: dict, files) -> None: + """Measure peak memory for zerodep encode_multipart.""" + kwargs = {"fields": fields, "boundary": "benchboundary"} + if files is not None: + kwargs["files"] = files + peak_kb = _measure_peak_kb(encode_multipart, **kwargs) + print(f"\n[multipart encode zerodep {label:6s}] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 diff --git a/protobuf/test_protobuf_memory.py b/protobuf/test_protobuf_memory.py new file mode 100644 index 0000000..ae38078 --- /dev/null +++ b/protobuf/test_protobuf_memory.py @@ -0,0 +1,272 @@ +"""Memory benchmarks: zerodep protobuf encode/decode. + +Uses tracemalloc to measure peak heap allocation for encode (serialize) +and decode (parse) at three message sizes (S/M/L). Results are printed +in KB so they are visible in plain ``pytest -s`` output. No +pytest-benchmark required. + +The google-protobuf reference library is optional. When absent, only +zerodep measurements are collected and comparison tests are skipped. 
+""" + +import os +import sys +import tracemalloc + +import pytest + +sys.path.insert(0, os.path.dirname(__file__)) + +from protobuf import ( # noqa: E402 + bool_, + double, + field, + int32, + map_field, + message, + repeated, + uint64, +) + +# ── Google protobuf reference (optional) ── + +try: + from google.protobuf import descriptor_pb2 as _dpb2 + from google.protobuf import descriptor_pool as _pool + + HAS_GOOGLE_PB = True +except ImportError: + HAS_GOOGLE_PB = False + + +def _build_google_messages(): + """Dynamically build google-protobuf message classes equivalent to zerodep ones.""" + from google.protobuf import message_factory + + pool = _pool.DescriptorPool() + + file_dp = _dpb2.FileDescriptorProto( + name="bench_mem.proto", + package="benchmem", + syntax="proto3", + ) + + # SmallMessage: name(1)=string, value(2)=int32, flag(3)=bool + sm = file_dp.message_type.add() + sm.name = "SmallMessage" + for fname, fnum, ftype in [ + ("name", 1, _dpb2.FieldDescriptorProto.TYPE_STRING), + ("value", 2, _dpb2.FieldDescriptorProto.TYPE_INT32), + ("flag", 3, _dpb2.FieldDescriptorProto.TYPE_BOOL), + ]: + f = sm.field.add() + f.name, f.number, f.type = fname, fnum, ftype + f.label = _dpb2.FieldDescriptorProto.LABEL_OPTIONAL + + # LargeMessage: id(1)=uint64, name(2)=string, scores(3)=repeated double, + # active(4)=bool + lm = file_dp.message_type.add() + lm.name = "LargeMessage" + for fname, fnum, ftype, flabel in [ + ( + "id", + 1, + _dpb2.FieldDescriptorProto.TYPE_UINT64, + _dpb2.FieldDescriptorProto.LABEL_OPTIONAL, + ), + ( + "name", + 2, + _dpb2.FieldDescriptorProto.TYPE_STRING, + _dpb2.FieldDescriptorProto.LABEL_OPTIONAL, + ), + ( + "scores", + 3, + _dpb2.FieldDescriptorProto.TYPE_DOUBLE, + _dpb2.FieldDescriptorProto.LABEL_REPEATED, + ), + ( + "active", + 4, + _dpb2.FieldDescriptorProto.TYPE_BOOL, + _dpb2.FieldDescriptorProto.LABEL_OPTIONAL, + ), + ]: + f = lm.field.add() + f.name, f.number, f.type, f.label = fname, fnum, ftype, flabel + + fd = pool.Add(file_dp) + 
factory = message_factory.MessageFactory(pool=pool) + classes = {} + for msg_name in ["SmallMessage", "LargeMessage"]: + desc = fd.message_types_by_name[msg_name] + classes[msg_name] = factory.GetPrototype(desc) + return classes + + +if HAS_GOOGLE_PB: + try: + _goog_classes = _build_google_messages() + _GoogSmall = _goog_classes["SmallMessage"] + _GoogLarge = _goog_classes["LargeMessage"] + except Exception: + HAS_GOOGLE_PB = False + _GoogSmall = None + _GoogLarge = None +else: + _GoogSmall = None + _GoogLarge = None + +# ── zerodep message definitions ── + + +@message +class SmallMessage: + name: str = field(1) + value: int32 = field(2) + flag: bool_ = field(3) + + +@message +class MediumMessage: + id: uint64 = field(1) + title: str = field(2) + score: double = field(3) + tags: repeated[str] = field(4) + values: repeated[int32] = field(5) + + +@message +class InnerMessage: + x: int32 = field(1) + y: int32 = field(2) + label: str = field(3) + + +@message +class LargeMessage: + id: uint64 = field(1) + name: str = field(2) + items: repeated[InnerMessage] = field(3) + metadata: map_field[str, str] = field(4) + scores: repeated[double] = field(5) + active: bool_ = field(6) + + +# ── Pre-built objects ── + +_SMALL_OBJ = SmallMessage(name="benchmark", value=42, flag=True) +_MEDIUM_OBJ = MediumMessage( + id=123456789, + title="A medium-sized message for benchmarking", + score=3.14159, + tags=["alpha", "beta", "gamma", "delta"], + values=list(range(100)), +) +_LARGE_OBJ = LargeMessage( + id=987654321, + name="Large benchmark message", + items=[InnerMessage(x=i, y=i * 2, label=f"item_{i}") for i in range(50)], + metadata={f"key_{i}": f"value_{i}" for i in range(20)}, + scores=[float(i) * 0.1 for i in range(100)], + active=True, +) + +_SMALL_BYTES = _SMALL_OBJ.serialize() +_MEDIUM_BYTES = _MEDIUM_OBJ.serialize() +_LARGE_BYTES = _LARGE_OBJ.serialize() + + +# ── Helpers ── + + +def _measure_peak_kb(fn, *args, **kwargs) -> float: + """Run *fn* with *args*/*kwargs* under 
tracemalloc and return peak KB.""" + tracemalloc.start() + try: + fn(*args, **kwargs) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + return peak / 1024 + + +_ENC_SIZES = [ + pytest.param("small", _SMALL_OBJ, id="small"), + pytest.param("medium", _MEDIUM_OBJ, id="medium"), + pytest.param("large", _LARGE_OBJ, id="large"), +] + +_DEC_SIZES = [ + pytest.param("small", SmallMessage, _SMALL_BYTES, id="small"), + pytest.param("medium", MediumMessage, _MEDIUM_BYTES, id="medium"), + pytest.param("large", LargeMessage, _LARGE_BYTES, id="large"), +] + +_RT_SIZES = [ + pytest.param("small", _SMALL_OBJ, SmallMessage, id="small"), + pytest.param("medium", _MEDIUM_OBJ, MediumMessage, id="medium"), + pytest.param("large", _LARGE_OBJ, LargeMessage, id="large"), +] + + +# ── Encode memory tests ── + + +@pytest.mark.parametrize("label,obj", _ENC_SIZES) +def test_encode_memory_zerodep(label: str, obj) -> None: + """Measure peak memory for zerodep protobuf encode.""" + peak_kb = _measure_peak_kb(obj.serialize) + print(f"\n[protobuf encode zerodep {label:6s}] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(not HAS_GOOGLE_PB, reason="google-protobuf not installed") +def test_encode_memory_google_small() -> None: + """Measure peak memory for google-protobuf encode (small).""" + obj = _GoogSmall(name="benchmark", value=42, flag=True) + + peak_kb = _measure_peak_kb(obj.SerializeToString) + print(f"\n[protobuf encode google small ] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(not HAS_GOOGLE_PB, reason="google-protobuf not installed") +def test_encode_memory_google_large() -> None: + """Measure peak memory for google-protobuf encode (large).""" + obj = _GoogLarge( + id=987654321, + name="Large benchmark message", + scores=[float(i) * 0.1 for i in range(100)], + active=True, + ) + peak_kb = _measure_peak_kb(obj.SerializeToString) + print(f"\n[protobuf encode google large ] peak memory: {peak_kb:.1f} 
KB") + assert peak_kb >= 0 + + +# ── Decode memory tests ── + + +@pytest.mark.parametrize("label,cls,data", _DEC_SIZES) +def test_decode_memory_zerodep(label: str, cls, data: bytes) -> None: + """Measure peak memory for zerodep protobuf decode.""" + peak_kb = _measure_peak_kb(cls.parse, data) + print(f"\n[protobuf decode zerodep {label:6s}] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +# ── Roundtrip memory tests ── + + +@pytest.mark.parametrize("label,obj,cls", _RT_SIZES) +def test_roundtrip_memory_zerodep(label: str, obj, cls) -> None: + """Measure peak memory for zerodep protobuf roundtrip.""" + + def roundtrip(): + return cls.parse(obj.serialize()) + + peak_kb = _measure_peak_kb(roundtrip) + print(f"\n[protobuf roundtrip zerodep {label:6s}] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 diff --git a/readability/test_readability_memory.py b/readability/test_readability_memory.py new file mode 100644 index 0000000..e844f7c --- /dev/null +++ b/readability/test_readability_memory.py @@ -0,0 +1,225 @@ +"""Memory benchmarks: zerodep readability vs readability-lxml. + +Uses tracemalloc to measure peak heap allocation for HTML content +extraction at three fixture size tiers (S/M/L from Mozilla test pages). +Results are printed in KB so they are visible in plain ``pytest -s`` +output. No pytest-benchmark required. 
+""" + +import importlib +import os +import sys +import tracemalloc + +import pytest + +sys.path.insert(0, os.path.dirname(__file__)) + +from readability import extract # noqa: E402 + +# ── readability-lxml reference ── + +try: + from importlib.metadata import version as _pkg_version + + _pkg_version("readability-lxml") + _HAS_REFERENCE = True +except Exception: + _HAS_REFERENCE = False + + +def _load_reference_document_class(): + """Load readability-lxml's Document class, working around name clash.""" + saved_path = sys.path[:] + saved_modules = { + k: sys.modules.pop(k) + for k in list(sys.modules) + if k == "readability" or k.startswith("readability.") + } + try: + this_dir = os.path.dirname(__file__) + this_abs = os.path.abspath(this_dir) + sys.path = [p for p in sys.path if os.path.abspath(p) != this_abs] + mod = importlib.import_module("readability") + return mod.Document + finally: + sys.path = saved_path + for k in list(sys.modules): + if k == "readability" or k.startswith("readability."): + del sys.modules[k] + sys.modules.update(saved_modules) + + +if _HAS_REFERENCE: + _RefDocument = _load_reference_document_class() +else: + _RefDocument = None + + +# ── Test fixtures: Mozilla Readability test pages ── + +_TEST_PAGES_DIR = os.path.join(os.path.dirname(__file__), "test-pages") + +_SMALL_FIXTURES = ["rtl-1", "basic-tags-cleaning", "003-metadata-preferred"] +_MEDIUM_FIXTURES = ["001", "ars-1"] +_LARGE_FIXTURES = ["cnn", "bbc-1", "guardian-1"] + + +def _fixture_path(name: str) -> str: + return os.path.join(_TEST_PAGES_DIR, name, "source.html") + + +def _pick_available(names: list) -> str | None: + """Return the first available fixture name, or None.""" + for name in names: + if os.path.isfile(_fixture_path(name)): + return name + return None + + +_FIXTURE_CACHE: dict[str, str] = {} + + +def _get_fixture(name: str) -> str: + """Get cached fixture HTML.""" + if name not in _FIXTURE_CACHE: + with open(_fixture_path(name), encoding="utf-8") as f: + 
_FIXTURE_CACHE[name] = f.read() + return _FIXTURE_CACHE[name] + + +# Resolve one fixture per size tier (skip entire tier if none available). +_SMALL = _pick_available(_SMALL_FIXTURES) +_MEDIUM = _pick_available(_MEDIUM_FIXTURES) +_LARGE = _pick_available(_LARGE_FIXTURES) + + +# ── Helpers ── + + +def _measure_peak_kb(fn, *args, **kwargs) -> float: + """Run *fn* with *args*/*kwargs* under tracemalloc and return peak KB.""" + tracemalloc.start() + try: + fn(*args, **kwargs) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + return peak / 1024 + + +def _zd_extract(html: str) -> None: + extract(html) + + +def _ref_extract(html: str) -> None: + doc = _RefDocument(html) + doc.summary() + + +# ── Memory tests ── + + +@pytest.mark.skipif(_SMALL is None, reason="no small fixture available") +def test_extract_memory_zerodep_small() -> None: + """Measure peak memory for zerodep extract on a small fixture.""" + html = _get_fixture(_SMALL) + peak_kb = _measure_peak_kb(_zd_extract, html) + print(f"\n[readability zerodep small ] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(_MEDIUM is None, reason="no medium fixture available") +def test_extract_memory_zerodep_medium() -> None: + """Measure peak memory for zerodep extract on a medium fixture.""" + html = _get_fixture(_MEDIUM) + peak_kb = _measure_peak_kb(_zd_extract, html) + print(f"\n[readability zerodep medium] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(_LARGE is None, reason="no large fixture available") +def test_extract_memory_zerodep_large() -> None: + """Measure peak memory for zerodep extract on a large fixture.""" + html = _get_fixture(_LARGE) + peak_kb = _measure_peak_kb(_zd_extract, html) + print(f"\n[readability zerodep large ] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(not _HAS_REFERENCE, reason="readability-lxml not installed") +@pytest.mark.skipif(_SMALL is None, reason="no small 
fixture available") +def test_extract_memory_reference_small() -> None: + """Measure peak memory for readability-lxml on a small fixture.""" + html = _get_fixture(_SMALL) + peak_kb = _measure_peak_kb(_ref_extract, html) + print(f"\n[readability lxml small ] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(not _HAS_REFERENCE, reason="readability-lxml not installed") +@pytest.mark.skipif(_MEDIUM is None, reason="no medium fixture available") +def test_extract_memory_reference_medium() -> None: + """Measure peak memory for readability-lxml on a medium fixture.""" + html = _get_fixture(_MEDIUM) + peak_kb = _measure_peak_kb(_ref_extract, html) + print(f"\n[readability lxml medium] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(not _HAS_REFERENCE, reason="readability-lxml not installed") +@pytest.mark.skipif(_LARGE is None, reason="no large fixture available") +def test_extract_memory_reference_large() -> None: + """Measure peak memory for readability-lxml on a large fixture.""" + html = _get_fixture(_LARGE) + peak_kb = _measure_peak_kb(_ref_extract, html) + print(f"\n[readability lxml large ] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.skipif(not _HAS_REFERENCE, reason="readability-lxml not installed") +@pytest.mark.skipif(_SMALL is None, reason="no small fixture available") +def test_extract_memory_comparison_small() -> None: + """Compare zerodep vs readability-lxml peak memory on small fixture.""" + html = _get_fixture(_SMALL) + zd_kb = _measure_peak_kb(_zd_extract, html) + ref_kb = _measure_peak_kb(_ref_extract, html) + ratio = zd_kb / ref_kb if ref_kb > 0 else float("inf") + print( + f"\n[readability compare small ] zerodep={zd_kb:.1f} KB " + f"lxml={ref_kb:.1f} KB ratio={ratio:.2f}x" + ) + assert zd_kb >= 0 + assert ref_kb >= 0 + + +@pytest.mark.skipif(not _HAS_REFERENCE, reason="readability-lxml not installed") +@pytest.mark.skipif(_MEDIUM is None, reason="no medium fixture 
available") +def test_extract_memory_comparison_medium() -> None: + """Compare zerodep vs readability-lxml peak memory on medium fixture.""" + html = _get_fixture(_MEDIUM) + zd_kb = _measure_peak_kb(_zd_extract, html) + ref_kb = _measure_peak_kb(_ref_extract, html) + ratio = zd_kb / ref_kb if ref_kb > 0 else float("inf") + print( + f"\n[readability compare medium] zerodep={zd_kb:.1f} KB " + f"lxml={ref_kb:.1f} KB ratio={ratio:.2f}x" + ) + assert zd_kb >= 0 + assert ref_kb >= 0 + + +@pytest.mark.skipif(not _HAS_REFERENCE, reason="readability-lxml not installed") +@pytest.mark.skipif(_LARGE is None, reason="no large fixture available") +def test_extract_memory_comparison_large() -> None: + """Compare zerodep vs readability-lxml peak memory on large fixture.""" + html = _get_fixture(_LARGE) + zd_kb = _measure_peak_kb(_zd_extract, html) + ref_kb = _measure_peak_kb(_ref_extract, html) + ratio = zd_kb / ref_kb if ref_kb > 0 else float("inf") + print( + f"\n[readability compare large ] zerodep={zd_kb:.1f} KB " + f"lxml={ref_kb:.1f} KB ratio={ratio:.2f}x" + ) + assert zd_kb >= 0 + assert ref_kb >= 0 diff --git a/soup/test_soup_memory.py b/soup/test_soup_memory.py new file mode 100644 index 0000000..63b1726 --- /dev/null +++ b/soup/test_soup_memory.py @@ -0,0 +1,110 @@ +"""Memory benchmarks: zerodep soup vs beautifulsoup4. + +Uses tracemalloc to measure peak heap allocation for parse + find_all +at three input sizes (S/M/L). Results are printed in KB so they are +visible in plain ``pytest -s`` output. No pytest-benchmark required. 
+""" + +import os +import sys +import tracemalloc + +import pytest + +sys.path.insert(0, os.path.dirname(__file__)) + +from soup import Soup + +bs4 = pytest.importorskip("bs4", reason="beautifulsoup4 not installed") +BeautifulSoup = bs4.BeautifulSoup + + +# ── Test data ── + + +def _make_html(n_tags: int) -> str: + """Generate an HTML document with *n_tags* leaf elements.""" + lines = [ + "", + "Benchmark", + ] + for i in range(n_tags): + cls = "even" if i % 2 == 0 else "odd" + lines.append( + f'
' + f"

Title {i}

" + f"

Description for item {i}

" + f'Link' + f"
" + ) + lines.append("") + return "\n".join(lines) + + +SMALL = _make_html(5) +MEDIUM = _make_html(50) +LARGE = _make_html(500) + + +# ── Helpers ── + + +def _measure_peak_kb(fn, *args) -> float: + """Run *fn* with *args* under tracemalloc and return peak KB.""" + tracemalloc.start() + try: + fn(*args) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + return peak / 1024 + + +def _zd_parse_and_find(html: str) -> list: + soup = Soup(html) + return soup.find_all("div", class_="item") + + +def _bs4_parse_and_find(html: str) -> list: + soup = BeautifulSoup(html, "html.parser") + return soup.find_all("div", class_="item") + + +# ── Memory tests ── + + +_SIZES = [ + pytest.param("small", SMALL, id="small"), + pytest.param("medium", MEDIUM, id="medium"), + pytest.param("large", LARGE, id="large"), +] + + +@pytest.mark.parametrize("label,html", _SIZES) +def test_memory_zerodep(label: str, html: str) -> None: + """Measure peak memory for zerodep Soup parse + find_all.""" + peak_kb = _measure_peak_kb(_zd_parse_and_find, html) + print(f"\n[soup zerodep {label:6s}] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.parametrize("label,html", _SIZES) +def test_memory_beautifulsoup4(label: str, html: str) -> None: + """Measure peak memory for BeautifulSoup parse + find_all.""" + peak_kb = _measure_peak_kb(_bs4_parse_and_find, html) + print(f"\n[soup bs4 {label:6s}] peak memory: {peak_kb:.1f} KB") + assert peak_kb >= 0 + + +@pytest.mark.parametrize("label,html", _SIZES) +def test_memory_comparison(label: str, html: str) -> None: + """Compare zerodep vs bs4 peak memory; print ratio.""" + zd_kb = _measure_peak_kb(_zd_parse_and_find, html) + bs4_kb = _measure_peak_kb(_bs4_parse_and_find, html) + ratio = zd_kb / bs4_kb if bs4_kb > 0 else float("inf") + print( + f"\n[soup compare {label:6s}] zerodep={zd_kb:.1f} KB " + f"bs4={bs4_kb:.1f} KB ratio={ratio:.2f}x" + ) + assert zd_kb >= 0 + assert bs4_kb >= 0 diff --git a/xml/test_xml_memory.py 
b/xml/test_xml_memory.py new file mode 100644 index 0000000..9164f54 --- /dev/null +++ b/xml/test_xml_memory.py @@ -0,0 +1,200 @@ +"""Memory benchmarks: zerodep xml vs xmltodict. + +Uses tracemalloc to measure peak heap allocation for parse and unparse +at three input sizes (S/M/L). Results are printed in KB so they are +visible in plain ``pytest -s`` output. No pytest-benchmark required. + +Note: our ``xml.py`` shadows stdlib ``xml`` on sys.path, so xmltodict +must be imported with path manipulation (same technique as the time +benchmark). +""" + +import os +import sys +import tracemalloc + +import pytest + +# ── Import xmltodict before our module shadows stdlib xml ── + +_this_dir = os.path.dirname(__file__) + +_saved_path = sys.path[:] +sys.path = [ + p + for p in sys.path + if os.path.abspath(p) + not in ( + os.path.abspath(_this_dir), + os.path.abspath(os.path.join(_this_dir, "..")), + ) +] +_cached_xml = sys.modules.pop("xml", None) +_cached_xml_sub = {} +for _k in list(sys.modules): + if _k.startswith("xml."): + _cached_xml_sub[_k] = sys.modules.pop(_k) + +try: + import xmltodict as _xmltodict + + if not hasattr(_xmltodict, "parse"): + raise ImportError("Not the real xmltodict") + _ref_parse = _xmltodict.parse + _ref_unparse = _xmltodict.unparse +except ImportError: + pytest.skip("xmltodict not installed", allow_module_level=True) +finally: + sys.path = _saved_path + for _k in list(sys.modules): + if _k == "xml" or _k.startswith("xml."): + del sys.modules[_k] + sys.modules.update(_cached_xml_sub) + if _cached_xml is not None: + sys.modules["xml"] = _cached_xml + +# Now import our module. 
+sys.path.insert(0, _this_dir)
+# Evict every cached ``xml`` / ``xml.*`` module so the imports below are
+# resolved afresh with _this_dir at the front of sys.path.
+for _k in list(sys.modules):
+    if _k == "xml" or _k.startswith("xml."):
+        del sys.modules[_k]
+
+from xml import parse as zd_parse  # noqa: E402
+from xml import unparse as zd_unparse  # noqa: E402
+
+# ── Test data (same as time benchmark) ──
+
+# NOTE(review): the three literals below contain no XML markup in this copy
+# of the patch — confirm them against the time benchmark before relying on
+# the numbers these tests print.
+SMALL_XML = "Alice30true"
+
+MEDIUM_XML = (
+    """\
+
+
+"""
+    + "".join(
+        f" \n"
+        f" https://example.com/page-{i}\n"
+        f" 2024-{(i % 12) + 1:02d}-01\n"
+        f" weekly\n"
+        f" {0.5 + (i % 5) * 0.1:.1f}\n"
+        f" \n"
+        for i in range(25)
+    )
+    + ""
+)
+
+LARGE_XML = (
+    "\n"
+    + "".join(
+        f' \n'
+        f" Product {i}\n"
+        f" {9.99 + i * 0.5:.2f}\n"
+        f" Description for product {i}\n"
+        f" \n"
+        f" tag-{i % 5}\n"
+        f" tag-{(i + 1) % 5}\n"
+        f" \n"
+        f" {'true' if i % 3 != 0 else 'false'}\n"
+        f" \n"
+        for i in range(200)
+    )
+    + ""
+)
+
+# Pre-parsed dicts for unparse benchmarks.
+SMALL_DATA = _ref_parse(SMALL_XML)
+MEDIUM_DATA = _ref_parse(MEDIUM_XML)
+LARGE_DATA = _ref_parse(LARGE_XML)
+
+
+# ── Helpers ──
+
+
+def _measure_peak_kb(fn, *args, **kwargs) -> float:
+    """Run ``fn(*args, **kwargs)`` under tracemalloc and return peak KB.
+
+    Tracing is started immediately before the call and stopped afterwards,
+    even if *fn* raises.  The return value of *fn* is discarded.
+
+    Returns:
+        Peak traced size reported by ``tracemalloc.get_traced_memory()``,
+        divided by 1024.
+    """
+    tracemalloc.start()
+    try:
+        fn(*args, **kwargs)
+        _, peak = tracemalloc.get_traced_memory()
+    finally:
+        tracemalloc.stop()
+    return peak / 1024
+
+
+_DOC_SIZES = [
+    pytest.param("small", SMALL_XML, id="small"),
+    pytest.param("medium", MEDIUM_XML, id="medium"),
+    pytest.param("large", LARGE_XML, id="large"),
+]
+
+_DATA_SIZES = [
+    pytest.param("small", SMALL_DATA, id="small"),
+    pytest.param("medium", MEDIUM_DATA, id="medium"),
+    pytest.param("large", LARGE_DATA, id="large"),
+]
+
+
+# ── Parse memory tests ──
+
+
+@pytest.mark.parametrize("label,doc", _DOC_SIZES)
+def test_parse_memory_zerodep(label: str, doc: str) -> None:
+    """Measure peak memory for zerodep xml.parse."""
+    peak_kb = _measure_peak_kb(zd_parse, doc)
+    print(f"\n[xml parse zerodep {label:6s}] peak memory: {peak_kb:.1f} KB")
+    assert peak_kb >= 0
+
+
+@pytest.mark.parametrize("label,doc", _DOC_SIZES)
+def test_parse_memory_xmltodict(label: str, doc: str) -> None:
+    """Measure peak memory for xmltodict.parse."""
+    peak_kb = _measure_peak_kb(_ref_parse, doc)
+    print(f"\n[xml parse xmltodict {label:6s}] peak memory: {peak_kb:.1f} KB")
+    assert peak_kb >= 0
+
+
+@pytest.mark.parametrize("label,doc", _DOC_SIZES)
+def test_parse_memory_comparison(label: str, doc: str) -> None:
+    """Compare zerodep vs xmltodict peak memory for parse."""
+    zd_kb = _measure_peak_kb(zd_parse, doc)
+    ref_kb = _measure_peak_kb(_ref_parse, doc)
+    # Report "inf" instead of dividing by zero when the reference peak is 0.
+    ratio = zd_kb / ref_kb if ref_kb > 0 else float("inf")
+    print(
+        f"\n[xml parse compare {label:6s}] zerodep={zd_kb:.1f} KB "
+        f"xmltodict={ref_kb:.1f} KB ratio={ratio:.2f}x"
+    )
+    assert zd_kb >= 0
+    assert ref_kb >= 0
+
+
+# ── Unparse memory tests ──
+
+
+@pytest.mark.parametrize("label,data", _DATA_SIZES)
+def test_unparse_memory_zerodep(label: str, data) -> None:
+    """Measure peak memory for zerodep xml.unparse."""
+    peak_kb = _measure_peak_kb(zd_unparse, data, full_document=False)
+    print(f"\n[xml unparse zerodep {label:6s}] peak memory: {peak_kb:.1f} KB")
+    assert peak_kb >= 0
+
+
+@pytest.mark.parametrize("label,data", _DATA_SIZES)
+def test_unparse_memory_xmltodict(label: str, data) -> None:
+    """Measure peak memory for xmltodict.unparse."""
+    peak_kb = _measure_peak_kb(_ref_unparse, data, full_document=False)
+    print(f"\n[xml unparse xmltodict {label:6s}] peak memory: {peak_kb:.1f} KB")
+    assert peak_kb >= 0
+
+
+@pytest.mark.parametrize("label,data", _DATA_SIZES)
+def test_unparse_memory_comparison(label: str, data) -> None:
+    """Compare zerodep vs xmltodict peak memory for unparse."""
+    zd_kb = _measure_peak_kb(zd_unparse, data, full_document=False)
+    ref_kb = _measure_peak_kb(_ref_unparse, data, full_document=False)
+    # Report "inf" instead of dividing by zero when the reference peak is 0.
+    ratio = zd_kb / ref_kb if ref_kb > 0 else float("inf")
+    print(
+        f"\n[xml unparse compare {label:6s}] zerodep={zd_kb:.1f} KB "
+        f"xmltodict={ref_kb:.1f} KB ratio={ratio:.2f}x"
+    )
+    assert zd_kb >= 0
+    assert ref_kb >= 0
diff --git a/yaml/test_yaml_memory.py b/yaml/test_yaml_memory.py
new file mode 100644
index 0000000..c897422
--- /dev/null
+++ b/yaml/test_yaml_memory.py
@@ -0,0 +1,223 @@
+"""Memory benchmarks: zerodep yaml vs PyYAML.
+
+Uses tracemalloc to measure peak heap allocation for load (parse) and
+dump (serialize) at three input sizes (S/M/L). Results are printed in
+KB so they are visible in plain ``pytest -s`` output. No
+pytest-benchmark required.
+
+Note: our ``yaml.py`` shadows PyYAML on sys.path, so PyYAML must be
+imported with path manipulation (same technique as the time benchmark).
+"""
+
+import os
+import sys
+import tracemalloc
+
+import pytest
+
+# ── Import PyYAML without being shadowed by our yaml.py ──
+
+_this_dir = os.path.dirname(__file__)
+
+# Temporarily hide this directory and its parent from sys.path, and set
+# aside any already-imported ``yaml`` module, so that the ``import yaml``
+# below cannot resolve to the local yaml.py.
+_saved_path = sys.path[:]
+sys.path = [
+    p
+    for p in sys.path
+    if os.path.abspath(p)
+    not in (
+        os.path.abspath(_this_dir),
+        os.path.abspath(os.path.join(_this_dir, "..")),
+    )
+]
+_cached_yaml = sys.modules.pop("yaml", None)
+
+try:
+    import yaml as _pyyaml
+
+    if not hasattr(_pyyaml, "safe_load"):
+        raise ImportError("Not the real PyYAML")
+    _pyyaml_safe_load = _pyyaml.safe_load
+    _pyyaml_dump = _pyyaml.dump
+except ImportError:
+    pytest.skip("PyYAML not installed", allow_module_level=True)
+finally:
+    # Runs on success and on skip: restore sys.path and whatever ``yaml``
+    # entry sys.modules held before the import attempt.
+    sys.path = _saved_path
+    sys.modules.pop("yaml", None)
+    if _cached_yaml is not None:
+        sys.modules["yaml"] = _cached_yaml
+
+sys.path.insert(0, _this_dir)
+
+from yaml import dump as zd_dump  # noqa: E402
+from yaml import load as zd_load  # noqa: E402
+
+# ── Test data (same as time benchmark) ──
+
+SMALL = "name: Alice\nage: 30\nactive: true\ncity: NYC\nscore: 9.5"
+
+# NOTE(review): the indentation inside MEDIUM was reconstructed (two spaces
+# per nesting level) — confirm against the time benchmark.
+MEDIUM = """
+database:
+  host: localhost
+  port: 5432
+  name: mydb
+  credentials:
+    user: admin
+    password: secret
+  options:
+    pool_size: 10
+    timeout: 30
+    ssl: true
+
+servers:
+  - name: web-1
+    ip: 10.0.0.1
+    roles: [web, api]
+  - name: web-2
+    ip: 10.0.0.2
+    roles: [web]
+  - name: db-1
+    ip: 10.0.0.3
+    roles: [database, backup]
+
+features:
+  auth: true
+  cache: true
+  logging: true
+  debug: false
+
+limits:
+  max_connections: 1000
+  max_request_size: 10485760
+  rate_limit: 100
+""".strip()
+
+# One mapping entry per item, joined into a single 100-item document.
+LARGE = "\n".join(
+    f"item_{i}:\n"
+    f" id: {i}\n"
+    f" name: 'Item {i}'\n"
+    f" value: {i * 1.5}\n"
+    f" active: {'true' if i % 2 == 0 else 'false'}\n"
+    f" tags: [tag_a, tag_b, tag_{i}]"
+    for i in range(100)
+)
+
+# Pre-parsed data for the dump benchmarks.
+SMALL_DATA = {"name": "Alice", "age": 30, "active": True, "city": "NYC", "score": 9.5}
+MEDIUM_DATA = _pyyaml_safe_load(MEDIUM)
+LARGE_DATA = _pyyaml_safe_load(LARGE)
+
+
+# ── Helpers ──
+
+
+def _measure_peak_kb(fn, *args, **kwargs) -> float:
+    """Run ``fn(*args, **kwargs)`` under tracemalloc and return peak KB.
+
+    Tracing is started immediately before the call and stopped afterwards,
+    even if *fn* raises. The return value of *fn* is discarded.
+
+    Returns:
+        Peak traced size reported by ``tracemalloc.get_traced_memory()``,
+        divided by 1024.
+    """
+    tracemalloc.start()
+    try:
+        fn(*args, **kwargs)
+        _, peak = tracemalloc.get_traced_memory()
+    finally:
+        tracemalloc.stop()
+    return peak / 1024
+
+
+_DOC_SIZES = [
+    pytest.param("small", SMALL, id="small"),
+    pytest.param("medium", MEDIUM, id="medium"),
+    pytest.param("large", LARGE, id="large"),
+]
+
+_DATA_SIZES = [
+    pytest.param("small", SMALL_DATA, id="small"),
+    pytest.param("medium", MEDIUM_DATA, id="medium"),
+    pytest.param("large", LARGE_DATA, id="large"),
+]
+
+
+# ── Load (parse) memory tests ──
+
+
+@pytest.mark.parametrize("label,doc", _DOC_SIZES)
+def test_load_memory_zerodep(label: str, doc: str) -> None:
+    """Measure peak memory for zerodep yaml.load."""
+    peak_kb = _measure_peak_kb(zd_load, doc)
+    print(f"\n[yaml load zerodep {label:6s}] peak memory: {peak_kb:.1f} KB")
+    assert peak_kb >= 0
+
+
+@pytest.mark.parametrize("label,doc", _DOC_SIZES)
+def test_load_memory_pyyaml(label: str, doc: str) -> None:
+    """Measure peak memory for PyYAML safe_load."""
+    peak_kb = _measure_peak_kb(_pyyaml_safe_load, doc)
+    print(f"\n[yaml load pyyaml {label:6s}] peak memory: {peak_kb:.1f} KB")
+    assert peak_kb >= 0
+
+
+@pytest.mark.parametrize("label,doc", _DOC_SIZES)
+def test_load_memory_comparison(label: str, doc: str) -> None:
+    """Compare zerodep vs PyYAML peak memory for load."""
+    zd_kb = _measure_peak_kb(zd_load, doc)
+    ref_kb = _measure_peak_kb(_pyyaml_safe_load, doc)
+    # Report "inf" instead of dividing by zero when the reference peak is 0.
+    ratio = zd_kb / ref_kb if ref_kb > 0 else float("inf")
+    print(
+        f"\n[yaml load compare {label:6s}] zerodep={zd_kb:.1f} KB "
+        f"pyyaml={ref_kb:.1f} KB ratio={ratio:.2f}x"
+    )
+    assert zd_kb >= 0
+    assert ref_kb >= 0
+
+
+# ── Dump (serialize) memory tests ──
+
+
+@pytest.mark.parametrize("label,data", _DATA_SIZES)
+def test_dump_memory_zerodep(label: str, data) -> None:
+    """Measure peak memory for zerodep yaml.dump."""
+    peak_kb = _measure_peak_kb(zd_dump, data)
+    print(f"\n[yaml dump zerodep {label:6s}] peak memory: {peak_kb:.1f} KB")
+    assert peak_kb >= 0
+
+
+def _pyyaml_dump_block(data) -> str:
+    """Wrap PyYAML dump with default_flow_style=False."""
+    return _pyyaml_dump(data, default_flow_style=False)
+
+
+@pytest.mark.parametrize("label,data", _DATA_SIZES)
+def test_dump_memory_pyyaml(label: str, data) -> None:
+    """Measure peak memory for PyYAML dump."""
+    peak_kb = _measure_peak_kb(_pyyaml_dump_block, data)
+    print(f"\n[yaml dump pyyaml {label:6s}] peak memory: {peak_kb:.1f} KB")
+    assert peak_kb >= 0
+
+
+@pytest.mark.parametrize("label,data", _DATA_SIZES)
+def test_dump_memory_comparison(label: str, data) -> None:
+    """Compare zerodep vs PyYAML peak memory for dump."""
+    zd_kb = _measure_peak_kb(zd_dump, data)
+    ref_kb = _measure_peak_kb(_pyyaml_dump_block, data)
+    # Report "inf" instead of dividing by zero when the reference peak is 0.
+    ratio = zd_kb / ref_kb if ref_kb > 0 else float("inf")
+    print(
+        f"\n[yaml dump compare {label:6s}] zerodep={zd_kb:.1f} KB "
+        f"pyyaml={ref_kb:.1f} KB ratio={ratio:.2f}x"
+    )
+    assert zd_kb >= 0
+    assert ref_kb >= 0