def _load_reference_multipart(search_path=None):
    """Load python-multipart's ``multipart/multipart.py`` under an alias.

    The local module under test is also called ``multipart``, so the
    reference implementation is located by walking the import path and
    executing ``<site-packages>/multipart/multipart.py`` as a module named
    ``multipart_reference``.

    Each candidate is tried independently: a file that fails to load no
    longer aborts the search, and a partially executed module is never
    returned.

    Args:
        search_path: Directories to scan.  Defaults to ``sys.path``.  Only
            entries whose text contains ``"site-packages"`` are considered.

    Returns:
        ``(module, None)`` for the first candidate that loads, otherwise
        ``(None, last_error)`` where ``last_error`` is the exception raised
        by the last failing candidate (``None`` if no candidate was found).
    """
    entries = sys.path if search_path is None else search_path
    last_error = None
    for entry in entries:
        if "site-packages" not in entry:
            continue
        candidate = os.path.join(entry, "multipart", "multipart.py")
        if not os.path.isfile(candidate):
            continue
        try:
            spec = importlib.util.spec_from_file_location(
                "multipart_reference", candidate
            )
            if spec is None or spec.loader is None:
                continue
            module = importlib.util.module_from_spec(spec)
            # NOTE(review): the file runs as a top-level module without a
            # parent package, so relative imports inside it would fail --
            # confirm the installed python-multipart loads this way.
            spec.loader.exec_module(module)
        except Exception as exc:
            # Best-effort: the reference library is optional.  Remember why
            # this candidate failed and keep looking instead of giving up.
            last_error = exc
            continue
        return module, None
    return None, last_error


# _REF_LOAD_ERROR keeps the reason a candidate failed so a silently skipped
# reference benchmark can be diagnosed from a debugger or a quick print.
_ref_mod, _REF_LOAD_ERROR = _load_reference_multipart()
_HAS_REF = _ref_mod is not None
def _make_medium_body() -> tuple[bytes, str]:
    """Encode the medium benchmark payload with ``encode_multipart``.

    The payload holds five text fields (``field_0`` .. ``field_4``), each
    containing its ``value_<i> `` marker repeated 20 times, plus two files:
    2048 bytes of ``x`` sent as ``text/plain`` and 4096 bytes of alternating
    0x00/0xFF sent as ``application/octet-stream``.  The boundary is fixed so
    that repeated runs produce the same bytes.

    Returns:
        The result of ``encode_multipart``; the module unpacks it as
        ``(body, content_type)``.
    """
    text_fields = {}
    for index in range(5):
        text_fields[f"field_{index}"] = f"value_{index} " * 20

    report_file = ("report.txt", b"x" * 2048, "text/plain")
    binary_file = ("data.bin", b"\x00\xff" * 2048, "application/octet-stream")
    uploads = {"file1": report_file, "file2": binary_file}

    return encode_multipart(
        fields=text_fields,
        files=uploads,
        boundary="benchboundary",
    )
def _measure_peak_kb(fn, *args, **kwargs) -> float:
    """Run ``fn(*args, **kwargs)`` under tracemalloc and return its peak in KB.

    The reported figure is the peak traced size reached while *fn* runs,
    minus the traced size at entry, divided by 1024.  The return value of
    *fn* is discarded.

    If tracemalloc is already tracing when this helper is called (for
    example ``python -X tracemalloc`` or an enclosing measurement), the
    existing session is reused: its earlier allocations are excluded from
    the result and tracing is left running on exit.  Otherwise tracing is
    started here and stopped again, even when *fn* raises.

    Args:
        fn: Callable to measure.
        *args: Positional arguments forwarded to *fn*.
        **kwargs: Keyword arguments forwarded to *fn*.

    Returns:
        Peak additional traced memory in KB (bytes / 1024), never negative.

    Raises:
        Whatever *fn* raises; the exception propagates unchanged.
    """
    started_here = not tracemalloc.is_tracing()
    if started_here:
        tracemalloc.start()
    try:
        # Forget any earlier high-water mark so only this call is measured.
        tracemalloc.reset_peak()
        baseline, _ = tracemalloc.get_traced_memory()
        fn(*args, **kwargs)
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Only tear down a session this helper created.
        if started_here:
            tracemalloc.stop()
    return max(peak - baseline, 0) / 1024
# Files for the large encode case.  Built the same way as the files inside
# _make_large_body so the encode benchmark covers the same three tiers as
# the parse benchmark (the module previously defined LARGE_FIELDS but never
# used it).
LARGE_FILES = {
    f"file_{i}": (
        f"upload_{i}.bin",
        bytes(range(256)) * 313,  # deterministic content, no os.urandom
        "application/octet-stream",
    )
    for i in range(5)
}


@pytest.mark.parametrize(
    "label,fields,files",
    [
        pytest.param("small", SMALL_FIELDS, None, id="small"),
        pytest.param("medium", MEDIUM_FIELDS, MEDIUM_FILES, id="medium"),
        pytest.param("large", LARGE_FIELDS, LARGE_FILES, id="large"),
    ],
)
def test_encode_memory_zerodep(label: str, fields: dict, files) -> None:
    """Measure peak memory for zerodep ``encode_multipart`` at one size tier.

    Args:
        label: Tier name, used only in the printed report line.
        fields: Text fields passed to ``encode_multipart`` as ``fields``.
        files: Files passed as ``files``; ``None`` means the keyword is
            omitted entirely rather than passed as ``None``.

    The result is printed (visible with ``pytest -s``).  The assertion only
    checks that a measurement was produced; it is not a regression threshold.
    """
    kwargs = {"fields": fields, "boundary": "benchboundary"}
    if files is not None:
        kwargs["files"] = files
    peak_kb = _measure_peak_kb(encode_multipart, **kwargs)
    print(f"\n[multipart encode zerodep {label:6s}] peak memory: {peak_kb:.1f} KB")
    assert peak_kb >= 0
def _build_google_messages():
    """Build google-protobuf message classes for the reference benchmarks.

    A ``bench_mem.proto`` file descriptor (package ``benchmem``, proto3) is
    assembled in memory and registered in a private ``DescriptorPool``:

    * ``SmallMessage``: name(1)=string, value(2)=int32, flag(3)=bool
    * ``LargeMessage``: id(1)=uint64, name(2)=string,
      scores(3)=repeated double, active(4)=bool

    NOTE(review): this ``LargeMessage`` has no ``items``/``metadata`` fields
    and uses different field numbers than the zerodep ``LargeMessage`` in
    this module, so the two "large" figures are not like-for-like.

    Returns:
        dict mapping ``"SmallMessage"`` and ``"LargeMessage"`` to the
        generated message classes.

    Raises:
        Any error raised by google-protobuf while building the descriptors;
        the module-level caller treats that as "reference unavailable".
    """
    from google.protobuf import message_factory

    fdp = _dpb2.FieldDescriptorProto
    pool = _pool.DescriptorPool()

    file_dp = _dpb2.FileDescriptorProto(
        name="bench_mem.proto",
        package="benchmem",
        syntax="proto3",
    )

    # SmallMessage: name(1)=string, value(2)=int32, flag(3)=bool
    sm = file_dp.message_type.add()
    sm.name = "SmallMessage"
    for fname, fnum, ftype in [
        ("name", 1, fdp.TYPE_STRING),
        ("value", 2, fdp.TYPE_INT32),
        ("flag", 3, fdp.TYPE_BOOL),
    ]:
        f = sm.field.add()
        f.name, f.number, f.type = fname, fnum, ftype
        f.label = fdp.LABEL_OPTIONAL

    # LargeMessage: id(1)=uint64, name(2)=string, scores(3)=repeated double,
    # active(4)=bool
    lm = file_dp.message_type.add()
    lm.name = "LargeMessage"
    for fname, fnum, ftype, flabel in [
        ("id", 1, fdp.TYPE_UINT64, fdp.LABEL_OPTIONAL),
        ("name", 2, fdp.TYPE_STRING, fdp.LABEL_OPTIONAL),
        ("scores", 3, fdp.TYPE_DOUBLE, fdp.LABEL_REPEATED),
        ("active", 4, fdp.TYPE_BOOL, fdp.LABEL_OPTIONAL),
    ]:
        f = lm.field.add()
        f.name, f.number, f.type, f.label = fname, fnum, ftype, flabel

    # Do not rely on what Add() returns (presumably backend-dependent --
    # verify); look the registered file up by name instead.
    pool.Add(file_dp)
    fd = pool.FindFileByName(file_dp.name)

    # Prefer the module-level GetMessageClass helper when the installed
    # protobuf provides it; fall back to the older MessageFactory.GetPrototype
    # (deprecated and, in recent releases, removed) otherwise.
    get_message_class = getattr(message_factory, "GetMessageClass", None)
    if get_message_class is None:
        get_message_class = message_factory.MessageFactory(pool=pool).GetPrototype

    return {
        msg_name: get_message_class(fd.message_types_by_name[msg_name])
        for msg_name in ("SmallMessage", "LargeMessage")
    }
@pytest.mark.parametrize("label,obj", _ENC_SIZES)
def test_encode_memory_zerodep(label: str, obj) -> None:
    """Report the peak traced memory of ``obj.serialize()`` for one tier.

    Args:
        label: Tier name, used only in the printed report line.
        obj: Pre-built zerodep message whose ``serialize`` method is measured.

    The figure is printed (visible with ``pytest -s``); the assertion merely
    confirms that a measurement was produced.
    """
    peak_kb = _measure_peak_kb(obj.serialize)
    report = f"\n[protobuf encode zerodep {label:6s}] peak memory: {peak_kb:.1f} KB"
    print(report)
    assert peak_kb >= 0
@pytest.mark.parametrize("label,obj,cls", _RT_SIZES)
def test_roundtrip_memory_zerodep(label: str, obj, cls) -> None:
    """Report the peak traced memory of serialize-then-parse for one tier.

    Args:
        label: Tier name, used only in the printed report line.
        obj: Pre-built zerodep message to serialize.
        cls: Message class whose ``parse`` decodes the serialized bytes.

    Both steps run inside a single measurement, so the peak covers the
    encoded bytes and the decoded object together.
    """
    peak_kb = _measure_peak_kb(lambda: cls.parse(obj.serialize()))
    report = (
        f"\n[protobuf roundtrip zerodep {label:6s}] peak memory: {peak_kb:.1f} KB"
    )
    print(report)
    assert peak_kb >= 0
def _load_reference_document_class():
    """Import the installed ``readability`` package and return its ``Document``.

    The local module under test is also named ``readability``.  To reach the
    installed one, every ``readability`` / ``readability.*`` entry is taken
    out of ``sys.modules`` and this file's directory is dropped from
    ``sys.path`` for the duration of the import.

    Whether or not the import succeeds, ``sys.path`` is put back, any
    ``readability`` modules imported here are removed from ``sys.modules``
    and the previously loaded entries are restored.

    Returns:
        The ``Document`` attribute of the imported ``readability`` module.

    Raises:
        Whatever ``importlib.import_module("readability")`` raises, or
        ``AttributeError`` if the imported module has no ``Document``.
    """

    def _is_readability(mod_name):
        return mod_name == "readability" or mod_name.startswith("readability.")

    original_path = sys.path[:]
    stashed = {}
    for mod_name in list(sys.modules):
        if _is_readability(mod_name):
            stashed[mod_name] = sys.modules.pop(mod_name)

    try:
        local_dir = os.path.abspath(os.path.dirname(__file__))
        sys.path = [
            entry for entry in sys.path if os.path.abspath(entry) != local_dir
        ]
        reference = importlib.import_module("readability")
        return reference.Document
    finally:
        sys.path = original_path
        # Drop whatever the import above registered before restoring the
        # entries that were loaded when this function was called.
        for mod_name in list(sys.modules):
            if _is_readability(mod_name):
                del sys.modules[mod_name]
        sys.modules.update(stashed)
def _measure_peak_kb(fn, *args, **kwargs) -> float:
    """Run ``fn(*args, **kwargs)`` under tracemalloc and return its peak in KB.

    The reported figure is the peak traced size reached while *fn* runs,
    minus the traced size at entry, divided by 1024.  The return value of
    *fn* is discarded.

    If tracemalloc is already tracing when this helper is called, the
    existing session is reused: its earlier allocations are excluded from
    the result and tracing is left running on exit.  Otherwise tracing is
    started here and stopped again, even when *fn* raises.

    NOTE(review): tracemalloc only reports memory allocated through Python's
    allocators; memory a C extension obtains natively is presumably not
    counted -- keep that in mind when comparing against lxml-based code.

    Args:
        fn: Callable to measure.
        *args: Positional arguments forwarded to *fn*.
        **kwargs: Keyword arguments forwarded to *fn*.

    Returns:
        Peak additional traced memory in KB (bytes / 1024), never negative.

    Raises:
        Whatever *fn* raises; the exception propagates unchanged.
    """
    started_here = not tracemalloc.is_tracing()
    if started_here:
        tracemalloc.start()
    try:
        # Forget any earlier high-water mark so only this call is measured.
        tracemalloc.reset_peak()
        baseline, _ = tracemalloc.get_traced_memory()
        fn(*args, **kwargs)
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Only tear down a session this helper created.
        if started_here:
            tracemalloc.stop()
    return max(peak - baseline, 0) / 1024
@pytest.mark.skipif(not _HAS_REFERENCE, reason="readability-lxml not installed")
@pytest.mark.skipif(_SMALL is None, reason="no small fixture available")
def test_extract_memory_comparison_small() -> None:
    """Print zerodep vs readability-lxml peak memory for the small fixture.

    The zerodep extractor is measured first, then the reference, each in its
    own measurement.  The printed ratio is zerodep / reference, or ``inf``
    when the reference figure is not positive.  The assertions only confirm
    that both measurements were produced.
    """
    page = _get_fixture(_SMALL)
    zd_kb = _measure_peak_kb(_zd_extract, page)
    ref_kb = _measure_peak_kb(_ref_extract, page)

    if ref_kb > 0:
        ratio = zd_kb / ref_kb
    else:
        ratio = float("inf")

    report = (
        f"\n[readability compare small ] zerodep={zd_kb:.1f} KB "
        f"lxml={ref_kb:.1f} KB ratio={ratio:.2f}x"
    )
    print(report)
    assert zd_kb >= 0
    assert ref_kb >= 0
@pytest.mark.skipif(not _HAS_REFERENCE, reason="readability-lxml not installed")
@pytest.mark.skipif(_LARGE is None, reason="no large fixture available")
def test_extract_memory_comparison_large() -> None:
    """Print zerodep vs readability-lxml peak memory for the large fixture.

    The zerodep extractor is measured first, then the reference, each in its
    own measurement.  The printed ratio is zerodep / reference, or ``inf``
    when the reference figure is not positive.  The assertions only confirm
    that both measurements were produced.
    """
    page = _get_fixture(_LARGE)
    zd_kb = _measure_peak_kb(_zd_extract, page)
    ref_kb = _measure_peak_kb(_ref_extract, page)

    if ref_kb > 0:
        ratio = zd_kb / ref_kb
    else:
        ratio = float("inf")

    report = (
        f"\n[readability compare large ] zerodep={zd_kb:.1f} KB "
        f"lxml={ref_kb:.1f} KB ratio={ratio:.2f}x"
    )
    print(report)
    assert zd_kb >= 0
    assert ref_kb >= 0