From e8dcc1964578def7a1b3173179e8ff1be53a88c1 Mon Sep 17 00:00:00 2001
From: git-hyagi <45576767+git-hyagi@users.noreply.github.com>
Date: Tue, 24 Mar 2026 09:21:57 -0300
Subject: [PATCH] Add a stored normalized name column with an index

Add a name_normalized field to PythonPackageContent that stores the
canonicalized package name, populated via a BEFORE_SAVE hook and backfilled
for existing rows by migration 0020. The column is indexed (db_index=True)
and replaces the NormalizeName transform, which evaluated
LOWER(REGEXP_REPLACE(name, ...)) on every query. Change all
name__normalize= lookups to use name_normalized= (an exact match) and
normalize the input of the name and name__in REST filters. This eliminates
the regex computation at query time.

closes: #1159
Assisted By: claude-opus-4.6
---
 CHANGES/1159.feature                          |  2 ++
 ...20_pythonpackagecontent_name_normalized.py | 35 +++++++++++++++++++
 pulp_python/app/models.py                     | 19 +++++-----
 pulp_python/app/pypi/views.py                 | 13 ++++---
 pulp_python/app/tasks/publish.py              |  6 ++--
 pulp_python/app/viewsets.py                   | 24 ++++++++++++-
 6 files changed, 81 insertions(+), 18 deletions(-)
 create mode 100644 CHANGES/1159.feature
 create mode 100644 pulp_python/app/migrations/0020_pythonpackagecontent_name_normalized.py

diff --git a/CHANGES/1159.feature b/CHANGES/1159.feature
new file mode 100644
index 000000000..82940a2a8
--- /dev/null
+++ b/CHANGES/1159.feature
@@ -0,0 +1,2 @@
+Added the name_normalized field to PythonPackageContent model with a database index to replace
+runtime regex normalization, reducing database load for package name lookups.
diff --git a/pulp_python/app/migrations/0020_pythonpackagecontent_name_normalized.py b/pulp_python/app/migrations/0020_pythonpackagecontent_name_normalized.py
new file mode 100644
index 000000000..eef8cddd8
--- /dev/null
+++ b/pulp_python/app/migrations/0020_pythonpackagecontent_name_normalized.py
@@ -0,0 +1,53 @@
+import re
+
+from django.db import migrations, models, transaction
+
+# Runs of "-", "_" and "." collapse into a single "-" before lower-casing.
+SEPARATOR_RUN = re.compile(r"[-_.]+")
+# Rows buffered in memory before they are written back.
+FLUSH_THRESHOLD = 100000
+# Rows per UPDATE statement; bulk_update() emits one CASE/WHEN arm per row, so an
+# unbatched call would put every buffered row into a single statement.
+UPDATE_BATCH_SIZE = 1000
+
+
+def save_normalized_names(model, packages):
+    """Write the buffered name_normalized values back in bounded UPDATE statements."""
+    with transaction.atomic():
+        model.objects.bulk_update(packages, ["name_normalized"], batch_size=UPDATE_BATCH_SIZE)
+
+
+def populate_name_normalized(apps, schema_editor):
+    """
+    Populate name_normalized for existing PythonPackageContent rows.
+
+    Args:
+        apps: Historical app registry handed in by RunPython.
+        schema_editor: Schema editor handed in by RunPython (unused).
+    """
+    PythonPackageContent = apps.get_model("python", "PythonPackageContent")
+    package_bulk = []
+    for package in PythonPackageContent.objects.only("pk", "name").iterator():
+        package.name_normalized = SEPARATOR_RUN.sub("-", package.name).lower()
+        package_bulk.append(package)
+        if len(package_bulk) >= FLUSH_THRESHOLD:
+            save_normalized_names(PythonPackageContent, package_bulk)
+            package_bulk = []
+    if package_bulk:
+        save_normalized_names(PythonPackageContent, package_bulk)
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("python", "0019_create_missing_metadata_artifacts"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="pythonpackagecontent",
+            name="name_normalized",
+            field=models.TextField(db_index=True, default=""),
+        ),
+        migrations.RunPython(populate_name_normalized, migrations.RunPython.noop, elidable=True),
+    ]
diff --git a/pulp_python/app/models.py b/pulp_python/app/models.py
index b7a5e5483..234628912 100644
--- a/pulp_python/app/models.py
+++ b/pulp_python/app/models.py
@@ -115,7 +115,7 @@ def content_handler(self, path):
         if name:
             normalized = canonicalize_name(name)
             package_content = PythonPackageContent.objects.filter(
-                pk__in=self.publication.repository_version.content, name__normalize=normalized
+                pk__in=self.publication.repository_version.content, name_normalized=normalized
             )
             # TODO Change this value to the Repo's serial value when 
implemented headers = {PYPI_LAST_SERIAL: str(PYPI_SERIAL_CONSTANT)} @@ -136,14 +136,6 @@ class Meta: ] -class NormalizeName(models.Transform): - """A transform field to normalize package names according to PEP426.""" - - function = "REGEXP_REPLACE" - template = "LOWER(%(function)s(%(expressions)s, '(\.|_|-)', '-', 'ig'))" # noqa:W605 - lookup_name = "normalize" - - class PythonPackageContent(Content): """ A Content Type representing Python's Distribution Package. @@ -195,6 +187,9 @@ class PythonPackageContent(Content): license_expression = models.TextField() license_file = models.JSONField(default=list) + # Stored normalized name for indexed lookups + name_normalized = models.TextField(db_index=True, default="") + # Release metadata filename = models.TextField(db_index=True) packagetype = models.TextField(choices=PACKAGE_TYPES) @@ -208,9 +203,13 @@ class PythonPackageContent(Content): PROTECTED_FROM_RECLAIM = False TYPE = "python" _pulp_domain = models.ForeignKey("core.Domain", default=get_domain_pk, on_delete=models.PROTECT) - name.register_lookup(NormalizeName) repo_key_fields = ("filename",) + @hook(BEFORE_SAVE) + def set_name_normalized(self): + """Pre-compute the normalized package name for indexed lookups.""" + self.name_normalized = canonicalize_name(self.name) + @staticmethod def init_from_artifact_and_relative_path(artifact, relative_path): """Used when downloading package from pull-through cache.""" diff --git a/pulp_python/app/pypi/views.py b/pulp_python/app/pypi/views.py index b7808a9ec..ac8d10e3b 100644 --- a/pulp_python/app/pypi/views.py +++ b/pulp_python/app/pypi/views.py @@ -303,7 +303,12 @@ def list(self, request, path): repo_version, content = self.get_rvc() if self.should_redirect(repo_version=repo_version): return redirect(urljoin(self.base_content_url, f"{path}/simple/")) - names = content.order_by("name").values_list("name", flat=True).distinct().iterator() + names = ( + content.order_by("name_normalized") + .values_list("name", flat=True) + 
.distinct("name_normalized") + .iterator() + ) media_type = request.accepted_renderer.media_type headers = {"X-PyPI-Last-Serial": str(PYPI_SERIAL_CONSTANT)} @@ -361,7 +366,7 @@ def retrieve(self, request, path, package): elif self.should_redirect(repo_version=repo_ver): return redirect(urljoin(self.base_content_url, f"{path}/simple/{normalized}/")) if content: - local_packages = content.filter(name__normalize=normalized) + local_packages = content.filter(name_normalized=normalized) packages = local_packages.values( "filename", "sha256", @@ -454,7 +459,7 @@ def retrieve(self, request, path, meta): name = meta_path.parts[0] if name: normalized = canonicalize_name(name) - package_content = content.filter(name__normalize=normalized) + package_content = content.filter(name_normalized=normalized) # TODO Change this value to the Repo's serial value when implemented headers = {PYPI_LAST_SERIAL: str(PYPI_SERIAL_CONSTANT)} if settings.DOMAIN_ENABLED: @@ -541,7 +546,7 @@ def retrieve(self, request, path, package, version, filename): repo_ver, content = self.get_rvc() if content: package_content = content.filter( - name__normalize=package, version=version, filename=filename + name_normalized=package, version=version, filename=filename ).first() if package_content: provenance = self.get_provenances(repo_ver).filter(package=package_content).first() diff --git a/pulp_python/app/tasks/publish.py b/pulp_python/app/tasks/publish.py index 4e8a80388..4604a60f9 100644 --- a/pulp_python/app/tasks/publish.py +++ b/pulp_python/app/tasks/publish.py @@ -60,9 +60,9 @@ def write_simple_api(publication): python_models.PythonPackageContent.objects.filter( pk__in=publication.repository_version.content, _pulp_domain=domain ) - .order_by("name__normalize") + .order_by("name_normalized") .values_list("name", flat=True) - .distinct("name__normalize") + .distinct("name_normalized") ) # write the root index, which lists all of the projects for which there is a package available @@ -81,7 +81,7 @@ def 
write_simple_api(publication):
     packages = python_models.PythonPackageContent.objects.filter(
         pk__in=publication.repository_version.content, _pulp_domain=domain
     )
-    releases = packages.order_by("name__normalize").values("name", "filename", "sha256")
+    releases = packages.order_by("name_normalized").values("name", "filename", "sha256")
 
     ind = 0
     current_name = canonicalize_name(project_names[ind])
diff --git a/pulp_python/app/viewsets.py b/pulp_python/app/viewsets.py
index 9aa19fd83..c99d1c2fd 100644
--- a/pulp_python/app/viewsets.py
+++ b/pulp_python/app/viewsets.py
@@ -1,6 +1,9 @@
 from bandersnatch.configuration import BandersnatchConfig
 from django.db import transaction
+from django_filters import CharFilter
+from django_filters.rest_framework import filters as drf_filters
 from drf_spectacular.utils import extend_schema, extend_schema_view
+from packaging.utils import canonicalize_name
 from pathlib import Path
 from rest_framework import status
 from rest_framework.decorators import action
@@ -329,15 +332,36 @@ class PythonDistributionViewSet(core_viewsets.DistributionViewSet, core_viewsets
     }
 
 
+class NormalizedNameFilter(CharFilter):
+    """CharFilter that canonicalizes its input before filtering on name_normalized."""
+
+    def filter(self, qs, value):
+        """Canonicalize a single name or a list of names, then apply the parent filter."""
+        if not value:
+            return super().filter(qs, value)
+        if isinstance(value, list):
+            normalized = [canonicalize_name(item) for item in value]
+        else:
+            normalized = canonicalize_name(value)
+        return super().filter(qs, normalized)
+
+
+class NormalizedNameInFilter(drf_filters.BaseInFilter, NormalizedNameFilter):
+    """In-filter variant that canonicalizes every listed name before filtering."""
+
+
 class PythonPackageContentFilter(core_viewsets.ContentFilter):
     """
     FilterSet for PythonPackageContent.
""" + name = NormalizedNameFilter(field_name="name_normalized", lookup_expr="exact") + name__in = NormalizedNameInFilter(field_name="name_normalized", lookup_expr="in") + name__contains = CharFilter(field_name="name", lookup_expr="contains") + class Meta: model = python_models.PythonPackageContent fields = { - "name": ["exact", "in", "contains"], "author": ["exact", "in", "contains"], "packagetype": ["exact", "in"], "requires_python": ["exact", "in", "contains"],