From e8dcc1964578def7a1b3173179e8ff1be53a88c1 Mon Sep 17 00:00:00 2001
From: git-hyagi <45576767+git-hyagi@users.noreply.github.com>
Date: Tue, 24 Mar 2026 09:21:57 -0300
Subject: [PATCH] Add a stored normalized name column with an index

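Add a name_normalized field to PythonPackageContent that stores the
pre-computed normalized package name, replacing the
LOWER(REGEXP_REPLACE(name, ...)) expression that was evaluated per query.
The field is populated via a BEFORE_SAVE hook and backfilled for existing
rows by a data migration.
The field is created with db_index=True.
Change all name__normalize= lookups to exact matches on name_normalized.
This eliminates the regex computation at query time.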

closes: #1159
Assisted By: claude-opus-4.6
---
 CHANGES/1159.feature                          |  2 ++
 ...20_pythonpackagecontent_name_normalized.py | 35 +++++++++++++++++++
 pulp_python/app/models.py                     | 19 +++++-----
 pulp_python/app/pypi/views.py                 | 13 ++++---
 pulp_python/app/tasks/publish.py              |  6 ++--
 pulp_python/app/viewsets.py                   | 24 ++++++++++++-
 6 files changed, 81 insertions(+), 18 deletions(-)
 create mode 100644 CHANGES/1159.feature
 create mode 100644 pulp_python/app/migrations/0020_pythonpackagecontent_name_normalized.py

diff --git a/CHANGES/1159.feature b/CHANGES/1159.feature
new file mode 100644
index 000000000..82940a2a8
--- /dev/null
+++ b/CHANGES/1159.feature
@@ -0,0 +1,2 @@
+Added the name_normalized field to the PythonPackageContent model with a database index to replace
+runtime regex normalization, reducing database load for package name lookups.
diff --git a/pulp_python/app/migrations/0020_pythonpackagecontent_name_normalized.py b/pulp_python/app/migrations/0020_pythonpackagecontent_name_normalized.py
new file mode 100644
index 000000000..eef8cddd8
--- /dev/null
+++ b/pulp_python/app/migrations/0020_pythonpackagecontent_name_normalized.py
@@ -0,0 +1,35 @@
+import re
+
+from django.db import migrations, models, transaction
+
+
+def populate_name_normalized(apps, schema_editor):
+    """Backfill name_normalized on existing rows; schema_editor is unused (RunPython signature)."""
+    Package = apps.get_model("python", "PythonPackageContent")
+    pending = []  # rows with a new name_normalized, flushed to the database every 100000
+    for package in Package.objects.only("pk", "name").iterator():
+        package.name_normalized = re.sub(r"[-_.]+", "-", package.name).lower()
+        pending.append(package)
+        if len(pending) == 100000:
+            with transaction.atomic():  # batch_size keeps each UPDATE statement small
+                Package.objects.bulk_update(pending, ["name_normalized"], batch_size=1000)
+                pending = []
+    if pending:
+        with transaction.atomic():
+            Package.objects.bulk_update(pending, ["name_normalized"], batch_size=1000)
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("python", "0019_create_missing_metadata_artifacts"),
+    ]
+
+    operations = [  # order matters: the column must exist before the backfill runs
+        migrations.AddField(
+            model_name="pythonpackagecontent",
+            name="name_normalized",
+            field=models.TextField(db_index=True, default=""),
+        ),
+        migrations.RunPython(populate_name_normalized, migrations.RunPython.noop, elidable=True),
+    ]
diff --git a/pulp_python/app/models.py b/pulp_python/app/models.py
index b7a5e5483..234628912 100644
--- a/pulp_python/app/models.py
+++ b/pulp_python/app/models.py
@@ -115,7 +115,7 @@ def content_handler(self, path):
         if name:
             normalized = canonicalize_name(name)
             package_content = PythonPackageContent.objects.filter(
-                pk__in=self.publication.repository_version.content, name__normalize=normalized
+                pk__in=self.publication.repository_version.content, name_normalized=normalized
             )
             # TODO Change this value to the Repo's serial value when implemented
             headers = {PYPI_LAST_SERIAL: str(PYPI_SERIAL_CONSTANT)}
@@ -136,14 +136,6 @@ class Meta:
         ]
 
 
-class NormalizeName(models.Transform):
-    """A transform field to normalize package names according to PEP426."""
-
-    function = "REGEXP_REPLACE"
-    template = "LOWER(%(function)s(%(expressions)s, '(\.|_|-)', '-', 'ig'))"  # noqa:W605
-    lookup_name = "normalize"
-
-
 class PythonPackageContent(Content):
     """
     A Content Type representing Python's Distribution Package.
@@ -195,6 +187,9 @@ class PythonPackageContent(Content):
     license_expression = models.TextField()
     license_file = models.JSONField(default=list)
 
+    # Stored normalized name for indexed lookups
+    name_normalized = models.TextField(db_index=True, default="")
+
     # Release metadata
     filename = models.TextField(db_index=True)
     packagetype = models.TextField(choices=PACKAGE_TYPES)
@@ -208,9 +203,13 @@ class PythonPackageContent(Content):
     PROTECTED_FROM_RECLAIM = False
     TYPE = "python"
     _pulp_domain = models.ForeignKey("core.Domain", default=get_domain_pk, on_delete=models.PROTECT)
-    name.register_lookup(NormalizeName)
     repo_key_fields = ("filename",)
 
+    @hook(BEFORE_SAVE)
+    def set_name_normalized(self):
+        """Store canonicalize_name(name) in name_normalized before saving, for indexed lookups."""
+        self.name_normalized = canonicalize_name(self.name)
+
     @staticmethod
     def init_from_artifact_and_relative_path(artifact, relative_path):
         """Used when downloading package from pull-through cache."""
diff --git a/pulp_python/app/pypi/views.py b/pulp_python/app/pypi/views.py
index b7808a9ec..ac8d10e3b 100644
--- a/pulp_python/app/pypi/views.py
+++ b/pulp_python/app/pypi/views.py
@@ -303,7 +303,12 @@ def list(self, request, path):
         repo_version, content = self.get_rvc()
         if self.should_redirect(repo_version=repo_version):
             return redirect(urljoin(self.base_content_url, f"{path}/simple/"))
-        names = content.order_by("name").values_list("name", flat=True).distinct().iterator()
+        names = (
+            content.order_by("name_normalized")
+            .values_list("name", flat=True)
+            .distinct("name_normalized")
+            .iterator()
+        )
         media_type = request.accepted_renderer.media_type
         headers = {"X-PyPI-Last-Serial": str(PYPI_SERIAL_CONSTANT)}
 
@@ -361,7 +366,7 @@ def retrieve(self, request, path, package):
         elif self.should_redirect(repo_version=repo_ver):
             return redirect(urljoin(self.base_content_url, f"{path}/simple/{normalized}/"))
         if content:
-            local_packages = content.filter(name__normalize=normalized)
+            local_packages = content.filter(name_normalized=normalized)
             packages = local_packages.values(
                 "filename",
                 "sha256",
@@ -454,7 +459,7 @@ def retrieve(self, request, path, meta):
             name = meta_path.parts[0]
         if name:
             normalized = canonicalize_name(name)
-            package_content = content.filter(name__normalize=normalized)
+            package_content = content.filter(name_normalized=normalized)
             # TODO Change this value to the Repo's serial value when implemented
             headers = {PYPI_LAST_SERIAL: str(PYPI_SERIAL_CONSTANT)}
             if settings.DOMAIN_ENABLED:
@@ -541,7 +546,7 @@ def retrieve(self, request, path, package, version, filename):
         repo_ver, content = self.get_rvc()
         if content:
             package_content = content.filter(
-                name__normalize=package, version=version, filename=filename
+                name_normalized=package, version=version, filename=filename
             ).first()
             if package_content:
                 provenance = self.get_provenances(repo_ver).filter(package=package_content).first()
diff --git a/pulp_python/app/tasks/publish.py b/pulp_python/app/tasks/publish.py
index 4e8a80388..4604a60f9 100644
--- a/pulp_python/app/tasks/publish.py
+++ b/pulp_python/app/tasks/publish.py
@@ -60,9 +60,9 @@ def write_simple_api(publication):
         python_models.PythonPackageContent.objects.filter(
             pk__in=publication.repository_version.content, _pulp_domain=domain
         )
-        .order_by("name__normalize")
+        .order_by("name_normalized")
         .values_list("name", flat=True)
-        .distinct("name__normalize")
+        .distinct("name_normalized")
     )
 
     # write the root index, which lists all of the projects for which there is a package available
@@ -81,7 +81,7 @@ def write_simple_api(publication):
     packages = python_models.PythonPackageContent.objects.filter(
         pk__in=publication.repository_version.content, _pulp_domain=domain
     )
-    releases = packages.order_by("name__normalize").values("name", "filename", "sha256")
+    releases = packages.order_by("name_normalized").values("name", "filename", "sha256")
 
     ind = 0
     current_name = canonicalize_name(project_names[ind])
diff --git a/pulp_python/app/viewsets.py b/pulp_python/app/viewsets.py
index 9aa19fd83..c99d1c2fd 100644
--- a/pulp_python/app/viewsets.py
+++ b/pulp_python/app/viewsets.py
@@ -1,6 +1,9 @@
 from bandersnatch.configuration import BandersnatchConfig
 from django.db import transaction
+from django_filters import CharFilter
+from django_filters.rest_framework import filters as drf_filters
 from drf_spectacular.utils import extend_schema, extend_schema_view
+from packaging.utils import canonicalize_name
 from pathlib import Path
 from rest_framework import status
 from rest_framework.decorators import action
@@ -329,15 +332,34 @@ class PythonDistributionViewSet(core_viewsets.DistributionViewSet, core_viewsets
     }
 
 
+class NormalizedNameFilter(CharFilter):
+    """CharFilter that runs its value(s) through canonicalize_name before filtering."""
+
+    def filter(self, qs, value):
+        """Canonicalize a single name, or each name in a list, then defer to the parent filter."""
+        if isinstance(value, list):
+            value = [canonicalize_name(item) for item in value]
+        elif value:
+            value = canonicalize_name(value)
+        return super().filter(qs, value)
+
+
+class NormalizedNameInFilter(drf_filters.BaseInFilter, NormalizedNameFilter):
+    """In-filter whose values are each canonicalized via NormalizedNameFilter.filter."""
+
+
 class PythonPackageContentFilter(core_viewsets.ContentFilter):
     """
     FilterSet for PythonPackageContent.
     """
 
+    name = NormalizedNameFilter(field_name="name_normalized", lookup_expr="exact")
+    name__in = NormalizedNameInFilter(field_name="name_normalized", lookup_expr="in")
+    name__contains = CharFilter(field_name="name", lookup_expr="contains")
+
     class Meta:
         model = python_models.PythonPackageContent
         fields = {
-            "name": ["exact", "in", "contains"],
             "author": ["exact", "in", "contains"],
             "packagetype": ["exact", "in"],
             "requires_python": ["exact", "in", "contains"],