Skip to content

Commit 2edeffd

Browse files
committed
Add a stored normalized name column with an index
Add a name_normalized field to PythonPackageContent that stores the pre-computed LOWER(REGEXP_REPLACE(name, ...)) value, populated via a BEFORE_SAVE hook and indexed with db_index=True. Change all name__normalize= lookups to exact-match lookups on the stored column (name_normalized=, equivalent to name_normalized__exact=). This eliminates the regex computation at query time. Closes: #1159. Assisted-By: claude-opus-4.6
1 parent f457c52 commit 2edeffd

File tree

5 files changed

+64
-9
lines changed

5 files changed

+64
-9
lines changed

CHANGES/1159.feature

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Added the name_normalized field to PythonPackageContent model with a database index to replace
2+
runtime regex normalization, reducing database load for package name lookups.
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import re
2+
3+
from django.db import migrations, models, transaction
4+
5+
6+
def populate_name_normalized(apps, schema_editor):
    """Populate name_normalized for existing PythonPackageContent rows.

    Forward callable for ``migrations.RunPython``. Every existing row gets
    ``name_normalized`` set to its ``name`` with each run of ``-``, ``_`` and
    ``.`` collapsed to a single ``-`` and the result lower-cased.

    Args:
        apps: Historical app registry passed in by the migration runner; used
            to look up the ``PythonPackageContent`` model.
        schema_editor: Passed in by the migration runner; not used here.
    """
    PythonPackageContent = apps.get_model("python", "PythonPackageContent")

    # Compiled once instead of being looked up again for every row.
    separators = re.compile(r"[-_.]+")

    # Rows buffered in memory and written inside one transaction.atomic() block.
    chunk_size = 100000
    # Rows per UPDATE statement. bulk_update() emits one CASE/WHEN branch per
    # object, so an unbounded 100000-row statement is very large and slow.
    query_size = 1000

    def flush(packages):
        """Write the buffered rows' name_normalized values to the database."""
        with transaction.atomic():
            PythonPackageContent.objects.bulk_update(
                packages, ["name_normalized"], batch_size=query_size
            )

    package_bulk = []
    for package in PythonPackageContent.objects.only("pk", "name").iterator():
        package.name_normalized = separators.sub("-", package.name).lower()
        package_bulk.append(package)
        if len(package_bulk) >= chunk_size:
            flush(package_bulk)
            package_bulk = []
    # Write whatever is left over from the last, partially filled chunk.
    if package_bulk:
        flush(package_bulk)
20+
21+
22+
class Migration(migrations.Migration):
    """Add the indexed name_normalized column, backfill it, and index name."""

    dependencies = [("python", "0019_create_missing_metadata_artifacts")]

    operations = [
        # Step 1: create the new column; existing rows start out as "".
        migrations.AddField(
            model_name="pythonpackagecontent",
            name="name_normalized",
            field=models.TextField(default="", db_index=True),
        ),
        # Step 2: fill the column for rows that already exist. The reverse
        # direction is a no-op.
        migrations.RunPython(
            code=populate_name_normalized,
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
        # Step 3: add an index on the raw name column as well.
        migrations.AlterField(
            model_name="pythonpackagecontent",
            name="name",
            field=models.TextField(db_index=True),
        ),
    ]

pulp_python/app/models.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def content_handler(self, path):
115115
if name:
116116
normalized = canonicalize_name(name)
117117
package_content = PythonPackageContent.objects.filter(
118-
pk__in=self.publication.repository_version.content, name__normalize=normalized
118+
pk__in=self.publication.repository_version.content, name_normalized=normalized
119119
)
120120
# TODO Change this value to the Repo's serial value when implemented
121121
headers = {PYPI_LAST_SERIAL: str(PYPI_SERIAL_CONSTANT)}
@@ -168,7 +168,7 @@ class PythonPackageContent(Content):
168168
keywords = models.TextField()
169169
license = models.TextField() # Deprecated in favour of License-Expression
170170
metadata_version = models.TextField()
171-
name = models.TextField()
171+
name = models.TextField(db_index=True)
172172
platform = models.TextField()
173173
summary = models.TextField()
174174
version = models.TextField()
@@ -195,6 +195,9 @@ class PythonPackageContent(Content):
195195
license_expression = models.TextField()
196196
license_file = models.JSONField(default=list)
197197

198+
# Stored normalized name for indexed lookups (replaces NormalizeName REGEXP_REPLACE)
199+
name_normalized = models.TextField(db_index=True, default="")
200+
198201
# Release metadata
199202
filename = models.TextField(db_index=True)
200203
packagetype = models.TextField(choices=PACKAGE_TYPES)
@@ -211,6 +214,11 @@ class PythonPackageContent(Content):
211214
name.register_lookup(NormalizeName)
212215
repo_key_fields = ("filename",)
213216

217+
@hook(BEFORE_SAVE)
def set_name_normalized(self):
    """Recompute name_normalized from name right before the instance is saved."""
    # NOTE(review): this runs as a BEFORE_SAVE lifecycle hook; presumably bulk
    # write paths (bulk_create / QuerySet.update) bypass it - confirm that
    # callers using those paths set name_normalized themselves.
    normalized = canonicalize_name(self.name)
    self.name_normalized = normalized
221+
214222
@staticmethod
215223
def init_from_artifact_and_relative_path(artifact, relative_path):
216224
"""Used when downloading package from pull-through cache."""

pulp_python/app/pypi/views.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,12 @@ def list(self, request, path):
303303
repo_version, content = self.get_rvc()
304304
if self.should_redirect(repo_version=repo_version):
305305
return redirect(urljoin(self.base_content_url, f"{path}/simple/"))
306-
names = content.order_by("name").values_list("name", flat=True).distinct().iterator()
306+
names = (
307+
content.order_by("name_normalized")
308+
.values_list("name", flat=True)
309+
.distinct("name_normalized")
310+
.iterator()
311+
)
307312
media_type = request.accepted_renderer.media_type
308313
headers = {"X-PyPI-Last-Serial": str(PYPI_SERIAL_CONSTANT)}
309314

@@ -361,7 +366,7 @@ def retrieve(self, request, path, package):
361366
elif self.should_redirect(repo_version=repo_ver):
362367
return redirect(urljoin(self.base_content_url, f"{path}/simple/{normalized}/"))
363368
if content:
364-
local_packages = content.filter(name__normalize=normalized)
369+
local_packages = content.filter(name_normalized=normalized)
365370
packages = local_packages.values(
366371
"filename",
367372
"sha256",
@@ -454,7 +459,7 @@ def retrieve(self, request, path, meta):
454459
name = meta_path.parts[0]
455460
if name:
456461
normalized = canonicalize_name(name)
457-
package_content = content.filter(name__normalize=normalized)
462+
package_content = content.filter(name_normalized=normalized)
458463
# TODO Change this value to the Repo's serial value when implemented
459464
headers = {PYPI_LAST_SERIAL: str(PYPI_SERIAL_CONSTANT)}
460465
if settings.DOMAIN_ENABLED:
@@ -541,7 +546,7 @@ def retrieve(self, request, path, package, version, filename):
541546
repo_ver, content = self.get_rvc()
542547
if content:
543548
package_content = content.filter(
544-
name__normalize=package, version=version, filename=filename
549+
name_normalized=package, version=version, filename=filename
545550
).first()
546551
if package_content:
547552
provenance = self.get_provenances(repo_ver).filter(package=package_content).first()

pulp_python/app/tasks/publish.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,9 @@ def write_simple_api(publication):
6060
python_models.PythonPackageContent.objects.filter(
6161
pk__in=publication.repository_version.content, _pulp_domain=domain
6262
)
63-
.order_by("name__normalize")
63+
.order_by("name_normalized")
6464
.values_list("name", flat=True)
65-
.distinct("name__normalize")
65+
.distinct("name_normalized")
6666
)
6767

6868
# write the root index, which lists all of the projects for which there is a package available
@@ -81,7 +81,7 @@ def write_simple_api(publication):
8181
packages = python_models.PythonPackageContent.objects.filter(
8282
pk__in=publication.repository_version.content, _pulp_domain=domain
8383
)
84-
releases = packages.order_by("name__normalize").values("name", "filename", "sha256")
84+
releases = packages.order_by("name_normalized").values("name", "filename", "sha256")
8585

8686
ind = 0
8787
current_name = canonicalize_name(project_names[ind])

0 commit comments

Comments
 (0)