Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES/1159.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Added the name_normalized field, backed by a database index, to the PythonPackageContent model.
It replaces runtime regex normalization and reduces database load for package name lookups.
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import re

from django.db import migrations, models, transaction


def populate_name_normalized(apps, schema_editor):
    """
    Backfill ``name_normalized`` on every existing PythonPackageContent row.

    Each name has runs of ``-``, ``_`` and ``.`` collapsed into a single ``-``
    and is then lower-cased. Rows are streamed from the database and written
    back in groups of 100000, each group inside its own ``transaction.atomic()``
    block.

    Args:
        apps: Historical app registry supplied by the migration framework; used
            to look up the historical ``PythonPackageContent`` model.
        schema_editor: Supplied by the migration framework; not used here.
    """
    PythonPackageContent = apps.get_model("python", "PythonPackageContent")

    def flush(rows):
        # One bulk UPDATE per group, wrapped in its own atomic block.
        with transaction.atomic():
            PythonPackageContent.objects.bulk_update(rows, ["name_normalized"])

    pending = []
    # Only the primary key and the name are loaded; nothing else is needed to
    # compute the normalized value.
    for row in PythonPackageContent.objects.only("pk", "name").iterator():
        row.name_normalized = re.sub(r"[-_.]+", "-", row.name).lower()
        pending.append(row)
        if len(pending) == 100000:
            flush(pending)
            pending = []

    # Write whatever is left over after the last full group.
    if pending:
        flush(pending)


class Migration(migrations.Migration):
    """Add the indexed ``name_normalized`` column and backfill it for existing rows."""

    dependencies = [("python", "0019_create_missing_metadata_artifacts")]

    operations = [
        # Schema step: new indexed text column; existing rows start out as "".
        migrations.AddField(
            model_name="pythonpackagecontent",
            name="name_normalized",
            field=models.TextField(db_index=True, default=""),
        ),
        # Data step: fill the column for rows that already exist. Reversing is a
        # no-op, and the operation is marked elidable.
        migrations.RunPython(
            code=populate_name_normalized,
            reverse_code=migrations.RunPython.noop,
            elidable=True,
        ),
    ]
10 changes: 9 additions & 1 deletion pulp_python/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def content_handler(self, path):
if name:
normalized = canonicalize_name(name)
package_content = PythonPackageContent.objects.filter(
pk__in=self.publication.repository_version.content, name__normalize=normalized
pk__in=self.publication.repository_version.content, name_normalized=normalized
)
# TODO Change this value to the Repo's serial value when implemented
headers = {PYPI_LAST_SERIAL: str(PYPI_SERIAL_CONSTANT)}
Expand Down Expand Up @@ -195,6 +195,9 @@ class PythonPackageContent(Content):
license_expression = models.TextField()
license_file = models.JSONField(default=list)

# Stored normalized name for indexed lookups (replaces NormalizeName REGEXP_REPLACE)
name_normalized = models.TextField(db_index=True, default="")

# Release metadata
filename = models.TextField(db_index=True)
packagetype = models.TextField(choices=PACKAGE_TYPES)
Expand All @@ -211,6 +214,11 @@ class PythonPackageContent(Content):
name.register_lookup(NormalizeName)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need name.register_lookup(NormalizeName) with this change?

repo_key_fields = ("filename",)

@hook(BEFORE_SAVE)
def set_name_normalized(self):
    """
    Store the canonical form of ``name`` in ``name_normalized``.

    Registered as a ``BEFORE_SAVE`` hook so the stored value is recomputed from
    ``self.name`` with ``canonicalize_name`` each time the hook fires.
    """
    canonical = canonicalize_name(self.name)
    self.name_normalized = canonical

@staticmethod
def init_from_artifact_and_relative_path(artifact, relative_path):
"""Used when downloading package from pull-through cache."""
Expand Down
13 changes: 9 additions & 4 deletions pulp_python/app/pypi/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,12 @@ def list(self, request, path):
repo_version, content = self.get_rvc()
if self.should_redirect(repo_version=repo_version):
return redirect(urljoin(self.base_content_url, f"{path}/simple/"))
names = content.order_by("name").values_list("name", flat=True).distinct().iterator()
names = (
content.order_by("name_normalized")
.values_list("name", flat=True)
.distinct("name_normalized")
.iterator()
)
media_type = request.accepted_renderer.media_type
headers = {"X-PyPI-Last-Serial": str(PYPI_SERIAL_CONSTANT)}

Expand Down Expand Up @@ -361,7 +366,7 @@ def retrieve(self, request, path, package):
elif self.should_redirect(repo_version=repo_ver):
return redirect(urljoin(self.base_content_url, f"{path}/simple/{normalized}/"))
if content:
local_packages = content.filter(name__normalize=normalized)
local_packages = content.filter(name_normalized=normalized)
packages = local_packages.values(
"filename",
"sha256",
Expand Down Expand Up @@ -454,7 +459,7 @@ def retrieve(self, request, path, meta):
name = meta_path.parts[0]
if name:
normalized = canonicalize_name(name)
package_content = content.filter(name__normalize=normalized)
package_content = content.filter(name_normalized=normalized)
# TODO Change this value to the Repo's serial value when implemented
headers = {PYPI_LAST_SERIAL: str(PYPI_SERIAL_CONSTANT)}
if settings.DOMAIN_ENABLED:
Expand Down Expand Up @@ -541,7 +546,7 @@ def retrieve(self, request, path, package, version, filename):
repo_ver, content = self.get_rvc()
if content:
package_content = content.filter(
name__normalize=package, version=version, filename=filename
name_normalized=package, version=version, filename=filename
).first()
if package_content:
provenance = self.get_provenances(repo_ver).filter(package=package_content).first()
Expand Down
6 changes: 3 additions & 3 deletions pulp_python/app/tasks/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def write_simple_api(publication):
python_models.PythonPackageContent.objects.filter(
pk__in=publication.repository_version.content, _pulp_domain=domain
)
.order_by("name__normalize")
.order_by("name_normalized")
.values_list("name", flat=True)
.distinct("name__normalize")
.distinct("name_normalized")
)

# write the root index, which lists all of the projects for which there is a package available
Expand All @@ -81,7 +81,7 @@ def write_simple_api(publication):
packages = python_models.PythonPackageContent.objects.filter(
pk__in=publication.repository_version.content, _pulp_domain=domain
)
releases = packages.order_by("name__normalize").values("name", "filename", "sha256")
releases = packages.order_by("name_normalized").values("name", "filename", "sha256")

ind = 0
current_name = canonicalize_name(project_names[ind])
Expand Down
24 changes: 23 additions & 1 deletion pulp_python/app/viewsets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from bandersnatch.configuration import BandersnatchConfig
from django.db import transaction
from django_filters import CharFilter
from django_filters.rest_framework import filters as drf_filters
from drf_spectacular.utils import extend_schema, extend_schema_view
from packaging.utils import canonicalize_name
from pathlib import Path
from rest_framework import status
from rest_framework.decorators import action
Expand Down Expand Up @@ -329,15 +332,34 @@ class PythonDistributionViewSet(core_viewsets.DistributionViewSet, core_viewsets
}


class NormalizedNameFilter(CharFilter):
    """
    Character filter that canonicalizes its input before filtering.

    The incoming value is passed through ``canonicalize_name`` so it can be
    compared against a column that stores already-normalized names.
    """

    def filter(self, qs, value):
        """
        Canonicalize ``value`` and delegate to the parent filter.

        Args:
            qs: The queryset to filter.
            value: A single name or a list of names. Falsy values are handed to
                the parent implementation unchanged.

        Returns:
            Whatever the parent ``filter`` returns for the canonicalized value.
        """
        if not value:
            return super().filter(qs, value)

        if isinstance(value, list):
            canonical = list(map(canonicalize_name, value))
        else:
            canonical = canonicalize_name(value)
        return super().filter(qs, canonical)


class NormalizedNameInFilter(drf_filters.BaseInFilter, NormalizedNameFilter):
    """
    In-filter variant of NormalizedNameFilter.

    Combines ``BaseInFilter`` with ``NormalizedNameFilter`` so that each value in
    the supplied list is canonicalized before the lookup is applied.
    """


class PythonPackageContentFilter(core_viewsets.ContentFilter):
"""
FilterSet for PythonPackageContent.
"""

name = NormalizedNameFilter(field_name="name_normalized", lookup_expr="exact")
name__in = NormalizedNameInFilter(field_name="name_normalized", lookup_expr="in")
name__contains = CharFilter(field_name="name", lookup_expr="contains")

class Meta:
model = python_models.PythonPackageContent
fields = {
"name": ["exact", "in", "contains"],
"author": ["exact", "in", "contains"],
"packagetype": ["exact", "in"],
"requires_python": ["exact", "in", "contains"],
Expand Down
Loading