Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 67 additions & 22 deletions src/openedx_tagging/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
from __future__ import annotations

import logging
import operator
import re
from functools import reduce
from typing import List, Self

from django.core.exceptions import ValidationError
from django.db import models
from django.db.models import F, Value
from django.db.models.functions import Concat, Length, Replace, Substr
from django.db.models import Count, F, IntegerField, Q, Subquery, Value
from django.db.models.functions import Coalesce, Concat, Length, Replace, Substr
from django.utils.functional import cached_property
from django.utils.module_loading import import_string
from django.utils.translation import gettext_lazy as _
Expand Down Expand Up @@ -539,16 +541,7 @@ def _get_filtered_tags_one_level(
qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id")
qs = qs.order_by("value")
if include_counts:
# We need to include the count of how many times this tag is used to tag objects.
# You'd think we could just use:
# qs = qs.annotate(usage_count=models.Count("objecttag__pk"))
# but that adds another join which starts creating a cross product and the children and usage_count become
# intertwined and multiplied with each other. So we use a subquery.
obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate(
# We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
count=models.Func(F('id'), function='Count')
)
qs = qs.annotate(usage_count=models.Subquery(obj_tags.values('count')))
qs = self.add_counts_query(qs)
return qs # type: ignore[return-value]

def _get_filtered_tags_deep(
Expand Down Expand Up @@ -644,18 +637,70 @@ def _get_filtered_tags_deep(
# ordering by it gives the tree sort order that we want.
qs = qs.order_by("lineage")
if include_counts:
# Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level()
obj_tags = (
ObjectTag.objects.filter(tag_id=models.OuterRef("pk"))
.order_by()
.annotate(
# We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
count=models.Func(F("id"), function="Count")
)
)
qs = qs.annotate(usage_count=models.Subquery(obj_tags.values("count")))
qs = self.add_counts_query(qs)

return qs # type: ignore[return-value]

def add_counts_query(self, qs: models.QuerySet):
Comment on lines +640 to +644
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add_counts_query is only used internally by get_filtered_tags paths. Making it a public Taxonomy method expands the model’s API surface unnecessarily; consider renaming to _add_counts_query (or similar) and adding an explicit return type (e.g., TagDataQuerySet/models.QuerySet) to clarify intended usage.

Suggested change
qs = self.add_counts_query(qs)
return qs # type: ignore[return-value]
def add_counts_query(self, qs: models.QuerySet):
qs = self._add_counts_query(qs)
return qs # type: ignore[return-value]
def _add_counts_query(self, qs: TagDataQuerySet) -> TagDataQuerySet:

Copilot uses AI. Check for mistakes.
"""
Adds a subquery to the passed-in queryset that returns the usage_count
for a given tag, or the appropriate count with de-deuplication per Object
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in docstring: “de-deuplication” should be “deduplication”.

Suggested change
for a given tag, or the appropriate count with de-deuplication per Object
for a given tag, or the appropriate count with deduplication per Object

Copilot uses AI. Check for mistakes.
for the parents of a used child tag
:param qs: The QuerySet to annotate with usage counts.
:return: the queryset annotated with the usage counts
Comment on lines +647 to +650
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This docstring uses Sphinx-style ":param"/":return" fields, but other docstrings in this module don’t. For consistency (and to avoid mixed docstring formats), please rewrite this docstring to match the prevailing style used elsewhere in this file.

Suggested change
for a given tag, or the appropriate count with de-deuplication per Object
for the parents of a used child tag
:param qs: The QuerySet to annotate with usage counts.
:return: the queryset annotated with the usage counts
for a given tag, or the appropriate count with de-duplication per object
for the parents of a used child tag.
The ``qs`` argument is the QuerySet to annotate with usage counts, and
the returned queryset is annotated with those usage counts.

Copilot uses AI. Check for mistakes.
"""
# Adds a subquery to the passed-in queryset that returns the number
# of times a tag has been used.
#
# Note: The count is not a simple count, we need to do a 'roll up'
# where we count the number of times a tag is directly used and applied,
# but then that also needs to add a "1" count to the lineage tags
# (parent, grandparent, etc.), but de-duplicate counts for any children
# so that if we have "2" child tags, it only counts towards "1" for the
# parent.
# This query gets the raw counts for each tag usage, gets the distinct
# usages (so de-duplicates counts) by actual application to an "Object"
# (library, course, course module, course section, etc.), which creates
# a count per tag, annotated to that particular tag from the passed-in
# queryset.

# Since Depth may change depending on the value of TAXONOMY_MAX_DEPTH, dynamically
# build a list of lineage paths to be used in the query, so we're not hard coding to
# a certain number of levels. This will build an array containing something like:
# ['tag_id', 'tag__parent_id', 'tag__parent__parent_id', 'tag__parent__parent__parent_id', ...]
lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH+1)]
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PEP8/style consistency: add spaces around the "+" in range(TAXONOMY_MAX_DEPTH+1) (elsewhere in this file it’s written as TAXONOMY_MAX_DEPTH + 1).

Suggested change
lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH+1)]
lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH + 1)]

Copilot uses AI. Check for mistakes.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of using TAXONOMY_MAX_DEPTH for this query, what about using the actual max depth of the current taxonomy? e.g. max_depth = qs.aggregate(models.Max("depth", default=0))["depth__max"] ?


# Combine the above-built lineage with a Q query against the OuterRef("pk"),
lineage_query_list = [Q(**{path: models.OuterRef("pk")}) for path in lineage_paths]

usage_count_qs = ObjectTag.objects.filter(
# Combine the logic built above with an or operator to flesh out a
# lineage query of the form:
# ```
# Q(tag_id=OuterRef('pk')) |
# Q(tag__parent_id=OuterRef('pk')) |
# Q(tag__parent__parent_id=OuterRef('pk')) |
# ...
# ```
# Previously the above was hard coded and needed to be changed with every
# change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in comment: “maintenace” should be “maintenance”.

Suggested change
# change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
# change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenance

Copilot uses AI. Check for mistakes.
# (Thanks Google for helping me build this)
Comment on lines +686 to +687
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parenthetical “Thanks Google for helping me build this” doesn’t add technical context and is likely to become noise in long-lived code. Please remove it or replace it with a brief explanation of the actual rationale/constraints (e.g., why the Q needs to be built dynamically).

Suggested change
# change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
# (Thanks Google for helping me build this)
# change in TAXONOMY_MAX_DEPTH; now it is built dynamically to reduce
# maintenance overhead and keep the query aligned with the configured depth.

Copilot uses AI. Check for mistakes.

reduce(operator.or_, lineage_query_list)
).values('object_id').distinct().annotate(
intermediate_grouping=Value(1, output_field=IntegerField())
).values('intermediate_grouping').annotate(
total_usage=Count('object_id', distinct=True)
).values('total_usage')

qs = qs.annotate(
usage_count=Coalesce(
Subquery(usage_count_qs, output_field=IntegerField()),
0 # Coalesce ensures we return 0 instead of None if there are no usages
)
)
return qs

def add_tag(
self,
tag_value: str,
Expand Down
22 changes: 11 additions & 11 deletions tests/openedx_tagging/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,43 +755,43 @@ def get_object_tags():
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "cha" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "cha" but a child does
" Archaebacteria (used: 1, children: 0)",
]),
("ar", [
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "ar" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "ar" but a child does
" Archaebacteria (used: 1, children: 0)",
"Eukaryota (used: 0, children: 1 + 2)",
" Animalia (used: 1, children: 2)", # does not contain "ar" but a child does
"Eukaryota (used: 6, children: 1 + 2)",
" Animalia (used: 4, children: 2)", # does not contain "ar" but a child does
" Arthropoda (used: 1, children: 0)",
" Cnidaria (used: 0, children: 0)",
]),
("aE", [
"Archaea (used: 1, children: 2)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 1)", # does not contain "ae" but a child does
"Bacteria (used: 1, children: 1)", # does not contain "ae" but a child does
" Archaebacteria (used: 1, children: 0)",
"Eukaryota (used: 0, children: 1)", # does not contain "ae" but a child does
"Eukaryota (used: 6, children: 1)", # does not contain "ae" but a child does
" Plantae (used: 1, children: 0)",
]),
("a", [
"Archaea (used: 1, children: 3)",
" DPANN (used: 0, children: 0)",
" Euryarchaeida (used: 0, children: 0)",
" Proteoarchaeota (used: 0, children: 0)",
"Bacteria (used: 0, children: 2)",
"Bacteria (used: 1, children: 2)",
" Archaebacteria (used: 1, children: 0)",
" Eubacteria (used: 0, children: 0)",
"Eukaryota (used: 0, children: 4 + 8)",
" Animalia (used: 1, children: 7 + 1)",
"Eukaryota (used: 6, children: 4 + 8)",
" Animalia (used: 4, children: 7 + 1)",
" Arthropoda (used: 1, children: 0)",
" Chordata (used: 0, children: 1)",
" Chordata (used: 0, children: 1)", # <<< Chordata has a matching child but we only support searching
" Mammalia (used: 0, children: 0)",
" Cnidaria (used: 0, children: 0)",
" Cnidaria (used: 0, children: 0)", # 3 levels deep at once for now.
" Ctenophora (used: 0, children: 0)",
" Gastrotrich (used: 1, children: 0)",
" Placozoa (used: 1, children: 0)",
Expand Down
Loading
Loading