openedx · tbain · Mar 18, 2026 · Mar 23, 2026 · Mar 25, 2026 · Mar 26, 2026
diff --git a/src/openedx_tagging/models/base.py b/src/openedx_tagging/models/base.py
@@ -5,13 +5,15 @@
 from __future__ import annotations
 
 import logging
+import operator
 import re
+from functools import reduce
 from typing import List, Self
 
 from django.core.exceptions import ValidationError
 from django.db import models
-from django.db.models import F, Value
-from django.db.models.functions import Concat, Length, Replace, Substr
+from django.db.models import Count, F, IntegerField, Q, Subquery, Value
+from django.db.models.functions import Coalesce, Concat, Length, Replace, Substr
 from django.utils.functional import cached_property
 from django.utils.module_loading import import_string
 from django.utils.translation import gettext_lazy as _
@@ -539,16 +541,7 @@ def _get_filtered_tags_one_level(
         qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id")
         qs = qs.order_by("value")
         if include_counts:
-            # We need to include the count of how many times this tag is used to tag objects.
-            # You'd think we could just use:
-            #     qs = qs.annotate(usage_count=models.Count("objecttag__pk"))
-            # but that adds another join which starts creating a cross product and the children and usage_count become
-            # intertwined and multiplied with each other. So we use a subquery.
-            obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate(
-                # We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
-                count=models.Func(F('id'), function='Count')
-            )
-            qs = qs.annotate(usage_count=models.Subquery(obj_tags.values('count')))
+            qs = self.add_counts_query(qs)
         return qs  # type: ignore[return-value]
 
     def _get_filtered_tags_deep(
@@ -644,18 +637,70 @@ def _get_filtered_tags_deep(
         # ordering by it gives the tree sort order that we want.
         qs = qs.order_by("lineage")
         if include_counts:
-            # Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level()
-            obj_tags = (
-                ObjectTag.objects.filter(tag_id=models.OuterRef("pk"))
-                .order_by()
-                .annotate(
-                    # We need to use Func() to get Count() without GROUP BY - see https://stackoverflow.com/a/69031027
-                    count=models.Func(F("id"), function="Count")
-                )
-            )
-            qs = qs.annotate(usage_count=models.Subquery(obj_tags.values("count")))
+            qs = self.add_counts_query(qs)
+
         return qs  # type: ignore[return-value]
 
+    def add_counts_query(self, qs: models.QuerySet):
-            qs = self.add_counts_query(qs)
-
-        return qs  # type: ignore[return-value]
-
-    def add_counts_query(self, qs: models.QuerySet):
+            qs = self._add_counts_query(qs)
+
+        return qs  # type: ignore[return-value]
+
+    def _add_counts_query(self, qs: TagDataQuerySet) -> TagDataQuerySet:
-            qs = self.add_counts_query(qs)
-
-        return qs  # type: ignore[return-value]
-
-    def add_counts_query(self, qs: models.QuerySet):
+            qs = self._add_counts_query(qs)
+
+        return qs  # type: ignore[return-value]
+
+    def _add_counts_query(self, qs: TagDataQuerySet) -> TagDataQuerySet:
+        """
+        Adds a subquery to the passed-in queryset that returns the usage_count
+        for a given tag, or the appropriate count with de-deuplication per Object
-        for a given tag, or the appropriate count with de-deuplication per Object
+        for a given tag, or the appropriate count with deduplication per Object
-        for a given tag, or the appropriate count with de-deuplication per Object
+        for a given tag, or the appropriate count with deduplication per Object
+        for the parents of a used child tag
+        :param qs: The QuerySet to annotate with usage counts.
+        :return: the queryset annotated with the usage counts
-        for a given tag, or the appropriate count with de-deuplication per Object
-        for the parents of a used child tag
-        :param qs: The QuerySet to annotate with usage counts.
-        :return: the queryset annotated with the usage counts
+        for a given tag, or the appropriate count with de-duplication per object
+        for the parents of a used child tag.
+
+        The ``qs`` argument is the QuerySet to annotate with usage counts, and
+        the returned queryset is annotated with those usage counts.
-        for a given tag, or the appropriate count with de-deuplication per Object
-        for the parents of a used child tag
-        :param qs: The QuerySet to annotate with usage counts.
-        :return: the queryset annotated with the usage counts
+        for a given tag, or the appropriate count with de-duplication per object
+        for the parents of a used child tag.
+
+        The ``qs`` argument is the QuerySet to annotate with usage counts, and
+        the returned queryset is annotated with those usage counts.
+        """
+        # Adds a subquery to the passed-in queryset that returns the number
+        # of times a tag has been used.
+        #
+        # Note: this is not a simple count; it is a "roll up". Every object that
+        # is tagged directly with a tag counts once for that tag, and also adds
+        # "1" to each tag in its lineage (parent, grandparent, etc.). Counts are
+        # de-duplicated per object, so if an object has "2" child tags of the
+        # same parent applied, it still only counts as "1" towards that parent.
+        # The subquery selects the ObjectTag rows for a tag or any of its
+        # descendants and counts the distinct "Objects" they are applied to
+        # (library, course, course module, course section, etc.), which
+        # de-duplicates the usages. The resulting per-tag count is annotated onto
+        # that particular tag in the passed-in queryset.
+
+        # Since the maximum depth depends on the value of TAXONOMY_MAX_DEPTH, dynamically
+        # build the list of lineage paths used in the query, so that we are not hard-coding
+        # a certain number of levels. This builds a list containing something like:
+        # ['tag_id', 'tag__parent_id', 'tag__parent__parent_id', 'tag__parent__parent__parent_id', ...]
+        lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH+1)]
-        lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH+1)]
+        lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH + 1)]
-        lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH+1)]
+        lineage_paths = [f"tag{'__parent' * i}_id" for i in range(TAXONOMY_MAX_DEPTH + 1)]
+
+        # Turn each lineage path into a Q filter that matches it against OuterRef("pk").
+        lineage_query_list = [Q(**{path: models.OuterRef("pk")}) for path in lineage_paths]
+
+        usage_count_qs = ObjectTag.objects.filter(
+            # Combine the logic built above with an or operator to flesh out a
+            # lineage query of the form:
+            # ```
+            #   Q(tag_id=OuterRef('pk')) |
+            #   Q(tag__parent_id=OuterRef('pk')) |
+            #   Q(tag__parent__parent_id=OuterRef('pk')) |
+            #   ...
+            # ```
+            # Previously the above was hard coded and needed to be changed with every
+            # change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
-            # change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
+            # change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenance
-            # change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
+            # change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenance
+            # (Thanks Google for helping me build this)
-            # change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
-            # (Thanks Google for helping me build this)
+            # change in TAXONOMY_MAX_DEPTH; now it is built dynamically to reduce
+            # maintenance overhead and keep the query aligned with the configured depth.
-            # change in TAXONOMY_MAX_DEPTH, now it is dynamic to reduce maintenace
-            # (Thanks Google for helping me build this)
+            # change in TAXONOMY_MAX_DEPTH; now it is built dynamically to reduce
+            # maintenance overhead and keep the query aligned with the configured depth.
+
+            reduce(operator.or_, lineage_query_list)
+        ).values('object_id').distinct().annotate(
+            intermediate_grouping=Value(1, output_field=IntegerField())
+        ).values('intermediate_grouping').annotate(
+            total_usage=Count('object_id', distinct=True)
+        ).values('total_usage')
+
+        qs = qs.annotate(
+            usage_count=Coalesce(
+                Subquery(usage_count_qs, output_field=IntegerField()),
+                0  # Coalesce ensures we return 0 instead of None if there are no usages
+            )
+        )
+        return qs
+
     def add_tag(
         self,
         tag_value: str,

diff --git a/tests/openedx_tagging/test_api.py b/tests/openedx_tagging/test_api.py
@@ -755,43 +755,43 @@ def get_object_tags():
             "Archaea (used: 1, children: 2)",
             "  Euryarchaeida (used: 0, children: 0)",
             "  Proteoarchaeota (used: 0, children: 0)",
-            "Bacteria (used: 0, children: 1)",  # does not contain "cha" but a child does
+            "Bacteria (used: 1, children: 1)",  # does not contain "cha" but a child does
             "  Archaebacteria (used: 1, children: 0)",
         ]),
         ("ar", [
             "Archaea (used: 1, children: 2)",
             "  Euryarchaeida (used: 0, children: 0)",
             "  Proteoarchaeota (used: 0, children: 0)",
-            "Bacteria (used: 0, children: 1)",  # does not contain "ar" but a child does
+            "Bacteria (used: 1, children: 1)",  # does not contain "ar" but a child does
             "  Archaebacteria (used: 1, children: 0)",
-            "Eukaryota (used: 0, children: 1 + 2)",
-            "  Animalia (used: 1, children: 2)",  # does not contain "ar" but a child does
+            "Eukaryota (used: 6, children: 1 + 2)",
+            "  Animalia (used: 4, children: 2)",  # does not contain "ar" but a child does
             "    Arthropoda (used: 1, children: 0)",
             "    Cnidaria (used: 0, children: 0)",
         ]),
         ("aE", [
             "Archaea (used: 1, children: 2)",
             "  Euryarchaeida (used: 0, children: 0)",
             "  Proteoarchaeota (used: 0, children: 0)",
-            "Bacteria (used: 0, children: 1)",  # does not contain "ae" but a child does
+            "Bacteria (used: 1, children: 1)",  # does not contain "ae" but a child does
             "  Archaebacteria (used: 1, children: 0)",
-            "Eukaryota (used: 0, children: 1)",  # does not contain "ae" but a child does
+            "Eukaryota (used: 6, children: 1)",  # does not contain "ae" but a child does
             "  Plantae (used: 1, children: 0)",
         ]),
         ("a", [
             "Archaea (used: 1, children: 3)",
             "  DPANN (used: 0, children: 0)",
             "  Euryarchaeida (used: 0, children: 0)",
             "  Proteoarchaeota (used: 0, children: 0)",
-            "Bacteria (used: 0, children: 2)",
+            "Bacteria (used: 1, children: 2)",
             "  Archaebacteria (used: 1, children: 0)",
             "  Eubacteria (used: 0, children: 0)",
-            "Eukaryota (used: 0, children: 4 + 8)",
-            "  Animalia (used: 1, children: 7 + 1)",
+            "Eukaryota (used: 6, children: 4 + 8)",
+            "  Animalia (used: 4, children: 7 + 1)",
             "    Arthropoda (used: 1, children: 0)",
-            "    Chordata (used: 0, children: 1)",
+            "    Chordata (used: 0, children: 1)",  # <<< Chordata has a matching child but we only support searching
             "      Mammalia (used: 0, children: 0)",
-            "    Cnidaria (used: 0, children: 0)",
+            "    Cnidaria (used: 0, children: 0)",  # 3 levels deep at once for now.
             "    Ctenophora (used: 0, children: 0)",
             "    Gastrotrich (used: 1, children: 0)",
             "    Placozoa (used: 1, children: 0)",