Improve tag-overview performance

* denormalize IssueTag.key; this allows for key to be used in and index (issue, key, count). * rewrite to grouping-first, per-key-query-second. i.e. reverts part of bbfee84c6a. Reasoning: I don't want to rely on "mostly unique" always guessing correctly, and we don't dynamically determine that yet. Which means that (in the single query version) if you'd have a per-event value for some tag, you could end up iterating over as many values as there are events, which won't work. * in tags.py, do the tab-check first to avoid doing the tag-calculation twice. * further denormalation (of key__key, of value__str) actually turns out to not be required for both the grouping and indivdual queries to be fast. Performance tests, as always, against sqlite3. -- Roads not taken/background * This commit removes a future TODO that "A point _could_ be made for ['issue', '?value?' 'count']", I tried both versions of that index (against the group-then-query version, the only one which I trust) but without denormalization of key, I could not get it to be fast. * I thought about a hybrid approach (for those keys with low counts of values do the single-query thing) but as it stands the extra complexity isn't worth it. --- on the 1.2M events, 3 (user defined) tags / event test env this basically lowers the time from "seconds" to "miliseconds".
2026-03-10 08:01:17 +00:00 · 2025-03-11 22:06:27 +01:00
parent b031792784
commit cd7f3978cf
5 changed files with 42 additions and 31 deletions
--- a/issues/models.py
+++ b/issues/models.py
@@ -1,7 +1,6 @@
 import json
 import uuid
 from functools import partial
-from itertools import groupby

 from django.db import models, transaction
 from django.db.models import F, Value
@@ -128,22 +127,25 @@ class Issue(models.Model):
    def _get_issue_tags(self, other_cutoff, other_label):
        result = []

-        all_issue_tags = (
-            IssueTag.objects
-            .filter(issue=self, value__key__mostly_unique=False)  # note: project is implied through issue
-            .order_by("value__key__key", "-count")
-            .select_related("value", "value__key"))
+        ds = self.tags \
+            .filter(key__mostly_unique=False)\
+            .values("key")\
+            .annotate(count_sum=models.Sum("count"))\
+            .distinct()\
+            .order_by("key__key")

-        # NOTE: at some point in the past we did grouping in SQL first; and then did 1 per-group query to get the
-        # the actual values. This was less efficient in terms of the number of queries, but had the advantage of quickly
-        # stopping work when there were very many values for a certain key. We now rely on the 'mostly_unique' property
-        # for the line below to not do way too much work; if it ever turns out to be so, we may even push the
-        # 'mostly_unique' property to be dynamically calculated (setting it to True when very many values are observed)
-        grouped = groupby(all_issue_tags, lambda x: x.value.key.key)
+        for d in ds:
+            issue_tags = [
+                issue_tag
+                for issue_tag in
+                (IssueTag.objects
+                 .filter(issue=self, key=d['key'])  # note: project is implied through issue
+                 .order_by("-count")
+                 .select_related("value", "key")[:other_cutoff + 1]  # +1 to see if we need to add "Other"
+                 )
+            ]

-        for key, group in grouped:
-            issue_tags = list(group)
-            total_seen = sum(issue_tag.count for issue_tag in issue_tags)
+            total_seen = d["count_sum"]

            seen_till_now = 0
            if len(issue_tags) > other_cutoff:
--- a/issues/templates/issues/base.html
+++ b/issues/templates/issues/base.html
@@ -220,7 +220,7 @@
            </div>
        </div>

-        {% if issue.tags_summary and tab != "tags" %}
+        {% if tab != "tags" and issue.tags_summary %}
        <div class="border-2 mb-4 mr-4"><!-- "issue: tags" box -->
            <div class="font-bold border-b-2">
                <div class="p-4 border-slate-50 text-slate-500">
@@ -230,7 +230,7 @@
            <div class="p-4">
                {% for issuetags in issue.tags_summary %}
                    <div class="mb-4">
-                        <div class="text-sm font-bold text-slate-500">{{ issuetags.0.value.key.key }}:</div>
+                        <div class="text-sm font-bold text-slate-500">{{ issuetags.0.key.key }}:</div>
                        <div>
                        {% for issuetag in issuetags %} 
                            <span>{{ issuetag.value.value }} <span class="text-xs">({{ issuetag.pct }}%)</span>{% if not forloop.last %}</span>,{% endif %}
--- a/issues/templates/issues/tags.html
+++ b/issues/templates/issues/tags.html
@@ -5,7 +5,7 @@

    {% for issuetags in issue.tags_all %}

-        <h1 id="{{ issuetags.0.value.key.key }}" class="text-2xl font-bold mt-4">{{ issuetags.0.value.key.key }}:</h1>
+        <h1 id="{{ issuetags.0.key.key }}" class="text-2xl font-bold mt-4">{{ issuetags.0.key.key }}:</h1>

        <div class="mb-6">
            {% for issuetag in issuetags %} 
--- a/tags/migrations/0001_initial.py
+++ b/tags/migrations/0001_initial.py
@@ -8,8 +8,8 @@ class Migration(migrations.Migration):

    dependencies = [
        ("issues", "0012_alter_issue_calculated_type_and_more"),
-        ("events", "0019_event_storage_backend"),
        ("projects", "0011_fill_stored_event_count"),
+        ("events", "0019_event_storage_backend"),
    ]

    operations = [
@@ -94,6 +94,12 @@ class Migration(migrations.Migration):
                        to="issues.issue",
                    ),
                ),
+                (
+                    "key",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE, to="tags.tagkey"
+                    ),
+                ),
                (
                    "project",
                    models.ForeignKey(
@@ -112,7 +118,8 @@ class Migration(migrations.Migration):
            options={
                "indexes": [
                    models.Index(
-                        fields=["issue"], name="tags_issuet_issue_i_266542_idx"
+                        fields=["issue", "key", "count"],
+                        name="tags_issuet_issue_i_91a1dd_idx",
                    )
                ],
                "unique_together": {("value", "issue")},
--- a/tags/models.py
+++ b/tags/models.py
@@ -106,6 +106,9 @@ class EventTag(models.Model):
 class IssueTag(models.Model):
    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'

+    # denormalization that allows for a single-table-index for efficient search.
+    key = models.ForeignKey(TagKey, blank=False, null=False, on_delete=models.CASCADE)
+
    # value already implies key in our current setup.
    value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE)

@@ -129,15 +132,8 @@ class IssueTag(models.Model):
        unique_together = ('value', 'issue')

        indexes = [
-            # TODO I once had "value" here but I'd rather try without first, i.e. wouldn't the DB you basically "look up
-            # all for issue, filter withouth the use of indexes?
-            models.Index(fields=['issue']),  # for lookups by issue (for detail pages i.e. _get_issue_tags)
-
-            # A point _could_ be made for ['issue', '?value?' 'count'] since we order on that as part of _get_issue_tags
-            # However, I suspect that in practice the sorting on by count is done fast for a small number of items
-            # (and I suspect that if there are very many items, sorting is not useful, because the sparse information
-            # does not lead to much insight... but I might be wrong (a few tags with many values, and a long tail would
-            # be the counter-example))
+            # for lookups by issue (for detail pages i.e. _get_issue_tags)
+            models.Index(fields=['issue', 'key', 'count']),
        ]


@@ -180,7 +176,8 @@ def store_tags(event, issue, tags):
    #         tag_value, _ = TagValue.objects.get_or_create(project_id=event.project_id, key=tag_key, value=value)
    #         EventTag.objects.get_or_create(project_id=event.project_id, value=tag_value, event=event,
    #             defaults={'issue': issue})
-    #         IssueTag.objects.get_or_create(project_id=event.project_id, value=tag_value, issue=issue)
+    #         IssueTag.objects.get_or_create(
+    #            project_id=event.project_id, key_id=tag_value.key_id, value=tag_value, issue=issue)
    #
    #     # the 0-case is implied here too, which avoids some further guards in the code below
    #     return
@@ -220,7 +217,12 @@ def store_tags(event, issue, tags):
    ], ignore_conflicts=True)

    IssueTag.objects.bulk_create([
-        IssueTag(project_id=event.project_id, value=tag_value, issue=issue) for tag_value in tag_value_objects
+        IssueTag(
+            project_id=event.project_id,
+            key_id=tag_value.key_id,
+            value=tag_value,
+            issue=issue,
+        ) for tag_value in tag_value_objects
    ], ignore_conflicts=True)

    # Re 'project_id': this is not needed here, because it's already implied by both tag_value_objects and issue.