From cd7f3978cf37e267566d98b2ed2f2f4236c52f71 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 11 Mar 2025 22:06:27 +0100 Subject: [PATCH] Improve tag-overview performance * denormalize IssueTag.key; this allows for key to be used in and index (issue, key, count). * rewrite to grouping-first, per-key-query-second. i.e. reverts part of bbfee84c6a1a. Reasoning: I don't want to rely on "mostly unique" always guessing correctly, and we don't dynamically determine that yet. Which means that (in the single query version) if you'd have a per-event value for some tag, you could end up iterating over as many values as there are events, which won't work. * in tags.py, do the tab-check first to avoid doing the tag-calculation twice. * further denormalation (of key__key, of value__str) actually turns out to not be required for both the grouping and indivdual queries to be fast. Performance tests, as always, against sqlite3. -- Roads not taken/background * This commit removes a future TODO that "A point _could_ be made for ['issue', '?value?' 'count']", I tried both versions of that index (against the group-then-query version, the only one which I trust) but without denormalization of key, I could not get it to be fast. * I thought about a hybrid approach (for those keys with low counts of values do the single-query thing) but as it stands the extra complexity isn't worth it. --- on the 1.2M events, 3 (user defined) tags / event test env this basically lowers the time from "seconds" to "miliseconds". --- issues/models.py | 32 ++++++++++++++++--------------- issues/templates/issues/base.html | 4 ++-- issues/templates/issues/tags.html | 2 +- tags/migrations/0001_initial.py | 11 +++++++++-- tags/models.py | 24 ++++++++++++----------- 5 files changed, 42 insertions(+), 31 deletions(-) diff --git a/issues/models.py b/issues/models.py index b019bee..3feafec 100644 --- a/issues/models.py +++ b/issues/models.py @@ -1,7 +1,6 @@ import json import uuid from functools import partial -from itertools import groupby from django.db import models, transaction from django.db.models import F, Value @@ -128,22 +127,25 @@ class Issue(models.Model): def _get_issue_tags(self, other_cutoff, other_label): result = [] - all_issue_tags = ( - IssueTag.objects - .filter(issue=self, value__key__mostly_unique=False) # note: project is implied through issue - .order_by("value__key__key", "-count") - .select_related("value", "value__key")) + ds = self.tags \ + .filter(key__mostly_unique=False)\ + .values("key")\ + .annotate(count_sum=models.Sum("count"))\ + .distinct()\ + .order_by("key__key") - # NOTE: at some point in the past we did grouping in SQL first; and then did 1 per-group query to get the - # the actual values. This was less efficient in terms of the number of queries, but had the advantage of quickly - # stopping work when there were very many values for a certain key. We now rely on the 'mostly_unique' property - # for the line below to not do way too much work; if it ever turns out to be so, we may even push the - # 'mostly_unique' property to be dynamically calculated (setting it to True when very many values are observed) - grouped = groupby(all_issue_tags, lambda x: x.value.key.key) + for d in ds: + issue_tags = [ + issue_tag + for issue_tag in + (IssueTag.objects + .filter(issue=self, key=d['key']) # note: project is implied through issue + .order_by("-count") + .select_related("value", "key")[:other_cutoff + 1] # +1 to see if we need to add "Other" + ) + ] - for key, group in grouped: - issue_tags = list(group) - total_seen = sum(issue_tag.count for issue_tag in issue_tags) + total_seen = d["count_sum"] seen_till_now = 0 if len(issue_tags) > other_cutoff: diff --git a/issues/templates/issues/base.html b/issues/templates/issues/base.html index d3e503c..0c0b58d 100644 --- a/issues/templates/issues/base.html +++ b/issues/templates/issues/base.html @@ -220,7 +220,7 @@ - {% if issue.tags_summary and tab != "tags" %} + {% if tab != "tags" and issue.tags_summary %}
@@ -230,7 +230,7 @@
{% for issuetags in issue.tags_summary %}
-
{{ issuetags.0.value.key.key }}:
+
{{ issuetags.0.key.key }}:
{% for issuetag in issuetags %} {{ issuetag.value.value }} ({{ issuetag.pct }}%){% if not forloop.last %},{% endif %} diff --git a/issues/templates/issues/tags.html b/issues/templates/issues/tags.html index 5691739..6dc8bec 100644 --- a/issues/templates/issues/tags.html +++ b/issues/templates/issues/tags.html @@ -5,7 +5,7 @@ {% for issuetags in issue.tags_all %} -

{{ issuetags.0.value.key.key }}:

+

{{ issuetags.0.key.key }}:

{% for issuetag in issuetags %} diff --git a/tags/migrations/0001_initial.py b/tags/migrations/0001_initial.py index 0a433e1..369ba4b 100644 --- a/tags/migrations/0001_initial.py +++ b/tags/migrations/0001_initial.py @@ -8,8 +8,8 @@ class Migration(migrations.Migration): dependencies = [ ("issues", "0012_alter_issue_calculated_type_and_more"), - ("events", "0019_event_storage_backend"), ("projects", "0011_fill_stored_event_count"), + ("events", "0019_event_storage_backend"), ] operations = [ @@ -94,6 +94,12 @@ class Migration(migrations.Migration): to="issues.issue", ), ), + ( + "key", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="tags.tagkey" + ), + ), ( "project", models.ForeignKey( @@ -112,7 +118,8 @@ class Migration(migrations.Migration): options={ "indexes": [ models.Index( - fields=["issue"], name="tags_issuet_issue_i_266542_idx" + fields=["issue", "key", "count"], + name="tags_issuet_issue_i_91a1dd_idx", ) ], "unique_together": {("value", "issue")}, diff --git a/tags/models.py b/tags/models.py index 2ecc6d4..dc98873 100644 --- a/tags/models.py +++ b/tags/models.py @@ -106,6 +106,9 @@ class EventTag(models.Model): class IssueTag(models.Model): project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' + # denormalization that allows for a single-table-index for efficient search. + key = models.ForeignKey(TagKey, blank=False, null=False, on_delete=models.CASCADE) + # value already implies key in our current setup. value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE) @@ -129,15 +132,8 @@ class IssueTag(models.Model): unique_together = ('value', 'issue') indexes = [ - # TODO I once had "value" here but I'd rather try without first, i.e. wouldn't the DB you basically "look up - # all for issue, filter withouth the use of indexes? - models.Index(fields=['issue']), # for lookups by issue (for detail pages i.e. _get_issue_tags) - - # A point _could_ be made for ['issue', '?value?' 'count'] since we order on that as part of _get_issue_tags - # However, I suspect that in practice the sorting on by count is done fast for a small number of items - # (and I suspect that if there are very many items, sorting is not useful, because the sparse information - # does not lead to much insight... but I might be wrong (a few tags with many values, and a long tail would - # be the counter-example)) + # for lookups by issue (for detail pages i.e. _get_issue_tags) + models.Index(fields=['issue', 'key', 'count']), ] @@ -180,7 +176,8 @@ def store_tags(event, issue, tags): # tag_value, _ = TagValue.objects.get_or_create(project_id=event.project_id, key=tag_key, value=value) # EventTag.objects.get_or_create(project_id=event.project_id, value=tag_value, event=event, # defaults={'issue': issue}) - # IssueTag.objects.get_or_create(project_id=event.project_id, value=tag_value, issue=issue) + # IssueTag.objects.get_or_create( + # project_id=event.project_id, key_id=tag_value.key_id, value=tag_value, issue=issue) # # # the 0-case is implied here too, which avoids some further guards in the code below # return @@ -220,7 +217,12 @@ def store_tags(event, issue, tags): ], ignore_conflicts=True) IssueTag.objects.bulk_create([ - IssueTag(project_id=event.project_id, value=tag_value, issue=issue) for tag_value in tag_value_objects + IssueTag( + project_id=event.project_id, + key_id=tag_value.key_id, + value=tag_value, + issue=issue, + ) for tag_value in tag_value_objects ], ignore_conflicts=True) # Re 'project_id': this is not needed here, because it's already implied by both tag_value_objects and issue.