Improve tag-overview performance

* denormalize IssueTag.key; this allows for key to be used in and index
  (issue, key, count).

* rewrite to grouping-first, per-key-query-second. i.e. reverts part of
  bbfee84c6a. Reasoning: I don't want to rely on "mostly unique" always
  guessing correctly, and we don't dynamically determine that yet. Which
  means that (in the single query version) if you'd have a per-event value for
  some tag, you could end up iterating over as many values as there are events,
  which won't work.

* in tags.py, do the tab-check first to avoid doing the tag-calculation twice.

* further denormalation (of key__key, of value__str) actually turns out to not
  be required for both the grouping and indivdual queries to be fast.

Performance tests, as always, against sqlite3.

--

Roads not taken/background

* This commit removes a future TODO that "A point _could_ be made for
  ['issue', '?value?' 'count']", I tried both versions of that index
  (against the group-then-query version, the only one which I trust)
  but without denormalization of key, I could not get it to be fast.

* I thought about a hybrid approach (for those keys with low counts of values
  do the single-query thing) but as it stands the extra complexity isn't worth
  it.

---
on the 1.2M events, 3 (user defined) tags / event test env this
basically lowers the time from "seconds" to "miliseconds".
This commit is contained in:
Klaas van Schelven
2025-03-11 22:06:27 +01:00
parent b031792784
commit cd7f3978cf
5 changed files with 42 additions and 31 deletions

View File

@@ -1,7 +1,6 @@
import json
import uuid
from functools import partial
from itertools import groupby
from django.db import models, transaction
from django.db.models import F, Value
@@ -128,22 +127,25 @@ class Issue(models.Model):
def _get_issue_tags(self, other_cutoff, other_label):
result = []
all_issue_tags = (
IssueTag.objects
.filter(issue=self, value__key__mostly_unique=False) # note: project is implied through issue
.order_by("value__key__key", "-count")
.select_related("value", "value__key"))
ds = self.tags \
.filter(key__mostly_unique=False)\
.values("key")\
.annotate(count_sum=models.Sum("count"))\
.distinct()\
.order_by("key__key")
# NOTE: at some point in the past we did grouping in SQL first; and then did 1 per-group query to get the
# the actual values. This was less efficient in terms of the number of queries, but had the advantage of quickly
# stopping work when there were very many values for a certain key. We now rely on the 'mostly_unique' property
# for the line below to not do way too much work; if it ever turns out to be so, we may even push the
# 'mostly_unique' property to be dynamically calculated (setting it to True when very many values are observed)
grouped = groupby(all_issue_tags, lambda x: x.value.key.key)
for d in ds:
issue_tags = [
issue_tag
for issue_tag in
(IssueTag.objects
.filter(issue=self, key=d['key']) # note: project is implied through issue
.order_by("-count")
.select_related("value", "key")[:other_cutoff + 1] # +1 to see if we need to add "Other"
)
]
for key, group in grouped:
issue_tags = list(group)
total_seen = sum(issue_tag.count for issue_tag in issue_tags)
total_seen = d["count_sum"]
seen_till_now = 0
if len(issue_tags) > other_cutoff:

View File

@@ -220,7 +220,7 @@
</div>
</div>
{% if issue.tags_summary and tab != "tags" %}
{% if tab != "tags" and issue.tags_summary %}
<div class="border-2 mb-4 mr-4"><!-- "issue: tags" box -->
<div class="font-bold border-b-2">
<div class="p-4 border-slate-50 text-slate-500">
@@ -230,7 +230,7 @@
<div class="p-4">
{% for issuetags in issue.tags_summary %}
<div class="mb-4">
<div class="text-sm font-bold text-slate-500">{{ issuetags.0.value.key.key }}:</div>
<div class="text-sm font-bold text-slate-500">{{ issuetags.0.key.key }}:</div>
<div>
{% for issuetag in issuetags %}
<span>{{ issuetag.value.value }} <span class="text-xs">({{ issuetag.pct }}%)</span>{% if not forloop.last %}</span>,{% endif %}

View File

@@ -5,7 +5,7 @@
{% for issuetags in issue.tags_all %}
<h1 id="{{ issuetags.0.value.key.key }}" class="text-2xl font-bold mt-4">{{ issuetags.0.value.key.key }}:</h1>
<h1 id="{{ issuetags.0.key.key }}" class="text-2xl font-bold mt-4">{{ issuetags.0.key.key }}:</h1>
<div class="mb-6">
{% for issuetag in issuetags %}

View File

@@ -8,8 +8,8 @@ class Migration(migrations.Migration):
dependencies = [
("issues", "0012_alter_issue_calculated_type_and_more"),
("events", "0019_event_storage_backend"),
("projects", "0011_fill_stored_event_count"),
("events", "0019_event_storage_backend"),
]
operations = [
@@ -94,6 +94,12 @@ class Migration(migrations.Migration):
to="issues.issue",
),
),
(
"key",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE, to="tags.tagkey"
),
),
(
"project",
models.ForeignKey(
@@ -112,7 +118,8 @@ class Migration(migrations.Migration):
options={
"indexes": [
models.Index(
fields=["issue"], name="tags_issuet_issue_i_266542_idx"
fields=["issue", "key", "count"],
name="tags_issuet_issue_i_91a1dd_idx",
)
],
"unique_together": {("value", "issue")},

View File

@@ -106,6 +106,9 @@ class EventTag(models.Model):
class IssueTag(models.Model):
project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later'
# denormalization that allows for a single-table-index for efficient search.
key = models.ForeignKey(TagKey, blank=False, null=False, on_delete=models.CASCADE)
# value already implies key in our current setup.
value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE)
@@ -129,15 +132,8 @@ class IssueTag(models.Model):
unique_together = ('value', 'issue')
indexes = [
# TODO I once had "value" here but I'd rather try without first, i.e. wouldn't the DB you basically "look up
# all for issue, filter withouth the use of indexes?
models.Index(fields=['issue']), # for lookups by issue (for detail pages i.e. _get_issue_tags)
# A point _could_ be made for ['issue', '?value?' 'count'] since we order on that as part of _get_issue_tags
# However, I suspect that in practice the sorting on by count is done fast for a small number of items
# (and I suspect that if there are very many items, sorting is not useful, because the sparse information
# does not lead to much insight... but I might be wrong (a few tags with many values, and a long tail would
# be the counter-example))
# for lookups by issue (for detail pages i.e. _get_issue_tags)
models.Index(fields=['issue', 'key', 'count']),
]
@@ -180,7 +176,8 @@ def store_tags(event, issue, tags):
# tag_value, _ = TagValue.objects.get_or_create(project_id=event.project_id, key=tag_key, value=value)
# EventTag.objects.get_or_create(project_id=event.project_id, value=tag_value, event=event,
# defaults={'issue': issue})
# IssueTag.objects.get_or_create(project_id=event.project_id, value=tag_value, issue=issue)
# IssueTag.objects.get_or_create(
# project_id=event.project_id, key_id=tag_value.key_id, value=tag_value, issue=issue)
#
# # the 0-case is implied here too, which avoids some further guards in the code below
# return
@@ -220,7 +217,12 @@ def store_tags(event, issue, tags):
], ignore_conflicts=True)
IssueTag.objects.bulk_create([
IssueTag(project_id=event.project_id, value=tag_value, issue=issue) for tag_value in tag_value_objects
IssueTag(
project_id=event.project_id,
key_id=tag_value.key_id,
value=tag_value,
issue=issue,
) for tag_value in tag_value_objects
], ignore_conflicts=True)
# Re 'project_id': this is not needed here, because it's already implied by both tag_value_objects and issue.