mirror of
https://github.com/jlengrand/bugsink.git
synced 2026-03-10 08:01:17 +00:00
Improve tag-overview performance
* denormalize IssueTag.key; this allows for key to be used in and index
(issue, key, count).
* rewrite to grouping-first, per-key-query-second. i.e. reverts part of
bbfee84c6a. Reasoning: I don't want to rely on "mostly unique" always
guessing correctly, and we don't dynamically determine that yet. Which
means that (in the single query version) if you'd have a per-event value for
some tag, you could end up iterating over as many values as there are events,
which won't work.
* in tags.py, do the tab-check first to avoid doing the tag-calculation twice.
* further denormalation (of key__key, of value__str) actually turns out to not
be required for both the grouping and indivdual queries to be fast.
Performance tests, as always, against sqlite3.
--
Roads not taken/background
* This commit removes a future TODO that "A point _could_ be made for
['issue', '?value?' 'count']", I tried both versions of that index
(against the group-then-query version, the only one which I trust)
but without denormalization of key, I could not get it to be fast.
* I thought about a hybrid approach (for those keys with low counts of values
do the single-query thing) but as it stands the extra complexity isn't worth
it.
---
on the 1.2M events, 3 (user defined) tags / event test env this
basically lowers the time from "seconds" to "miliseconds".
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
import json
|
||||
import uuid
|
||||
from functools import partial
|
||||
from itertools import groupby
|
||||
|
||||
from django.db import models, transaction
|
||||
from django.db.models import F, Value
|
||||
@@ -128,22 +127,25 @@ class Issue(models.Model):
|
||||
def _get_issue_tags(self, other_cutoff, other_label):
|
||||
result = []
|
||||
|
||||
all_issue_tags = (
|
||||
IssueTag.objects
|
||||
.filter(issue=self, value__key__mostly_unique=False) # note: project is implied through issue
|
||||
.order_by("value__key__key", "-count")
|
||||
.select_related("value", "value__key"))
|
||||
ds = self.tags \
|
||||
.filter(key__mostly_unique=False)\
|
||||
.values("key")\
|
||||
.annotate(count_sum=models.Sum("count"))\
|
||||
.distinct()\
|
||||
.order_by("key__key")
|
||||
|
||||
# NOTE: at some point in the past we did grouping in SQL first; and then did 1 per-group query to get the
|
||||
# the actual values. This was less efficient in terms of the number of queries, but had the advantage of quickly
|
||||
# stopping work when there were very many values for a certain key. We now rely on the 'mostly_unique' property
|
||||
# for the line below to not do way too much work; if it ever turns out to be so, we may even push the
|
||||
# 'mostly_unique' property to be dynamically calculated (setting it to True when very many values are observed)
|
||||
grouped = groupby(all_issue_tags, lambda x: x.value.key.key)
|
||||
for d in ds:
|
||||
issue_tags = [
|
||||
issue_tag
|
||||
for issue_tag in
|
||||
(IssueTag.objects
|
||||
.filter(issue=self, key=d['key']) # note: project is implied through issue
|
||||
.order_by("-count")
|
||||
.select_related("value", "key")[:other_cutoff + 1] # +1 to see if we need to add "Other"
|
||||
)
|
||||
]
|
||||
|
||||
for key, group in grouped:
|
||||
issue_tags = list(group)
|
||||
total_seen = sum(issue_tag.count for issue_tag in issue_tags)
|
||||
total_seen = d["count_sum"]
|
||||
|
||||
seen_till_now = 0
|
||||
if len(issue_tags) > other_cutoff:
|
||||
|
||||
@@ -220,7 +220,7 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if issue.tags_summary and tab != "tags" %}
|
||||
{% if tab != "tags" and issue.tags_summary %}
|
||||
<div class="border-2 mb-4 mr-4"><!-- "issue: tags" box -->
|
||||
<div class="font-bold border-b-2">
|
||||
<div class="p-4 border-slate-50 text-slate-500">
|
||||
@@ -230,7 +230,7 @@
|
||||
<div class="p-4">
|
||||
{% for issuetags in issue.tags_summary %}
|
||||
<div class="mb-4">
|
||||
<div class="text-sm font-bold text-slate-500">{{ issuetags.0.value.key.key }}:</div>
|
||||
<div class="text-sm font-bold text-slate-500">{{ issuetags.0.key.key }}:</div>
|
||||
<div>
|
||||
{% for issuetag in issuetags %}
|
||||
<span>{{ issuetag.value.value }} <span class="text-xs">({{ issuetag.pct }}%)</span>{% if not forloop.last %}</span>,{% endif %}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
{% for issuetags in issue.tags_all %}
|
||||
|
||||
<h1 id="{{ issuetags.0.value.key.key }}" class="text-2xl font-bold mt-4">{{ issuetags.0.value.key.key }}:</h1>
|
||||
<h1 id="{{ issuetags.0.key.key }}" class="text-2xl font-bold mt-4">{{ issuetags.0.key.key }}:</h1>
|
||||
|
||||
<div class="mb-6">
|
||||
{% for issuetag in issuetags %}
|
||||
|
||||
@@ -8,8 +8,8 @@ class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("issues", "0012_alter_issue_calculated_type_and_more"),
|
||||
("events", "0019_event_storage_backend"),
|
||||
("projects", "0011_fill_stored_event_count"),
|
||||
("events", "0019_event_storage_backend"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
@@ -94,6 +94,12 @@ class Migration(migrations.Migration):
|
||||
to="issues.issue",
|
||||
),
|
||||
),
|
||||
(
|
||||
"key",
|
||||
models.ForeignKey(
|
||||
on_delete=django.db.models.deletion.CASCADE, to="tags.tagkey"
|
||||
),
|
||||
),
|
||||
(
|
||||
"project",
|
||||
models.ForeignKey(
|
||||
@@ -112,7 +118,8 @@ class Migration(migrations.Migration):
|
||||
options={
|
||||
"indexes": [
|
||||
models.Index(
|
||||
fields=["issue"], name="tags_issuet_issue_i_266542_idx"
|
||||
fields=["issue", "key", "count"],
|
||||
name="tags_issuet_issue_i_91a1dd_idx",
|
||||
)
|
||||
],
|
||||
"unique_together": {("value", "issue")},
|
||||
|
||||
@@ -106,6 +106,9 @@ class EventTag(models.Model):
|
||||
class IssueTag(models.Model):
|
||||
project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later'
|
||||
|
||||
# denormalization that allows for a single-table-index for efficient search.
|
||||
key = models.ForeignKey(TagKey, blank=False, null=False, on_delete=models.CASCADE)
|
||||
|
||||
# value already implies key in our current setup.
|
||||
value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE)
|
||||
|
||||
@@ -129,15 +132,8 @@ class IssueTag(models.Model):
|
||||
unique_together = ('value', 'issue')
|
||||
|
||||
indexes = [
|
||||
# TODO I once had "value" here but I'd rather try without first, i.e. wouldn't the DB you basically "look up
|
||||
# all for issue, filter withouth the use of indexes?
|
||||
models.Index(fields=['issue']), # for lookups by issue (for detail pages i.e. _get_issue_tags)
|
||||
|
||||
# A point _could_ be made for ['issue', '?value?' 'count'] since we order on that as part of _get_issue_tags
|
||||
# However, I suspect that in practice the sorting on by count is done fast for a small number of items
|
||||
# (and I suspect that if there are very many items, sorting is not useful, because the sparse information
|
||||
# does not lead to much insight... but I might be wrong (a few tags with many values, and a long tail would
|
||||
# be the counter-example))
|
||||
# for lookups by issue (for detail pages i.e. _get_issue_tags)
|
||||
models.Index(fields=['issue', 'key', 'count']),
|
||||
]
|
||||
|
||||
|
||||
@@ -180,7 +176,8 @@ def store_tags(event, issue, tags):
|
||||
# tag_value, _ = TagValue.objects.get_or_create(project_id=event.project_id, key=tag_key, value=value)
|
||||
# EventTag.objects.get_or_create(project_id=event.project_id, value=tag_value, event=event,
|
||||
# defaults={'issue': issue})
|
||||
# IssueTag.objects.get_or_create(project_id=event.project_id, value=tag_value, issue=issue)
|
||||
# IssueTag.objects.get_or_create(
|
||||
# project_id=event.project_id, key_id=tag_value.key_id, value=tag_value, issue=issue)
|
||||
#
|
||||
# # the 0-case is implied here too, which avoids some further guards in the code below
|
||||
# return
|
||||
@@ -220,7 +217,12 @@ def store_tags(event, issue, tags):
|
||||
], ignore_conflicts=True)
|
||||
|
||||
IssueTag.objects.bulk_create([
|
||||
IssueTag(project_id=event.project_id, value=tag_value, issue=issue) for tag_value in tag_value_objects
|
||||
IssueTag(
|
||||
project_id=event.project_id,
|
||||
key_id=tag_value.key_id,
|
||||
value=tag_value,
|
||||
issue=issue,
|
||||
) for tag_value in tag_value_objects
|
||||
], ignore_conflicts=True)
|
||||
|
||||
# Re 'project_id': this is not needed here, because it's already implied by both tag_value_objects and issue.
|
||||
|
||||
Reference in New Issue
Block a user