From 3ee6f29f9c75372418051f7325247c139c6760c1 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Fri, 7 Mar 2025 20:59:21 +0100 Subject: [PATCH] tags: fix the indexes this is the part I was able to do with careful reading (and rerunning the tests); actual performance implications will be checked based on this --- issues/models.py | 2 +- tags/migrations/0001_initial.py | 51 ++++++++++++++------------------- tags/models.py | 48 ++++++++++++++++++++++--------- tags/search.py | 4 ++- 4 files changed, 59 insertions(+), 46 deletions(-) diff --git a/issues/models.py b/issues/models.py index bbac9e0..b019bee 100644 --- a/issues/models.py +++ b/issues/models.py @@ -130,7 +130,7 @@ class Issue(models.Model): all_issue_tags = ( IssueTag.objects - .filter(project_id=self.project_id, issue=self, value__key__mostly_unique=False) + .filter(issue=self, value__key__mostly_unique=False) # note: project is implied through issue .order_by("value__key__key", "-count") .select_related("value", "value__key")) diff --git a/tags/migrations/0001_initial.py b/tags/migrations/0001_initial.py index 7060004..8529b89 100644 --- a/tags/migrations/0001_initial.py +++ b/tags/migrations/0001_initial.py @@ -7,9 +7,9 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ("events", "0019_event_storage_backend"), ("issues", "0012_alter_issue_calculated_type_and_more"), ("projects", "0011_fill_stored_event_count"), + ("events", "0019_event_storage_backend"), ] operations = [ @@ -36,6 +36,9 @@ class Migration(migrations.Migration): ), ), ], + options={ + "unique_together": {("project", "key")}, + }, ), migrations.CreateModel( name="TagValue", @@ -66,7 +69,7 @@ class Migration(migrations.Migration): ), ], options={ - "unique_together": {("project", "key", "value")}, + "unique_together": {("key", "value")}, }, ), migrations.CreateModel( @@ -106,6 +109,14 @@ class Migration(migrations.Migration): ), ), ], + options={ + "indexes": [ + models.Index( + fields=["issue"], name="tags_issuet_issue_i_266542_idx" + ) + ], + "unique_together": {("value", "issue")}, + }, ), migrations.CreateModel( name="EventTag", @@ -142,33 +153,13 @@ class Migration(migrations.Migration): ), ), ], - ), - migrations.AddIndex( - model_name="tagkey", - index=models.Index(fields=["key"], name="tags_tagkey_key_5e8a5a_idx"), - ), - migrations.AlterUniqueTogether( - name="tagkey", - unique_together={("project", "key")}, - ), - migrations.AddIndex( - model_name="issuetag", - index=models.Index( - fields=["issue", "value"], name="tags_issuet_issue_i_f06f20_idx" - ), - ), - migrations.AlterUniqueTogether( - name="issuetag", - unique_together={("value", "issue")}, - ), - migrations.AddIndex( - model_name="eventtag", - index=models.Index( - fields=["event", "value"], name="tags_eventt_event_i_86e88e_idx" - ), - ), - migrations.AlterUniqueTogether( - name="eventtag", - unique_together={("value", "event")}, + options={ + "indexes": [ + models.Index( + fields=["event"], name="tags_eventt_event_i_ac6453_idx" + ) + ], + "unique_together": {("value", "event")}, + }, ), ] diff --git a/tags/models.py b/tags/models.py index b56c186..f2f3670 100644 --- a/tags/models.py +++ b/tags/models.py @@ -24,6 +24,16 @@ from django.db.models import Q, F from projects.models import Project from tags.utils import deduce_tags, is_mostly_unique +# Notes on .project as it lives on TagValue, IssueTag and EventTag: +# In all cases, project could be derived through other means: for TagValue it's implied by TagKey.project; for IssueTag +# it's implied by TagValue (directly, or transitively through TagKey) as well as Issue; likewise for EventTag (but +# through Event there). +# In our current setup, we also always reach these models through queries through one of the models that have .project +# already. Thus, a good case could be made to drop it from the model; but that more or less ties us to the current +# implementation, and I'm not quite willing to do that yet. Hence .project everywhere. +# Once we cleanup-by-project through a filter on project on the model-to-be deleted directly, we will need and +# additional index on 'project' for these models, i.e. models.Index(fields=['project']) + class TagKey(models.Model): project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' @@ -39,12 +49,8 @@ class TagKey(models.Model): # is_deduced = models.BooleanField(default=False) class Meta: + # the obvious constraint, which doubles as a lookup index for store_tags and search. unique_together = ('project', 'key') - indexes = [ - # Untested assumption: when searching, we search by TagValue[..]key__key=key, so we need an - # index-not-prefixed-by-project too - models.Index(fields=['key']), - ] class TagValue(models.Model): @@ -53,8 +59,10 @@ class TagValue(models.Model): value = models.CharField(max_length=200, blank=False, null=False, db_index=True) class Meta: - # matches what we do in search - unique_together = ('project', 'key', 'value') + # This is the obvious constraint, which doubles as a lookup index for + # * store_tags (where we look up by both) + # * search (where we join on key) + unique_together = ('key', 'value') def __str__(self): return f"{self.key.key}:{self.value}" @@ -73,9 +81,11 @@ class EventTag(models.Model): event = models.ForeignKey('events.Event', blank=False, null=False, on_delete=models.DO_NOTHING, related_name='tags') class Meta: + # This is the obvious constraint, which doubles as a lookup index for search (a big OR-match on value). unique_together = ('value', 'event') + indexes = [ - models.Index(fields=['event', 'value']), # make lookups by event (for details page, deletions) faster + models.Index(fields=['event']), # for lookups by event (for event-details page, event-deletions) ] @@ -103,12 +113,18 @@ class IssueTag(models.Model): count = models.PositiveIntegerField(default=0) class Meta: - # searching: by value, then Issue (is this so though? .qs is already on the other!) + # This is the obvious constraint, which doubles as a lookup index for: + # * search (a big OR-match on value). + # * store_tags (counter update filters on both of these) unique_together = ('value', 'issue') + indexes = [ - models.Index(fields=['issue', 'value']), # make lookups by issue (for detail pages) faster - # a point _could_ be made for ['issue', 'value' 'count'] we'll probably order by the latter in the end. - # but I suspect that in practice the sorting on by count can be done easily for a small number of items + # TODO I once had "value" here but I'd rather try without first, i.e. wouldn't the DB you basically "look up + # all for issue, filter withouth the use of indexes? + models.Index(fields=['issue']), # for lookups by issue (for detail pages i.e. _get_issue_tags) + + # A point _could_ be made for ['issue', '?value?' 'count'] since we order on that as part of _get_issue_tags + # However, I suspect that in practice the sorting on by count is done fast for a small number of items # (and I suspect that if there are very many items, sorting is not useful, because the sparse information # does not lead to much insight... but I might be wrong (a few tags with many values, and a long tail would # be the counter-example)) @@ -169,6 +185,8 @@ def store_tags(event, issue, tags): # update_conflicts parameter disable setting the primary key on each model instance (if the database normally # support it)." in Django 4.2; in Django 5.0 and up this is no longer so for `update_conflicts`, so we could use # that instead and save a query. + # Re 'project_id': in the selection-queries below the non-necessity of the project_id is mentioned a number of + # times, but that's precisly _because_ we encode the link to project in TagKay. i.e. here it is needed. tag_key_objects = TagKey.objects.filter(project_id=event.project_id, key__in=tags.keys()) TagValue.objects.bulk_create([ @@ -177,8 +195,9 @@ def store_tags(event, issue, tags): # Select-back what we just created (or was already there); see above; the resulting SQL is a bit more complex than # the previous one though, which raises the question whether this is performant. + # Re 'project_id': this is not needed here, because it's already implied by key_obj. tag_value_objects = TagValue.objects.filter(_or_join([ - Q(project_id=event.project_id, key=key_obj, value=tags[key_obj.key]) for key_obj in tag_key_objects])) + Q(key=key_obj, value=tags[key_obj.key]) for key_obj in tag_key_objects])) EventTag.objects.bulk_create([ EventTag(project_id=event.project_id, value=tag_value, event=event) for tag_value in tag_value_objects @@ -188,6 +207,7 @@ def store_tags(event, issue, tags): IssueTag(project_id=event.project_id, value=tag_value, issue=issue) for tag_value in tag_value_objects ], ignore_conflicts=True) - IssueTag.objects.filter(project_id=event.project_id, value__in=tag_value_objects, issue=issue).update( + # Re 'project_id': this is not needed here, because it's already implied by both tag_value_objects and issue. + IssueTag.objects.filter(value__in=tag_value_objects, issue=issue).update( count=F('count') + 1 ) diff --git a/tags/search.py b/tags/search.py index a0696bd..6e82f0b 100644 --- a/tags/search.py +++ b/tags/search.py @@ -72,7 +72,9 @@ def _search(TagClz, fk_fieldname, project, obj_list, q): clauses = [] for key, value in parsed.tags.items(): try: - tag_value_obj = TagValue.objects.get(project=project, key__key=key, value=value) + # Since we have project as a field on TagValue, we _could_ filter on project directly; with our current set + # of indexes the below formulation is a nice way to reuse the index on TagKey (project, key) though. + tag_value_obj = TagValue.objects.get(key__project=project, key__key=key, value=value) except TagValue.DoesNotExist: # if the tag doesn't exist, we can't have any issues with it; the below short-circuit is fine, I think (I # mean: we _could_ say "tag x is to blame" but that's not what one does generally in search, is it?