From 3ee6f29f9c75372418051f7325247c139c6760c1 Mon Sep 17 00:00:00 2001
From: Klaas van Schelven <klaas@vanschelven.com>
Date: Fri, 7 Mar 2025 20:59:21 +0100
Subject: [PATCH] tags: fix the indexes

this is the part I was able to do with careful reading (and rerunning the
tests); actual performance implications will be checked based on this
---
 issues/models.py                |  2 +-
 tags/migrations/0001_initial.py | 51 ++++++++++++++-------------------
 tags/models.py                  | 48 ++++++++++++++++++++++---------
 tags/search.py                  |  4 ++-
 4 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/issues/models.py b/issues/models.py
index bbac9e0..b019bee 100644
--- a/issues/models.py
+++ b/issues/models.py
@@ -130,7 +130,7 @@ class Issue(models.Model):
 
         all_issue_tags = (
             IssueTag.objects
-            .filter(project_id=self.project_id, issue=self, value__key__mostly_unique=False)
+            .filter(issue=self, value__key__mostly_unique=False)  # note: project is implied through issue
             .order_by("value__key__key", "-count")
             .select_related("value", "value__key"))
 
diff --git a/tags/migrations/0001_initial.py b/tags/migrations/0001_initial.py
index 7060004..8529b89 100644
--- a/tags/migrations/0001_initial.py
+++ b/tags/migrations/0001_initial.py
@@ -7,9 +7,9 @@ class Migration(migrations.Migration):
     initial = True
 
     dependencies = [
-        ("events", "0019_event_storage_backend"),
         ("issues", "0012_alter_issue_calculated_type_and_more"),
         ("projects", "0011_fill_stored_event_count"),
+        ("events", "0019_event_storage_backend"),
     ]
 
     operations = [
@@ -36,6 +36,9 @@ class Migration(migrations.Migration):
                     ),
                 ),
             ],
+            options={
+                "unique_together": {("project", "key")},
+            },
         ),
         migrations.CreateModel(
             name="TagValue",
@@ -66,7 +69,7 @@ class Migration(migrations.Migration):
                 ),
             ],
             options={
-                "unique_together": {("project", "key", "value")},
+                "unique_together": {("key", "value")},
             },
         ),
         migrations.CreateModel(
@@ -106,6 +109,14 @@ class Migration(migrations.Migration):
                     ),
                 ),
             ],
+            options={
+                "indexes": [
+                    models.Index(
+                        fields=["issue"], name="tags_issuet_issue_i_266542_idx"
+                    )
+                ],
+                "unique_together": {("value", "issue")},
+            },
         ),
         migrations.CreateModel(
             name="EventTag",
@@ -142,33 +153,13 @@ class Migration(migrations.Migration):
                     ),
                 ),
             ],
-        ),
-        migrations.AddIndex(
-            model_name="tagkey",
-            index=models.Index(fields=["key"], name="tags_tagkey_key_5e8a5a_idx"),
-        ),
-        migrations.AlterUniqueTogether(
-            name="tagkey",
-            unique_together={("project", "key")},
-        ),
-        migrations.AddIndex(
-            model_name="issuetag",
-            index=models.Index(
-                fields=["issue", "value"], name="tags_issuet_issue_i_f06f20_idx"
-            ),
-        ),
-        migrations.AlterUniqueTogether(
-            name="issuetag",
-            unique_together={("value", "issue")},
-        ),
-        migrations.AddIndex(
-            model_name="eventtag",
-            index=models.Index(
-                fields=["event", "value"], name="tags_eventt_event_i_86e88e_idx"
-            ),
-        ),
-        migrations.AlterUniqueTogether(
-            name="eventtag",
-            unique_together={("value", "event")},
+            options={
+                "indexes": [
+                    models.Index(
+                        fields=["event"], name="tags_eventt_event_i_ac6453_idx"
+                    )
+                ],
+                "unique_together": {("value", "event")},
+            },
         ),
     ]
diff --git a/tags/models.py b/tags/models.py
index b56c186..f2f3670 100644
--- a/tags/models.py
+++ b/tags/models.py
@@ -24,6 +24,16 @@ from django.db.models import Q, F
 from projects.models import Project
 from tags.utils import deduce_tags, is_mostly_unique
 
+# Notes on .project as it lives on TagValue, IssueTag and EventTag:
+# In all cases, project could be derived through other means: for TagValue it's implied by TagKey.project; for IssueTag
+# it's implied by TagValue (directly, or transitively through TagKey) as well as Issue; likewise for EventTag (but
+# through Event there).
+# In our current setup, we also always reach these models through queries through one of the models that have .project
+# already. Thus, a good case could be made to drop it from the model; but that more or less ties us to the current
+# implementation, and I'm not quite willing to do that yet. Hence .project everywhere.
+# Once we cleanup-by-project through a filter on project on the model-to-be deleted directly, we will need and
+# additional index on 'project' for these models, i.e. models.Index(fields=['project'])
+
 
 class TagKey(models.Model):
     project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'
@@ -39,12 +49,8 @@ class TagKey(models.Model):
     # is_deduced = models.BooleanField(default=False)
 
     class Meta:
+        # the obvious constraint, which doubles as a lookup index for store_tags and search.
         unique_together = ('project', 'key')
-        indexes = [
-            # Untested assumption: when searching, we search by TagValue[..]key__key=key, so we need an
-            # index-not-prefixed-by-project too
-            models.Index(fields=['key']),
-        ]
 
 
 class TagValue(models.Model):
@@ -53,8 +59,10 @@ class TagValue(models.Model):
     value = models.CharField(max_length=200, blank=False, null=False, db_index=True)
 
     class Meta:
-        # matches what we do in search
-        unique_together = ('project', 'key', 'value')
+        # This is the obvious constraint, which doubles as a lookup index for
+        # * store_tags (where we look up by both)
+        # * search (where we join on key)
+        unique_together = ('key', 'value')
 
     def __str__(self):
         return f"{self.key.key}:{self.value}"
@@ -73,9 +81,11 @@ class EventTag(models.Model):
     event = models.ForeignKey('events.Event', blank=False, null=False, on_delete=models.DO_NOTHING, related_name='tags')
 
     class Meta:
+        # This is the obvious constraint, which doubles as a lookup index for search (a big OR-match on value).
         unique_together = ('value', 'event')
+
         indexes = [
-            models.Index(fields=['event', 'value']),  # make lookups by event (for details page, deletions) faster
+            models.Index(fields=['event']),  # for lookups by event (for event-details page, event-deletions)
         ]
 
 
@@ -103,12 +113,18 @@ class IssueTag(models.Model):
     count = models.PositiveIntegerField(default=0)
 
     class Meta:
-        # searching: by value, then Issue (is this so though? .qs is already on the other!)
+        # This is the obvious constraint, which doubles as a lookup index for:
+        # * search (a big OR-match on value).
+        # * store_tags (counter update filters on both of these)
         unique_together = ('value', 'issue')
+
         indexes = [
-            models.Index(fields=['issue', 'value']),  # make lookups by issue (for detail pages) faster
-            # a point _could_ be made for ['issue', 'value' 'count'] we'll probably order by the latter in the end.
-            # but I suspect that in practice the sorting on by count can be done easily for a small number of items
+            # TODO I once had "value" here but I'd rather try without first, i.e. wouldn't the DB you basically "look up
+            # all for issue, filter withouth the use of indexes?
+            models.Index(fields=['issue']),  # for lookups by issue (for detail pages i.e. _get_issue_tags)
+
+            # A point _could_ be made for ['issue', '?value?' 'count'] since we order on that as part of _get_issue_tags
+            # However, I suspect that in practice the sorting on by count is done fast for a small number of items
             # (and I suspect that if there are very many items, sorting is not useful, because the sparse information
             # does not lead to much insight... but I might be wrong (a few tags with many values, and a long tail would
             # be the counter-example))
@@ -169,6 +185,8 @@ def store_tags(event, issue, tags):
     # update_conflicts parameter disable setting the primary key on each model instance (if the database normally
     # support it)." in Django 4.2; in Django 5.0 and up this is no longer so for `update_conflicts`, so we could use
     # that instead and save a query.
+    # Re 'project_id': in the selection-queries below the non-necessity of the project_id is mentioned a number of
+    # times, but that's precisly _because_ we encode the link to project in TagKay. i.e. here it is needed.
     tag_key_objects = TagKey.objects.filter(project_id=event.project_id, key__in=tags.keys())
 
     TagValue.objects.bulk_create([
@@ -177,8 +195,9 @@ def store_tags(event, issue, tags):
 
     # Select-back what we just created (or was already there); see above; the resulting SQL is a bit more complex than
     # the previous one though, which raises the question whether this is performant.
+    # Re 'project_id': this is not needed here, because it's already implied by key_obj.
     tag_value_objects = TagValue.objects.filter(_or_join([
-        Q(project_id=event.project_id, key=key_obj, value=tags[key_obj.key]) for key_obj in tag_key_objects]))
+        Q(key=key_obj, value=tags[key_obj.key]) for key_obj in tag_key_objects]))
 
     EventTag.objects.bulk_create([
         EventTag(project_id=event.project_id, value=tag_value, event=event) for tag_value in tag_value_objects
@@ -188,6 +207,7 @@ def store_tags(event, issue, tags):
         IssueTag(project_id=event.project_id, value=tag_value, issue=issue) for tag_value in tag_value_objects
     ], ignore_conflicts=True)
 
-    IssueTag.objects.filter(project_id=event.project_id, value__in=tag_value_objects, issue=issue).update(
+    # Re 'project_id': this is not needed here, because it's already implied by both tag_value_objects and issue.
+    IssueTag.objects.filter(value__in=tag_value_objects, issue=issue).update(
         count=F('count') + 1
     )
diff --git a/tags/search.py b/tags/search.py
index a0696bd..6e82f0b 100644
--- a/tags/search.py
+++ b/tags/search.py
@@ -72,7 +72,9 @@ def _search(TagClz, fk_fieldname, project, obj_list, q):
     clauses = []
     for key, value in parsed.tags.items():
         try:
-            tag_value_obj = TagValue.objects.get(project=project, key__key=key, value=value)
+            # Since we have project as a field on TagValue, we _could_ filter on project directly; with our current set
+            # of indexes the below formulation is a nice way to reuse the index on TagKey (project, key) though.
+            tag_value_obj = TagValue.objects.get(key__project=project, key__key=key, value=value)
         except TagValue.DoesNotExist:
             # if the tag doesn't exist, we can't have any issues with it; the below short-circuit is fine, I think (I
             # mean: we _could_ say "tag x is to blame" but that's not what one does generally in search, is it?