From 49e6700d4a8101d62fd344dd08a7c1b456ba4a50 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Tue, 6 May 2025 11:32:19 +0200 Subject: [PATCH] Grouping.grouping_key: hash it for the index --- ingest/views.py | 7 ++++++- issues/factories.py | 2 ++ .../0014_grouping_grouping_key_hash.py | 17 ++++++++++++++++ issues/migrations/0015_set_grouping_hash.py | 20 +++++++++++++++++++ ...=> 0016_alter_grouping_unique_together.py} | 4 ++-- issues/models.py | 8 ++++---- 6 files changed, 51 insertions(+), 7 deletions(-) create mode 100644 issues/migrations/0014_grouping_grouping_key_hash.py create mode 100644 issues/migrations/0015_set_grouping_hash.py rename issues/migrations/{0014_alter_grouping_unique_together.py => 0016_alter_grouping_unique_together.py} (69%) diff --git a/ingest/views.py b/ingest/views.py index 1f4562b..e45923c 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -1,3 +1,4 @@ +import hashlib import os import logging import io @@ -270,7 +271,10 @@ class BaseIngestAPIView(View): grouping_key = get_issue_grouper_for_data(event_data, calculated_type, calculated_value) try: - grouping = Grouping.objects.get(project_id=event_metadata["project_id"], grouping_key=grouping_key) + grouping = Grouping.objects.get( + project_id=event_metadata["project_id"], grouping_key=grouping_key, + grouping_key_hash=hashlib.sha256(grouping_key.encode()).hexdigest()) + issue = grouping.issue issue_created = False @@ -300,6 +304,7 @@ class BaseIngestAPIView(View): grouping = Grouping.objects.create( project_id=event_metadata["project_id"], grouping_key=grouping_key, + grouping_key_hash=hashlib.sha256(grouping_key.encode()).hexdigest(), issue=issue, ) diff --git a/issues/factories.py b/issues/factories.py index d04d1bc..1438ec2 100644 --- a/issues/factories.py +++ b/issues/factories.py @@ -1,3 +1,4 @@ +import hashlib from django.utils import timezone from projects.models import Project @@ -26,6 +27,7 @@ def get_or_create_issue(project=None, event_data=None): grouping = Grouping.objects.create( project=project, grouping_key=grouping_key, + grouping_key_hash=hashlib.sha256(grouping_key.encode()).hexdigest(), issue=issue, ) diff --git a/issues/migrations/0014_grouping_grouping_key_hash.py b/issues/migrations/0014_grouping_grouping_key_hash.py new file mode 100644 index 0000000..a9638a5 --- /dev/null +++ b/issues/migrations/0014_grouping_grouping_key_hash.py @@ -0,0 +1,17 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("issues", "0013_fix_issue_stored_event_counts"), + ] + + operations = [ + migrations.AddField( + model_name="grouping", + name="grouping_key_hash", + field=models.CharField(default="", max_length=64), + preserve_default=False, + ), + ] diff --git a/issues/migrations/0015_set_grouping_hash.py b/issues/migrations/0015_set_grouping_hash.py new file mode 100644 index 0000000..326f4f5 --- /dev/null +++ b/issues/migrations/0015_set_grouping_hash.py @@ -0,0 +1,20 @@ +import hashlib +from django.db import migrations + + +def set_grouping_hash(apps, schema_editor): + Grouping = apps.get_model("issues", "Grouping") + for grouping in Grouping.objects.all(): + grouping.grouping_key_hash = hashlib.sha256(grouping.grouping_key.encode()).hexdigest() + grouping.save() + + +class Migration(migrations.Migration): + + dependencies = [ + ("issues", "0014_grouping_grouping_key_hash"), + ] + + operations = [ + migrations.RunPython(set_grouping_hash), + ] diff --git a/issues/migrations/0014_alter_grouping_unique_together.py b/issues/migrations/0016_alter_grouping_unique_together.py similarity index 69% rename from issues/migrations/0014_alter_grouping_unique_together.py rename to issues/migrations/0016_alter_grouping_unique_together.py index 9ffcbbe..19eb884 100644 --- a/issues/migrations/0014_alter_grouping_unique_together.py +++ b/issues/migrations/0016_alter_grouping_unique_together.py @@ -5,12 +5,12 @@ class Migration(migrations.Migration): dependencies = [ ("projects", "0011_fill_stored_event_count"), - ("issues", "0013_fix_issue_stored_event_counts"), + ("issues", "0015_set_grouping_hash"), ] operations = [ migrations.AlterUniqueTogether( name="grouping", - unique_together={("project", "grouping_key")}, + unique_together={("project", "grouping_key_hash")}, ), ] diff --git a/issues/models.py b/issues/models.py index ac8daba..6fdf3bf 100644 --- a/issues/models.py +++ b/issues/models.py @@ -197,11 +197,11 @@ class Grouping(models.Model): project = models.ForeignKey( "projects.Project", blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' - # NOTE: I don't want to have any principled maximum on the grouping key, nor do I want to prematurely optimize the - # lookup. If lookups are slow (even with an index), we _could_ examine whether manually hashing these values and - # matching on the hash helps. grouping_key = models.TextField(blank=False, null=False) + # we hash the key to make it indexable on MySQL, see https://code.djangoproject.com/ticket/2495 + grouping_key_hash = models.CharField(max_length=64, blank=False, null=False) + issue = models.ForeignKey("Issue", blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' def __str__(self): @@ -211,7 +211,7 @@ class Grouping(models.Model): unique_together = [ # principled: grouping _key_ is a _key_ for a reason (within a project). This also implies the main way of # looking up groupings has an appropriate index. - ("project", "grouping_key"), + ("project", "grouping_key_hash"), ]