Grouping.grouping_key: hash it for the index

This commit is contained in:
Klaas van Schelven
2025-05-06 11:32:19 +02:00
parent d5a449020d
commit 49e6700d4a
6 changed files with 51 additions and 7 deletions

View File

@@ -1,3 +1,4 @@
import hashlib
import os
import logging
import io
@@ -270,7 +271,10 @@ class BaseIngestAPIView(View):
grouping_key = get_issue_grouper_for_data(event_data, calculated_type, calculated_value)
try:
grouping = Grouping.objects.get(project_id=event_metadata["project_id"], grouping_key=grouping_key)
grouping = Grouping.objects.get(
project_id=event_metadata["project_id"], grouping_key=grouping_key,
grouping_key_hash=hashlib.sha256(grouping_key.encode()).hexdigest())
issue = grouping.issue
issue_created = False
@@ -300,6 +304,7 @@ class BaseIngestAPIView(View):
grouping = Grouping.objects.create(
project_id=event_metadata["project_id"],
grouping_key=grouping_key,
grouping_key_hash=hashlib.sha256(grouping_key.encode()).hexdigest(),
issue=issue,
)

View File

@@ -1,3 +1,4 @@
import hashlib
from django.utils import timezone
from projects.models import Project
@@ -26,6 +27,7 @@ def get_or_create_issue(project=None, event_data=None):
grouping = Grouping.objects.create(
project=project,
grouping_key=grouping_key,
grouping_key_hash=hashlib.sha256(grouping_key.encode()).hexdigest(),
issue=issue,
)

View File

@@ -0,0 +1,17 @@
from django.db import migrations, models
class Migration(migrations.Migration):
    """Schema migration: add the ``grouping_key_hash`` column to ``Grouping``."""

    dependencies = [("issues", "0013_fix_issue_stored_event_counts")]

    operations = [
        migrations.AddField(
            model_name="grouping",
            name="grouping_key_hash",
            # The empty-string default is only a placeholder for rows that
            # already exist; preserve_default=False keeps it out of the
            # resulting field definition.
            field=models.CharField(max_length=64, default=""),
            preserve_default=False,
        ),
    ]

View File

@@ -0,0 +1,20 @@
import hashlib
from django.db import migrations
def set_grouping_hash(apps, schema_editor):
    """Backfill ``grouping_key_hash`` on every existing ``Grouping`` row.

    The hash is the hex-encoded SHA-256 digest of the row's ``grouping_key``
    (encoded with ``str.encode()``'s default encoding).

    Args:
        apps: app registry handed in by ``RunPython``; used to look up the
            ``Grouping`` model as it exists at this point in the migrations.
        schema_editor: unused; present because ``RunPython`` passes it.
    """
    Grouping = apps.get_model("issues", "Grouping")
    for grouping in Grouping.objects.all():
        grouping.grouping_key_hash = hashlib.sha256(grouping.grouping_key.encode()).hexdigest()
        # Only the hash changed: restrict the UPDATE to that column instead of
        # rewriting the whole row (including the unbounded grouping_key text).
        grouping.save(update_fields=["grouping_key_hash"])
class Migration(migrations.Migration):
    """Data migration: fill ``grouping_key_hash`` for pre-existing groupings."""

    dependencies = [
        ("issues", "0014_grouping_grouping_key_hash"),
    ]

    operations = [
        # Explicit no-op reverse so the migration can be unapplied: there is
        # nothing to undo on the data side, since the hash column itself is
        # removed when the preceding AddField migration is reversed.
        migrations.RunPython(set_grouping_hash, migrations.RunPython.noop),
    ]

View File

@@ -5,12 +5,12 @@ class Migration(migrations.Migration):
dependencies = [
("projects", "0011_fill_stored_event_count"),
("issues", "0013_fix_issue_stored_event_counts"),
("issues", "0015_set_grouping_hash"),
]
operations = [
migrations.AlterUniqueTogether(
name="grouping",
unique_together={("project", "grouping_key")},
unique_together={("project", "grouping_key_hash")},
),
]

View File

@@ -197,11 +197,11 @@ class Grouping(models.Model):
project = models.ForeignKey(
"projects.Project", blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later'
# NOTE: I don't want to have any principled maximum on the grouping key, nor do I want to prematurely optimize the
# lookup. If lookups are slow (even with an index), we _could_ examine whether manually hashing these values and
# matching on the hash helps.
grouping_key = models.TextField(blank=False, null=False)
# we hash the key to make it indexable on MySQL, see https://code.djangoproject.com/ticket/2495
grouping_key_hash = models.CharField(max_length=64, blank=False, null=False)
issue = models.ForeignKey("Issue", blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later'
def __str__(self):
@@ -211,7 +211,7 @@ class Grouping(models.Model):
unique_together = [
# principled: grouping _key_ is a _key_ for a reason (within a project). This also implies the main way of
# looking up groupings has an appropriate index.
("project", "grouping_key"),
("project", "grouping_key_hash"),
]