From 96e07c4dc37bb0eac0fe934459c7b3892d5e5d50 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Fri, 7 Mar 2025 11:46:12 +0100 Subject: [PATCH] Tags: delete EventTag when Events are evicted and document related things --- events/retention.py | 6 ++++++ tags/migrations/0001_initial.py | 9 +++------ tags/models.py | 14 +++++++++----- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/events/retention.py b/events/retention.py index 11b750d..eb6e9b3 100644 --- a/events/retention.py +++ b/events/retention.py @@ -272,6 +272,7 @@ def evict_for_irrelevance( def evict_for_epoch_and_irrelevance(project, max_epoch, max_irrelevance, max_event_count, include_never_evict): from issues.models import TurningPoint from .models import Event + from tags.models import EventTag # evicting "at", based on the total irrelevance split out into 2 parts: max item irrelevance, and an epoch as # implied by the age-based irrelevance. @@ -322,6 +323,11 @@ def evict_for_epoch_and_irrelevance(project, max_epoch, max_irrelevance, max_eve Event.objects.filter(pk__in=pks_to_delete).exclude(storage_backend=None) .values_list("id", "storage_backend") ) + + # Rather than rely on Django's implementation of CASCADE, we "just do this ourselves"; Reason is: Django does an + # extra, expensive (all-column), query on Event[...](id__in=pks_to_delete) to extract the Event ids (which we + # already have). + EventTag.objects.filter(event_id__in=pks_to_delete).delete() nr_of_deletions = Event.objects.filter(pk__in=pks_to_delete).delete()[1].get("events.Event", 0) else: nr_of_deletions = 0 diff --git a/tags/migrations/0001_initial.py b/tags/migrations/0001_initial.py index 2abaf19..7060004 100644 --- a/tags/migrations/0001_initial.py +++ b/tags/migrations/0001_initial.py @@ -1,5 +1,3 @@ -# Generated by Django 4.2.19 on 2025-03-03 10:28 - from django.db import migrations, models import django.db.models.deletion @@ -9,9 +7,9 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ("projects", "0011_fill_stored_event_count"), ("events", "0019_event_storage_backend"), - ("issues", "0010_issue_list_indexes"), + ("issues", "0012_alter_issue_calculated_type_and_more"), + ("projects", "0011_fill_stored_event_count"), ] operations = [ @@ -124,8 +122,7 @@ class Migration(migrations.Migration): ( "event", models.ForeignKey( - null=True, - on_delete=django.db.models.deletion.SET_NULL, + on_delete=django.db.models.deletion.DO_NOTHING, related_name="tags", to="events.event", ), diff --git a/tags/models.py b/tags/models.py index c148320..457f859 100644 --- a/tags/models.py +++ b/tags/models.py @@ -49,8 +49,6 @@ class TagKey(models.Model): class TagValue(models.Model): project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' - # CASCADE: TBD; one argument might be: decouple deletions from events ingestion, but at least have them internally - # consistent key = models.ForeignKey(TagKey, blank=False, null=False, on_delete=models.CASCADE) value = models.CharField(max_length=200, blank=False, null=False, db_index=True) @@ -68,12 +66,16 @@ class EventTag(models.Model): # value already implies key in our current setup. value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE) - event = models.ForeignKey('events.Event', blank=False, null=True, on_delete=models.SET_NULL, related_name='tags') + # DO_NOTHING: we manually implement CASCADE (i.e. when an event is cleaned up, clean up associated tags) in the + # eviction process. Why CASCADE? [1] you'll have to do it "at some point", so you might as well do it right when + # evicting (async in the 'most resilient setup' anyway, b/c that happens when ingesting) [2] the order of magnitude + # is "tens of deletions per event", so that's no reason to postpone. + event = models.ForeignKey('events.Event', blank=False, null=False, on_delete=models.DO_NOTHING, related_name='tags') class Meta: unique_together = ('value', 'event') indexes = [ - models.Index(fields=['event', 'value']), # make lookups by event (for details page) faster + models.Index(fields=['event', 'value']), # make lookups by event (for details page, deletions) faster ] @@ -87,15 +89,17 @@ class IssueTag(models.Model): # value already implies key in our current setup. value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE) + # SET_NULL: Issue deletion is not actually possible yet, so this is moot (for now). issue = models.ForeignKey('issues.Issue', blank=False, null=True, on_delete=models.SET_NULL, related_name='tags') - # As it stands, there is only a single counter per issue-tagvalue combination. In principle/theory this type of + # 1. As it stands, there is only a single counter per issue-tagvalue combination. In principle/theory this type of # denormalization will break down when you want to show this kind of information filtered by some other dimension, # because combining such slicings will lead to a combinatorial explosion. However, the only practical such dimension # we'll likely want to filter by is "environment". So once we'll actually offer that it's more likely that we'll # just implement a separate counter for that dimension, rather than try to do the generic thing ("snuba" at Sentry). # And in that case we can probably add the additional constraint that the number of practically observed # environments is some small number like 10 (even just because of UI constraints like dropdowns). + # 2. counts are "seen counts", i.e. when events are deleted, the counts are not updated. This is intentional. count = models.PositiveIntegerField(default=0) class Meta: