From 1b7865d3b9f3cce291b1b7576fb3ee6368bc39db Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 19 Mar 2025 11:56:55 +0100 Subject: [PATCH] Eviction: Tests and rewrite-for-understanding of epoch_bounds_with_irrelevance --- events/retention.py | 28 +++++++++++++++++++++----- events/tests.py | 49 ++++++++++++++++++++++++++++++++++++++++++--- ingest/tests.py | 2 +- 3 files changed, 70 insertions(+), 9 deletions(-) diff --git a/events/retention.py b/events/retention.py index 5a8f73a..075798c 100644 --- a/events/retention.py +++ b/events/retention.py @@ -122,6 +122,8 @@ def get_age_for_irrelevance(age_based_irrelevance): def get_epoch_bounds_with_irrelevance(project, current_timestamp, qs_kwargs={"never_evict": False}): + """Returns the epoch bounds for the project (newest first), with the age-based irrelevance for each epoch.""" + from .models import Event oldest = Event.objects.filter(project=project, **qs_kwargs).aggregate(val=Min('digested_at'))['val'] @@ -131,11 +133,15 @@ def get_epoch_bounds_with_irrelevance(project, current_timestamp, qs_kwargs={"ne difference = current_epoch - first_epoch - # because we construct in reverse order (from the most recent to the oldest) we end up with the pairs swapped - swapped_bounds = pairwise( - [None] + [current_epoch - age for age in list(map_N_until(get_age_for_irrelevance, difference))] + [None]) + ages = list(map_N_until(get_age_for_irrelevance, difference)) # e.g. [0, 3, 15] + epochs = [current_epoch - age for age in ages] # e.g. [100, 97, 85] + swapped_bounds = pairwise([None] + epochs + [None]) # e.g. [(None, 100), (100, 97), (97, 85), (85, None)] - return [((lb, ub), age_based_irrelevance) for age_based_irrelevance, (ub, lb) in enumerate(swapped_bounds)] + # because ages and epochs are constructed new-to-old, we get (newer, older) pairs but we need the reverse + bounds = [(older, newer) for (newer, older) in swapped_bounds] # e.g [(100, None), (97, 100), ...] + + # age based irrelevance is simply "counting back" now; e.g. [((100, None), 0), ((97, 100), 1), ...] + return [((lb, ub), age_based_irrelevance) for (age_based_irrelevance, (lb, ub)) in enumerate(bounds)] def get_irrelevance_pairs(project, epoch_bounds_with_irrelevance, qs_kwargs={"never_evict": False}): @@ -196,6 +202,18 @@ def eviction_target(max_event_count, stored_event_count): def evict_for_max_events(project, timestamp, stored_event_count=None, include_never_evict=False): + # This function evicts, with a number-based target of events in mind. It does so by repeatedly calling + # evict_for_irrelevance (i.e. irrelevance-based target), lowering the target irrelevance until the target number of + # events has been evicted. + # + # Before any actual work is done, we map out the terrain: + # + # * epoch_bounds_with_irrelevance: pairs of epoch bounds with their age_based_irrelevance + # * pairs: adds to that the observed max irrelevance for each epoch + # + # epoch bounds are passed to the irrelevance-based eviction; pairs is used to determine the starting max irrelevance + # and in an optimization (skipping epochs where it's obvious no work is needed). + from .models import Event qs_kwargs = {} if include_never_evict else {"never_evict": False} @@ -309,7 +327,7 @@ def evict_for_epoch_and_irrelevance(project, max_epoch, max_irrelevance, max_eve # We generate the list of events-to-delete (including the LIMIT) before proceeding; this allows us: # A. to have a portable delete_with_limit (e.g. Django does not support that, nor does Postgres). - # B. to apply both deletion and cleanup_events_on_storage() on the same list. + # B. to apply deletions of Events and their consequences (cleanup_events_on_storage(), EventTag) on the same list. # # Implementation notes: # 1. We force evaluation here with a `list()`; this means subsequent usages do _not_ try to "just use an inner diff --git a/events/tests.py b/events/tests.py index cd4fc78..5181624 100644 --- a/events/tests.py +++ b/events/tests.py @@ -13,7 +13,7 @@ from issues.models import Issue from issues.factories import denormalized_issue_fields from .factories import create_event -from .retention import eviction_target, should_evict, evict_for_max_events +from .retention import eviction_target, should_evict, evict_for_max_events, get_epoch_bounds_with_irrelevance from .utils import annotate_with_meta User = get_user_model() @@ -101,8 +101,51 @@ class RetentionUtilsTestCase(RegularTestCase): class RetentionTestCase(DjangoTestCase): - # test-second TestCase for retention/eviction. "At least have something that touches the main parts of the code to - # avoid the most obvious kinds of breakages". Not a full test of all features/code paths (yet). + + def test_epoch_bounds_with_irrelevance_empty_project(self): + project = Project.objects.create() + current_timestamp = datetime.datetime(2022, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + + bounds = get_epoch_bounds_with_irrelevance(project, current_timestamp) + + self.assertEqual([((None, None), 0)], bounds) + + def test_epoch_bounds_with_irrelevance_single_current_event(self): + project = Project.objects.create() + current_timestamp = datetime.datetime(2022, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + create_event(project, timestamp=current_timestamp) + + bounds = get_epoch_bounds_with_irrelevance(project, current_timestamp) + + # all events are in the same (current) epoch, i.e. no explicit bounds, and no age-based irrelevance + self.assertEqual([((None, None), 0)], bounds) + + def test_epoch_bounds_with_irrelevance_single_hour_old_event(self): + project = Project.objects.create() + current_timestamp = datetime.datetime(2022, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + create_event(project, timestamp=current_timestamp - datetime.timedelta(hours=1)) + + bounds = get_epoch_bounds_with_irrelevance(project, current_timestamp) + + # with an observed event in the previous epoch, we get explicit bounds. + self.assertEqual([ + ((455832, None), 0), # present-till-future; no age-based irrelevance + ((None, 455832), 1) # all the past-till-present; age-based irrelevance 1 + ], bounds) + + def test_epoch_bounds_with_irrelevance_day_old_event(self): + project = Project.objects.create() + current_timestamp = datetime.datetime(2022, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + create_event(project, timestamp=current_timestamp - datetime.timedelta(days=1)) + + bounds = get_epoch_bounds_with_irrelevance(project, current_timestamp) + + self.assertEqual([ + ((455832, None), 0), + ((455829, 455832), 1), + ((455817, 455829), 2), + ((None, 455817), 3)], + bounds) def test_retention(self): # this test contains just the key bits of ingest/views.py such that we can test `evict_for_max_events` diff --git a/ingest/tests.py b/ingest/tests.py index ae06e55..f0688f9 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -408,7 +408,7 @@ class IngestViewTestCase(TransactionTestCase): def test_filestore(self): # quick & dirty way to test the filestore; in absence of a proper test for it, we just run a more-or-less # integration test with the FileEventStorage activated. This will at least show the absence of the most obvious - # errors. We then run + # errors. We then run eviction to make sure that the eviction removes the files as expected. with tempfile.TemporaryDirectory() as tempdir: with override_event_storages({"local_flat_files": { "STORAGE": "events.storage.FileEventStorage",