From a2f3ad900b41417d3a548b6f0f242364c340f66a Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Wed, 19 Mar 2025 15:23:31 +0100 Subject: [PATCH] eviction-target not reached handling changes this error has shown up for one of our users; I can't reproduce yet, but I can make it better: * log-don't-crash: not worth failing for this (drops the event, and also rolls back the transaction such that nothing is achieved regarding eviction) * provide more info on-error (various counts) NB: I've also changed the < into a <=, and combined it with a check on "loop not done". I _think_ they are functionally equivalent, and that the new version is simply more clear as well as slightly more efficient. In my understanding: the old version simply looped one more time before giving up (because it was < it needed one more iteration, and because there was no explicit check on 'loop done' that inefficiency was needed in the old formulation). I say "I think" because I don't have a test specific to the edge-case. --- events/retention.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/events/retention.py b/events/retention.py index ce3dd96..32eda4f 100644 --- a/events/retention.py +++ b/events/retention.py @@ -248,15 +248,23 @@ def evict_for_max_events(project, timestamp, stored_event_count=None, include_ne target - evicted, ) - if max_total_irrelevance < -1: # < -1: as in `evict_for_irrelevance` + if evicted < target and max_total_irrelevance <= -1: + # if max_total_irrelevance <= -1 the analogous check for max_total_irrelevance in evict_for_irrelevance + # will certainly have fired (item_irr <= total_irr) which means "no more can be achieved". + if not include_never_evict: # everything that remains is 'never_evict'. 'never say never' and evict those as a last measure return evicted + evict_for_max_events(project, timestamp, stored_event_count - evicted, True) - raise Exception("No more effective eviction possible but target not reached") # "should not happen" + # "should not happen", let's log it and break out of the loop + # the reason I can think of this happening is when stored_event_count is wrong (too high). + bugsink_logger.error( + "Failed to evict enough events; %d < %d (max %d, stored %d) including never_evict=%s", evicted, + target, project.retention_max_event_count, stored_event_count, include_never_evict) + break # phase 0: SELECT statements to identify per-epoch observed irrelevances - # phase 1: DELETE (evictions) and SELECT total count ("are we there yet?") + # phase 1: SELECT with LIMIT for deletions and DELETE themselves (evictions) performance_logger.info( "%6.2fms EVICT; down to %d, max irr. from %d to %d in %dms+%dms and %d+%d queries", phase0.took + phase1.took,