def lowered_target(max_event_count):
    """Return the event count to evict down to once `max_event_count` is hit.

    Each eviction removes 500 events or 5% of the max event count, whichever
    is less; the returned value is the quota minus that amount.

    Eviction is relatively expensive, so we avoid having to evict again right
    after we have just evicted. For the completely reasonable quota of 10_000
    events or more this simply means "make room for 500"; for lower quotas we
    take 5% to avoid evicting too much (at a small performance penalty).

    Although eviction triggers "a lot" of queries, "a lot" is in the order of
    25, so after amortization this is far less than 1 query extra per event
    (as a result of the actual eviction; checking for the need to evict is a
    different story). A reason to pick 5% instead of 10% is that eviction, as
    implemented, also has its own overshooting: it evicts all items with an
    irrelevance strictly greater than the given value, i.e. more than strictly
    necessary. We don't want to be "doubly conservative" in this regard.
    """
    # Target when evicting 5% of the quota (the right choice for small quotas).
    five_percent_target = int(max_event_count * 0.95)

    # Target when evicting a flat 500 events (the right choice for big quotas).
    flat_500_target = max_event_count - 500

    # Evicting the *lesser* amount means keeping the *greater* target, hence
    # max() here. Using min(500, ...) would cap the target itself at 500 and
    # wipe out nearly all events for any realistic quota.
    return max(five_percent_target, flat_500_target)