Files
bugsink/events/retention.py
Klaas van Schelven fe6c955465 never_evict events that are a Historic Turning Point
Both for technical (foreign keys) and business reasons (these are events you
care about)
2024-06-24 22:50:00 +02:00

277 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import logging
from django.db.models import Q, Min, Max
from random import random
from datetime import timezone, datetime
from performance.context_managers import time_and_query_count
performance_logger = logging.getLogger("bugsink.performance.retention")
def get_epoch(datetime_obj):
    """Return the eviction 'epoch' for `datetime_obj`: whole hours since 1970-01-01 UTC.

    The basic rhythm for eviction is 'hourly'; we define an 'epoch' for eviction as the number of hours since 1970.
    Why pick hours rather than something else? It's certainly granular enough for our purposes, and it makes it
    possible for actual humans to understand what's going on (e.g. when debugging). Note that w.r.t. the outcome of
    our algorithm, the choice of epoch size translates into a constant addition to the age-based irrelevance. (i.e.
    when switching to days the age-based irrelevance would come out approximately 4 lower. But this would be corrected
    in the search for a cut-off value for the total irrelevance, so it doesn't matter in the end.)

    Raises ValueError for datetimes that are not explicitly in UTC.
    """
    # assuming we use model fields this 'just works' because Django stores its stuff in timezone-aware UTC in the DB.
    # explicit `if`/`raise` rather than `assert`, so the guard also holds under `python -O`.
    if datetime_obj.tzinfo != timezone.utc:
        raise ValueError("get_epoch() requires a timezone-aware UTC datetime, got %r" % (datetime_obj,))
    return int(datetime_obj.timestamp() / 3600)
def datetime_for_epoch(epoch):
    """Inverse of `get_epoch`: the (UTC, timezone-aware) datetime at which hour-epoch `epoch` starts."""
    seconds_since_1970 = epoch * 3600
    return datetime.fromtimestamp(seconds_since_1970, timezone.utc)
def get_epoch_bounds(lower, upper=None):
    """Build a Q filtering `server_side_timestamp` to the half-open epoch range [lower, upper).

    Either bound may be None, meaning "unbounded on that side"; both None yields an empty filter.
    """
    conditions = Q()
    if lower is not None:
        conditions &= Q(server_side_timestamp__gte=datetime_for_epoch(lower))
    if upper is not None:
        conditions &= Q(server_side_timestamp__lt=datetime_for_epoch(upper))
    return conditions
def nonzero_leading_bits(n):
    """
    Return the non-roundness of a number when represented in binary, i.e. the number of leading bits until the last 1.
    examples:
    100000 -> 1
    101000 -> 3
    110001 -> 6
    """
    if n == 0:
        return 0
    # drop trailing zero bits; the bit-length of what remains spans exactly "leading bit up to the last 1"
    while n & 1 == 0:
        n >>= 1
    return n.bit_length()
def get_random_irrelevance(event_count):
    """Fixed-at-creation irrelevance score for an Event.

    Basic idea: the more events you already have for a certain issue, the less relevant any new event
    will be _on average_; but with many events you will, on average, still have more relevant events
    than with few events.

    The score is essentially `nonzero_leading_bits`; randomization is mixed in to avoid repeated
    identical outcomes when the count "hovers" around one value (which is likely under repeated
    eviction/fill-up). The ×2 simply corrects for random() returning .5 on average.
    """
    randomized_count = round(random() * event_count * 2)
    return nonzero_leading_bits(randomized_count)
def should_evict(project, timestamp, stored_event_count):
    """Whether eviction should run for `project` given the current stored event count.

    (If/when a 'just drop' mode is implemented, an epoch-changed check on
    `project.retention_last_eviction` vs `timestamp` might live here — or elsewhere.)
    """
    # strictly greater: only act when we are _over_ the max
    return stored_event_count > project.retention_max_event_count
def get_age_for_irrelevance(age_based_irrelevance):
    """Return the 'age of eviction' (number of epochs back) for an integer irrelevance budget.

    Age-based irrelevance is defined as `log(age + 1, 2)` — the +1 makes a 0-aged event have an
    age-based irrelevance of 0. At integer irrelevance values:

        age = 0 => irrelevance = 0
        age = 1 => irrelevance = 1
        age = 2 => irrelevance = 1.58
        age = 3 => irrelevance = 2
        ...
        age = 7 => irrelevance = 3

    Working back from an integer "budget" of irrelevance (what is left of the total max after the
    integer item-based irrelevance is subtracted), the inverse is simply `2^budget - 1`.
    """
    return 2 ** age_based_irrelevance - 1
def get_epoch_bounds_with_irrelevance(project, current_timestamp):
    """Return a list of ((lower_epoch, upper_epoch), age_based_irrelevance) covering all epochs.

    Bounds are half-open (lower inclusive, upper exclusive) with `None` meaning "unbounded" on
    either side; `age_based_irrelevance` is 0 for the most recent range and increases for older
    ranges. Only evictable (never_evict=False) events are considered when determining how far
    back the ranges must reach.
    """
    from .models import Event
    # the oldest evictable event determines the reach of the ranges; with no such event we fall
    # back to the current epoch (difference == 0, i.e. a single unbounded range).
    oldest = Event.objects.filter(project=project, never_evict=False).aggregate(val=Min('server_side_timestamp'))['val']
    first_epoch = get_epoch(oldest) if oldest is not None else get_epoch(current_timestamp)
    current_epoch = get_epoch(current_timestamp)
    difference = current_epoch - first_epoch
    # because we construct in reverse order (from the most recent to the oldest) we end up with the pairs swapped;
    # the enumerate/unpack in the list comprehension below swaps each (ub, lb) pair back into (lb, ub).
    swapped_bounds = pairwise(
        [None] + [current_epoch - n for n in list(map_N_until(get_age_for_irrelevance, difference))] + [None])
    return [((lb, ub), age_based_irrelevance) for age_based_irrelevance, (ub, lb) in enumerate(swapped_bounds)]
def get_irrelevance_pairs(project, epoch_bounds_with_irrelevance):
    """tuples of `age_based_irrelevance` and, per associated period, the max observed (evictable) event irrelevance"""
    from .models import Event
    for (lower, upper), age_based_irrelevance in epoch_bounds_with_irrelevance:
        observed = Event.objects.filter(
            get_epoch_bounds(lower, upper),
            project=project,
            never_evict=False,
        ).aggregate(max_irrelevance=Max('irrelevance_for_retention'))["max_irrelevance"]
        # an empty period has no observed irrelevance; treat that as 0
        yield (age_based_irrelevance, observed or 0)
def map_N_until(f, until, onemore=False):
    """Yield f(0), f(1), f(2), ... for as long as the result stays below `until`.

    With `onemore=True`, additionally yield the first result that reached/exceeded `until`.
    """
    n = 0
    while True:
        value = f(n)
        if value >= until:
            break
        yield value
        n += 1
    if onemore:
        yield value
def pairwise(it):
    """Yield consecutive overlapping pairs of `it`: (x0, x1), (x1, x2), ...

    Iterables with fewer than two items yield nothing.
    """
    iterator = iter(it)
    sentinel = object()
    previous = next(iterator, sentinel)
    if previous is sentinel:
        return
    for item in iterator:
        yield (previous, item)
        previous = item
def filter_for_work(epoch_bounds_with_irrelevance, pairs, max_total_irrelevance):
    """Yield only those epoch bounds for which eviction could actually delete something.

    We compare the irrelevance pairs selected at the beginning of the algorithm with the given
    total; when a pair's sum does not exceed it, nothing in that epoch-range would be evicted.
    Since this uses only the (already available) information gathered at the start of the algo,
    it is not a full filter against useless deletions — but it uses the query results we already
    paid for to the fullest.
    """
    # strictly greater: only then will anything be evicted for that range
    yield from (
        bounds_with_irrelevance
        for pair, bounds_with_irrelevance in zip(pairs, epoch_bounds_with_irrelevance)
        if sum(pair) > max_total_irrelevance
    )
def lowered_target(max_event_count):
    """Target to evict down to: 95% of the max event count (somewhat arbitrary, but a good start).

    Rationale: eviction is relatively expensive, so we want to avoid having to evict again
    immediately after we've just evicted. With e.g. a max of 10_000 (completely reasonable), at
    least 500 events are cleared per run; since a run costs "a lot" (perhaps 10s, certainly not
    500) of queries, amortized this is far less than one extra query per event (checking whether
    to evict is a separate problem).

    Why 95% rather than 90%: the eviction as implemented already 'overshoots' (it removes all
    items with irrelevance strictly greater than the cut-off), and we don't want to be doubly
    conservative. (Alternatively we could use a [maxed] constant value such as n - 500.)
    """
    return int(0.95 * max_event_count)
def evict_for_max_events(project, timestamp, stored_event_count=None):
    """Evict evictable events for `project` until the count drops to `lowered_target(...)`.

    Works by repeatedly lowering a cut-off on total (age-based + item-based) irrelevance and
    evicting everything strictly above it, re-counting after each pass. Returns the final
    `max_total_irrelevance` cut-off reached. Raises when no further eviction is possible but the
    target has not been reached (e.g. when enough events are marked never_evict).
    """
    from .models import Event
    with time_and_query_count() as phase0:
        if stored_event_count is None:
            # allowed as a pass-in to save a query (we generally start off knowing this); +1 because call-before-add
            stored_event_count = Event.objects.filter(project=project).count() + 1
        epoch_bounds_with_irrelevance = get_epoch_bounds_with_irrelevance(project, timestamp)
        # we start off with the currently observed max total irrelevance
        pairs = list(get_irrelevance_pairs(project, epoch_bounds_with_irrelevance))
        max_total_irrelevance = orig_max_total_irrelevance = max(sum(pair) for pair in pairs)
    with time_and_query_count() as phase1:
        while stored_event_count > lowered_target(project.retention_max_event_count):  # > as in `should_evict`
            # -1 at the beginning of the loop; this means the actually observed max value is precisely the first thing
            # that will be evicted (since `evict_for_irrelevance` will evict anything above (but not including) the
            # given value)
            max_total_irrelevance -= 1
            evict_for_irrelevance(
                project,
                max_total_irrelevance,
                list(filter_for_work(epoch_bounds_with_irrelevance, pairs, max_total_irrelevance)))
            # re-count after the pass; +1 mirrors the initial call-before-add computation above
            stored_event_count = Event.objects.filter(project=project).count() + 1
            if max_total_irrelevance < -1:  # < -1: as in `evict_for_irrelevance`
                # could still happen if there's max_size items that cannot be evicted at all
                raise Exception("No more effective eviction possible but target not reached")
    # phase 0: SELECT statements to identify per-epoch observed irrelevances
    # phase 1: DELETE (evictions) and SELECT total count ("are we there yet?")
    performance_logger.info(
        "%6.2fms EVICT; down to %d, max irr. from %d to %d in %dms+%dms and %d+%d queries",
        phase0.took + phase1.took, stored_event_count, orig_max_total_irrelevance, max_total_irrelevance, phase0.took,
        phase1.took, phase0.count, phase1.count)
    return max_total_irrelevance
def evict_for_irrelevance(project, max_total_irrelevance, epoch_bounds_with_irrelevance):
    """Evict all evictable events whose total irrelevance exceeds `max_total_irrelevance`.

    The total may equal, but not exceed, `max_total_irrelevance`. Per epoch-range the remaining
    item-irrelevance budget is the total minus that range's age-based irrelevance.
    """
    for (_, upper_bound_exclusive), age_based_irrelevance in epoch_bounds_with_irrelevance:
        item_irrelevance_budget = max_total_irrelevance - age_based_irrelevance
        evict_for_epoch_and_irrelevance(project, upper_bound_exclusive, item_irrelevance_budget)
        if item_irrelevance_budget <= -1:
            # the actual eviction tests item irrelevance _exclusively_ (only strictly greater is
            # evicted), and the minimal occurring value is 0; a call with budget -1 therefore
            # already removed every evictable event up to this bound, so older epochs need no visit.
            break
def evict_for_epoch_and_irrelevance(project, max_epoch, max_irrelevance):
    """Delete evictable events above `max_irrelevance` that are older than `max_epoch`.

    This evicts "at" a total irrelevance split into its two parts: a max item irrelevance, and an
    epoch as implied by the age-based irrelevance. Both `max_epoch` and `max_irrelevance` are
    _exclusive_; `max_epoch=None` means "no time bound".

    Note: we deliberately use only a single age-based upper-bound check; an alternative would be
    to also apply the time-based lower bound associated with a given `irrelevance_for_age`. In
    practice it doesn't matter: within the same `evict_for_irrelevance` call, older epochs are
    visited later with an even lower `max_irrelevance`, which would delete the same rows. We might
    exploit that in the future (performance, or evicting in smaller steps).

    As a picture (time on X, irrelevance on Y, lower rows more irrelevant, as in the simulation):

        .  .  .  .  .  .  .
              B  B        .
        a  a  a  a  x  x  A

    As implemented, the points `A`, `x` and `a` go in a single pass. The alternative would be `A`
    in this call, with `x` only cleaned when `B` is; as-is, `x` is part of the later selection but
    will already have been deleted.
    """
    from .models import Event
    criteria = Q(project=project, irrelevance_for_retention__gt=max_irrelevance, never_evict=False)
    if max_epoch is not None:
        criteria &= Q(server_side_timestamp__lt=datetime_for_epoch(max_epoch))
    Event.objects.filter(criteria).delete()