Files
bugsink/events/retention.py
2024-07-02 09:03:17 +02:00

327 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import logging
from django.db.models import Q, Min, Max
from random import random
from datetime import timezone, datetime
from django.db.models.sql.compiler import SQLDeleteCompiler
from django.db import connection
from bugsink.moreiterutils import pairwise, map_N_until
from performance.context_managers import time_and_query_count
performance_logger = logging.getLogger("bugsink.performance.retention")
def get_epoch(datetime_obj):
    """Return the eviction epoch for a tz-aware UTC datetime: whole hours since 1970.

    Hours are the basic rhythm for eviction: granular enough for our purposes, and
    still understandable by actual humans (e.g. when debugging). Note that w.r.t. the
    outcome of the algorithm the epoch size only translates into a constant addition
    to the age-based irrelevance (switching to days would lower it by roughly 4), and
    the search for a total-irrelevance cut-off corrects for any constant anyway.
    """
    # Django stores its datetimes timezone-aware in UTC, so for model fields this
    # assertion 'just works'.
    assert datetime_obj.tzinfo == timezone.utc

    seconds_since_1970 = datetime_obj.timestamp()
    return int(seconds_since_1970 / 3600)
def datetime_for_epoch(epoch):
    """Inverse of `get_epoch`: the tz-aware UTC datetime at the start of `epoch`."""
    seconds_since_1970 = epoch * 3600
    return datetime.fromtimestamp(seconds_since_1970, timezone.utc)
def get_epoch_bounds(lower, upper=None):
    """Build a Q constraining `server_side_timestamp` to the epoch range [lower, upper).

    Either bound may be None, meaning "unbounded on that side"; with both bounds None
    the returned Q matches everything.
    """
    constraints = {}
    if lower is not None:
        constraints["server_side_timestamp__gte"] = datetime_for_epoch(lower)
    if upper is not None:
        constraints["server_side_timestamp__lt"] = datetime_for_epoch(upper)
    return Q(**constraints)
def nonzero_leading_bits(n):
    """
    Return the non-roundness of a number when represented in binary, i.e. the number of
    leading bits until (and including) the last 1-bit; 0 for n == 0.

    examples:
        100000 -> 1
        101000 -> 3
        110001 -> 6
    """
    if n == 0:
        return 0

    # highest set bit position, minus the number of trailing zero bits
    trailing_zeros = (n & -n).bit_length() - 1
    return n.bit_length() - trailing_zeros
def get_random_irrelevance(event_count):
    """
    Get a fixed-at-creation irrelevance score for an Event.

    The basic idea: the more events an issue already has, the less relevant any new
    event will be _on average_; but with many events you will, on average, still have
    more relevant events than with few. The score is basically `nonzero_leading_bits`;
    randomization avoids repeated outcomes when the count "hovers" around a certain
    value (likely under repeated eviction/fill-up). The factor 2 simply corrects for
    random() returning .5 on average.
    """
    randomized_count = round(random() * event_count * 2)
    return nonzero_leading_bits(randomized_count)
def should_evict(project, timestamp, stored_event_count):
    """True when `project` stores strictly more events than its configured maximum.

    (strictly greater, because we only want to act when we are _over_ the max)
    """
    # if/when we implement 'just drop' this might go somewhere (maybe not here)
    # if (project.retention_last_eviction is not None and
    #         get_epoch(project.retention_last_eviction) != get_epoch(timestamp)):
    #     return True

    return stored_event_count > project.retention_max_event_count
def get_age_for_irrelevance(age_based_irrelevance):
    """Invert the age-based irrelevance, returning the 'age of eviction' in epochs.

    Age-based irrelevance is defined as `log(age + 1, 4)`; the `+ 1` makes 0-aged
    events come out at irrelevance 0. So, to work back from a given integer "budget"
    of irrelevance (the total max, minus the integer item-based irrelevance), we take
    `4 ** budget - 1` as the number of epochs to go back.

    The base of 4 was chosen after some experimentation/consideration, not science.
    For quota in the 10_000 - 1_000_000 range, `nonzero_leading_bits` yields event
    irrelevances of max 15 - 21 (observed 8 - 12 after eviction for the 10_000 case);
    with base 4 a 1-year-old event has age-based irrelevance of about 6.5
    (log(24 * 365, 4)), so even year-old events survive if they were "the most
    relevant ones". Put differently: going from a week to a month old (or from a bit
    over a day to a week) weighs like being one of twice as many events, which feels
    more correct than base 2's "step" at each doubling.

    at the integer values for irrelevance this works out like so:
        age = 0  => irrelevance = 0
        age = 1  => irrelevance = 0.5
        age = 2  => irrelevance = 0.792
        age = 3  => irrelevance = 1
        age = 15 => irrelevance = 2
    """
    return 4 ** age_based_irrelevance - 1
def get_epoch_bounds_with_irrelevance(project, current_timestamp, qs_kwargs=None):
    """Return a list of ((lower_bound, upper_bound), age_based_irrelevance) tuples.

    The bounds are epochs (None meaning "unbounded on that side"), covering everything
    from the project's oldest matching event up to `current_timestamp`; the
    age_based_irrelevance grows with age per `get_age_for_irrelevance`.

    qs_kwargs: extra filters for the oldest-event lookup; defaults to excluding
    `never_evict` events. (a None default is used instead of a dict literal to avoid
    the mutable-default-argument pitfall)
    """
    from .models import Event

    if qs_kwargs is None:
        qs_kwargs = {"never_evict": False}

    oldest = Event.objects.filter(project=project, **qs_kwargs).aggregate(val=Min('server_side_timestamp'))['val']
    first_epoch = get_epoch(oldest) if oldest is not None else get_epoch(current_timestamp)
    current_epoch = get_epoch(current_timestamp)

    difference = current_epoch - first_epoch

    # because we construct in reverse order (from the most recent to the oldest) we end up with the pairs swapped
    swapped_bounds = pairwise(
        [None] + [current_epoch - age for age in list(map_N_until(get_age_for_irrelevance, difference))] + [None])

    return [((lb, ub), age_based_irrelevance) for age_based_irrelevance, (ub, lb) in enumerate(swapped_bounds)]
def get_irrelevance_pairs(project, epoch_bounds_with_irrelevance, qs_kwargs=None):
    """Yield tuples of `age_based_irrelevance` and, per associated period, the max
    observed (evictable) event irrelevance.

    qs_kwargs: extra filters for the per-period lookups; defaults to excluding
    `never_evict` events. (a None default is used instead of a dict literal to avoid
    the mutable-default-argument pitfall)
    """
    from .models import Event

    if qs_kwargs is None:
        qs_kwargs = {"never_evict": False}

    for (lower_bound, upper_bound), age_based_irrelevance in epoch_bounds_with_irrelevance:
        d = Event.objects.filter(
            get_epoch_bounds(lower_bound, upper_bound),
            project=project,
            **qs_kwargs,
        ).aggregate(Max('irrelevance_for_retention'))
        # a period without matching events aggregates to None; treat that as 0
        max_event_irrelevance = d["irrelevance_for_retention__max"] or 0

        yield (age_based_irrelevance, max_event_irrelevance)
def filter_for_work(epoch_bounds_with_irrelevance, pairs, max_total_irrelevance):
    """Yield only the entries of `epoch_bounds_with_irrelevance` for which eviction can
    possibly do anything.

    We compare the irrelevances observed at the start of the algorithm (`pairs`) with
    the given total; when the sum does not exceed it, no event in that set of epochs
    can be evicted, so we skip it. Since this relies only on already-available info, it
    is not a full filter against useless deletions -- but it does use the info we paid
    queries for to the fullest.
    """
    return (
        ebwi
        for pair, ebwi in zip(pairs, epoch_bounds_with_irrelevance)
        # strictly greater, because only then will anything be evicted
        if sum(pair) > max_total_irrelevance
    )
def eviction_target(max_event_count, stored_event_count):
    """How many events a single eviction run should delete.

    We evict at least 500 events, or 5% of the max event count, whichever is less:
    eviction is relatively expensive, so we don't want to be evicting again right after
    we just did. For reasonable quota (10_000+) this means 500; for lower quota 5%
    avoids over-evicting (at a small performance penalty).

    The cap of 500 (together with `delete_with_limit`) keeps a single eviction from
    blocking too long -- both per query (timeouts) and for the eviction as a whole
    (other threads/processes would block and time out). On the slow VPS deletions were
    observed at 1-4ms per event, so 500 puts us around 2s, under 50% of the timeout
    value. Eviction triggers "a lot" of queries, but that's in the order of 25, which
    amortizes to far less than 1 extra query per event; and 5% is close enough to the
    limit to stem "why was so much deleted" questions.

    We also always evict at least the number of events we are over the max; this
    covers a quota that was adjusted downwards (get rid of the excess) and quota so
    ridiculously low that 5% rounds down to 0 (then at least delete 1).
    """
    five_percent_of_max = int(max_event_count * 0.05)
    amount_over_max = stored_event_count - max_event_count

    target = max(five_percent_of_max, amount_over_max)
    return min(target, 500)
def evict_for_max_events(project, timestamp, stored_event_count=None, include_never_evict=False):
    """Evict events from `project` until the eviction target (see `eviction_target`) is met.

    Works by step-wise lowering a cut-off on the total irrelevance (age-based plus
    per-item) and evicting everything above it, until enough events are gone. When
    eviction of regular events cannot reach the target, recurses once with
    include_never_evict=True as a last measure. Returns the final cut-off used.
    """
    from .models import Event

    # never_evict events are spared initially; see the 'never say never' fallback below
    qs_kwargs = {} if include_never_evict else {"never_evict": False}

    with time_and_query_count() as phase0:
        if stored_event_count is None:
            # allowed as a pass-in to save a query (we generally start off knowing this); +1 because call-before-add
            stored_event_count = Event.objects.filter(project=project).count() + 1

        epoch_bounds_with_irrelevance = get_epoch_bounds_with_irrelevance(project, timestamp, qs_kwargs)

        # we start off with the currently observed max total irrelevance
        pairs = list(get_irrelevance_pairs(project, epoch_bounds_with_irrelevance, qs_kwargs))
        max_total_irrelevance = orig_max_total_irrelevance = max(sum(pair) for pair in pairs)

    with time_and_query_count() as phase1:
        evicted = 0
        target = eviction_target(project.retention_max_event_count, stored_event_count)

        while evicted < target:
            # -1 at the beginning of the loop; this means the actually observed max value is precisely the first thing
            # that will be evicted (since `evict_for_irrelevance` will evict anything above (but not including) the
            # given value)
            max_total_irrelevance -= 1

            evicted += evict_for_irrelevance(
                project,
                max_total_irrelevance,
                list(filter_for_work(epoch_bounds_with_irrelevance, pairs, max_total_irrelevance)),
                include_never_evict,
                target - evicted,  # cap deletions to what's still needed to reach the target
            )

            if max_total_irrelevance < -1:  # < -1: as in `evict_for_irrelevance`
                if not include_never_evict:
                    # everything that remains is 'never_evict'. 'never say never' and evict those as a last measure
                    return evict_for_max_events(project, timestamp, stored_event_count - evicted, True)

                raise Exception("No more effective eviction possible but target not reached")  # "should not happen"

    # phase 0: SELECT statements to identify per-epoch observed irrelevances
    # phase 1: DELETE (evictions) and SELECT total count ("are we there yet?")
    performance_logger.info(
        "%6.2fms EVICT; down to %d, max irr. from %d to %d in %dms+%dms and %d+%d queries",
        phase0.took + phase1.took,
        stored_event_count - evicted - 1,  # down to: -1, because the +1 happens post-eviction
        orig_max_total_irrelevance, max_total_irrelevance, phase0.took, phase1.took, phase0.count, phase1.count)

    return max_total_irrelevance
def evict_for_irrelevance(
        project, max_total_irrelevance, epoch_bounds_with_irrelevance, include_never_evict=False, max_event_count=None):
    """Evict events whose total irrelevance exceeds `max_total_irrelevance`.

    The total is split per epoch-range into an age-based part and a per-item budget;
    for each range we evict items whose own irrelevance exceeds the remaining budget.

    max_total_irrelevance: the total may not exceed this (but it may equal it)
    max_event_count: optional cap on deletions, in anticipation of non-count (i.e.
        size-based) methods of eviction.

    Returns the number of evicted events.
    """
    total_evicted = 0

    for (_, upper_bound_exclusive), irrelevance_for_age in epoch_bounds_with_irrelevance:
        item_budget = max_total_irrelevance - irrelevance_for_age

        remaining_count = max_event_count - total_evicted if max_event_count is not None else None
        total_evicted += evict_for_epoch_and_irrelevance(
            project, upper_bound_exclusive, item_budget, remaining_count, include_never_evict)

        if item_budget <= -1:
            # the actual eviction tests item irrelevance _exclusively_, i.e. only items of
            # greater irrelevance are evicted, and the minimal occurring value is 0. Such
            # items are evicted by a call with budget -1; having just done one, we're done
            # for all (older) epochs as well.
            break

        if max_event_count is not None and total_evicted >= max_event_count:
            # target reached; stop early. Not everything above max_total_irrelevance has
            # been evicted then, and since epochs are visited in reverse order, older items
            # are more likely to be spared.
            break

    return total_evicted
def delete_with_limit(qs, limit):
    """Execute a DELETE for `qs` with a LIMIT of `limit`; return the number of deleted rows.

    Django does not support LIMIT in DELETE queries out of the box. Sqlite does in fact
    support it (whereas many other DBs do not), so we reach down into Django's internals
    to construct the statement ourselves.
    """
    base_sql, base_params = SQLDeleteCompiler(qs.query, connection, 'default').as_sql()

    with connection.cursor() as cursor:
        # the limit is passed as a query parameter, not interpolated into the SQL
        cursor.execute(base_sql + " LIMIT %s", base_params + (limit,))
        return cursor.rowcount
def evict_for_epoch_and_irrelevance(project, max_epoch, max_irrelevance, max_event_count, include_never_evict):
    """Evict events for `project` that are older than `max_epoch` and have
    `irrelevance_for_retention` greater than `max_irrelevance`.

    Both bounds are _exclusive_; `max_epoch` may be None (no age bound). When
    `max_event_count` is not None, at most that many events are deleted (LIMIT-ed
    delete). When `include_never_evict` is True, events marked `never_evict` are
    deleted too, which requires first clearing FKs pointing at them.

    Returns the number of deleted events.
    """
    from issues.models import TurningPoint
    from .models import Event

    # evicting "at", based on the total irrelevance split out into 2 parts: max item irrelevance, and an epoch as
    # implied by the age-based irrelevance.
    #
    # Note: we simply use a single age-based UB-check to delete; an alternative is to also use associated time-based-LB
    # for a given `irrelevance_for_age`; in practice it doesn't matter, because in the same `evict_for_irrelevance` call
    # the older epochs will be visited later with an even lower value for `max_irrelevance` which would delete the same.
    # But we might use this fact at some point in the future (e.g. for performance considerations, or to evict in
    # smaller steps).
    #
    # As a picture (time on X, irrelevance on the Y axis, lower rows have higher irrelevance as in the simulation):
    #
    # . . . . . . .
    #     B B     .
    # a a a a x x A
    #
    # As implemented, we evict the points marked `A`, `x` and `a` all in a single go. The alternative would be: `A` in
    # this call, and only when `B` is cleaned will the points `x` be cleaned. (as-is, they are part of the selection,
    # but will already have been deleted)
    qs_kwargs = {} if include_never_evict else {"never_evict": False}

    qs = Event.objects.filter(project=project, irrelevance_for_retention__gt=max_irrelevance, **qs_kwargs)

    if max_epoch is not None:
        qs = qs.filter(server_side_timestamp__lt=datetime_for_epoch(max_epoch))

    if include_never_evict:
        # we need to manually ensure that no FKs to the deleted items exist:
        TurningPoint.objects.filter(triggering_event__in=qs).update(triggering_event=None)

    if max_event_count is None:
        nr_of_deletions, _ = qs.delete()
    else:
        nr_of_deletions = delete_with_limit(qs, max_event_count)

    return nr_of_deletions