Add tool to generate insight into retention (and fix bugs that that insight revealed)

This commit is contained in:
Klaas van Schelven
2024-06-24 10:59:04 +02:00
parent 63afba020a
commit bdc6193214
2 changed files with 77 additions and 7 deletions

View File

@@ -22,12 +22,12 @@ def get_epoch_bounds(lower, upper=None):
return Q()
if lower is None:
return Q(timestamp__lt=datetime_for_epoch(upper))
return Q(server_side_timestamp__lt=datetime_for_epoch(upper))
if upper is None:
return Q(timestamp__gte=datetime_for_epoch(lower))
return Q(server_side_timestamp__gte=datetime_for_epoch(lower))
return Q(timestamp__gte=datetime_for_epoch(lower), timestamp__lt=datetime_for_epoch(upper))
return Q(server_side_timestamp__gte=datetime_for_epoch(lower), server_side_timestamp__lt=datetime_for_epoch(upper))
def nonzero_leading_bits(n):
@@ -188,6 +188,7 @@ def evict_for_max_events(project, timestamp, stored_event_count=None):
max_total_irrelevance -= 1
evict_for_irrelevance(
project,
max_total_irrelevance,
list(filter_for_work(epoch_bounds_with_irrelevance, pairs, max_total_irrelevance)))
@@ -199,10 +200,13 @@ def evict_for_max_events(project, timestamp, stored_event_count=None):
raise Exception("No more effective eviction possible but target not reached")
# print("Evicted down to %d with a max_total_irrelevance of %d" % (observed_size, max_total_irrelevance)) TODO log
for query in connection.queries[pre:]:
print(query['sql'])
print("Reached", stored_event_count, "events")
return max_total_irrelevance
def evict_for_irrelevance(max_total_irrelevance, epoch_bounds_with_irrelevance):
def evict_for_irrelevance(project, max_total_irrelevance, epoch_bounds_with_irrelevance):
# print("evict_for_irrelevance(%d, %s)" % (max_total_irrelevance, epoch_bounds_with_irrelevance))
# max_total_irrelevance, i.e. the total may not exceed this (but it may equal it)
@@ -210,7 +214,7 @@ def evict_for_irrelevance(max_total_irrelevance, epoch_bounds_with_irrelevance):
for (_, epoch_ub_exclusive), irrelevance_for_age in epoch_bounds_with_irrelevance:
max_item_irrelevance = max_total_irrelevance - irrelevance_for_age
evict_for_epoch_and_irrelevance(epoch_ub_exclusive, max_item_irrelevance)
evict_for_epoch_and_irrelevance(project, epoch_ub_exclusive, max_item_irrelevance)
if max_item_irrelevance <= -1:
# in the actual eviction, the test on max_item_irrelevance is done exclusively, i.e. only items of greater
@@ -219,7 +223,7 @@ def evict_for_irrelevance(max_total_irrelevance, epoch_bounds_with_irrelevance):
break
def evict_for_epoch_and_irrelevance(max_epoch, max_irrelevance):
def evict_for_epoch_and_irrelevance(project, max_epoch, max_irrelevance):
# print("evict_for_epoch_and_irrelevance(%s, %s)" % (max_epoch, max_irrelevance))
from .models import Event
@@ -244,7 +248,7 @@ def evict_for_epoch_and_irrelevance(max_epoch, max_irrelevance):
# this call, and only when `B` is cleaned will the points `x` be cleaned. (as-is, they are part of the selection,
# but will already have been deleted)
qs = Event.objects.filter(irrelevance_for_retention__gt=max_irrelevance)
qs = Event.objects.filter(project=project, irrelevance_for_retention__gt=max_irrelevance)
if max_epoch is not None:
qs = qs.filter(server_side_timestamp__lt=datetime_for_epoch(max_epoch))

View File

@@ -0,0 +1,66 @@
from datetime import datetime, timezone
from .retention import get_epoch_bounds_with_irrelevance, get_irrelevance_pairs, datetime_for_epoch
from .models import Event
def retention_insight_values(project):
    """Yield per-epoch-bucket event counts, broken down by irrelevance level.

    The buckets come from get_epoch_bounds_with_irrelevance() evaluated at the
    current UTC time and are walked in reverse of the order in which that
    function returns them. For every bucket a tuple (lb, results) is yielded,
    where lb is the bucket's lower epoch bound (possibly None) and results maps
    each irrelevance level in range(max observed + 1) to the number of the
    project's events that have exactly that irrelevance_for_retention and whose
    server_side_timestamp lies in [lb, ub).

    Diagnostic information is printed along the way. Once all buckets have been
    yielded, an assertion checks that the counts add up to the project's total
    number of events.
    """
    now = datetime.now(tz=timezone.utc)
    epoch_bounds_with_irrelevance = get_epoch_bounds_with_irrelevance(project, now)
    pairs = list(get_irrelevance_pairs(project, epoch_bounds_with_irrelevance))

    # Dump the raw inputs of the computation; this is a debugging tool.
    print("epoch_bounds_with_irrelevance")
    for bounds_entry in epoch_bounds_with_irrelevance:
        print(bounds_entry)
    print("pairs")
    for pair in pairs:
        print(pair)

    yielded = 0
    buckets = list(zip(pairs, epoch_bounds_with_irrelevance))
    for pair, (epoch_range, _) in buckets[::-1]:
        # presumably the second element is the highest irrelevance actually
        # stored for this bucket -- confirm against get_irrelevance_pairs
        age_based, max_observed = pair
        lower, upper = epoch_range
        print("?", age_based, max_observed, lower, upper)

        counts = {}
        for level in range(max_observed + 1):
            matching = Event.objects.filter(project=project, irrelevance_for_retention=level)
            # A bound of None means the bucket is open-ended on that side.
            if lower is not None:
                matching = matching.filter(server_side_timestamp__gte=datetime_for_epoch(lower))
            if upper is not None:
                matching = matching.filter(server_side_timestamp__lt=datetime_for_epoch(upper))

            counts[level] = matching.count()
            yielded += counts[level]

        yield (lower, counts)  # the lower bound makes more sense visually

    # Every event of the project must have been counted in exactly one bucket.
    assert Event.objects.filter(project=project).count() == yielded, "%d != %d" % (
        Event.objects.filter(project=project).count(), yielded)
def retention_insight(project):
    """Print a table showing how the project's events are spread over retention buckets.

    Columns are the epoch buckets produced by retention_insight_values() (in the
    order that generator yields them, labelled with the bucket's lower bound);
    rows are irrelevance levels from 0 up to the highest level present in any
    bucket. Each cell is the number of events in that bucket at that level. The
    raw data is printed first, and a grand total is printed last.
    """
    data = list(retention_insight_values(project))
    print(data)

    # Highest irrelevance level present in any bucket. The keys are flattened
    # into one stream so that this also works when there are no buckets at all
    # (or only empty ones), and does not depend on comparing dict_keys views.
    max_irrelevance = max((level for _, counts in data for level in counts), default=0)

    # idea: the largest count could drive the column width, but the date labels
    # are wider than any realistic count anyway.

    # len("2000-01-01 16h") == 14 -> 16 for padding; every column (header or
    # data) uses this same width so that the table lines up.
    column_width = 16

    def fmt(epoch):
        # A lower bound of None (open-ended bucket) gets a blank header.
        if epoch is None:
            return " " * column_width
        return datetime_for_epoch(epoch).strftime("%Y-%m-%d %Hh").ljust(column_width)

    # headers; the 5 leading spaces match the width of the "%3d| " row label
    print(" " * 5, end="")
    for epoch, _ in data:
        print(fmt(epoch), end="")
    print()

    for irrelevance in range(max_irrelevance + 1):
        print("%3d| " % irrelevance, end="")
        for _, results in data:
            # 14 digits + 2 spaces == column_width; buckets that never reach
            # this irrelevance level show 0.
            print("%14d  " % results.get(irrelevance, 0), end="")
        print()

    print("Total: ", sum(sum(counts.values()) for _, counts in data))