Files
bugsink/ingest/event_counter.py
Klaas van Schelven 354af7ea0a Fix issues as reported by bandit or mark as nosec
Nothing worrying, but good to have checked this regardless
and important to have a green pipeline.

Fix #175
2025-07-30 12:16:40 +02:00

72 lines
4.2 KiB
Python

from datetime import timezone, datetime
from django.db.models import Min
from bugsink.period_utils import add_periods_to_datetime, sub_periods_from_datetime
from bugsink.utils import assert_
def _filter_for_periods(qs, period_name, nr_of_periods, now):
if period_name == "total":
return qs
return qs.filter(digested_at__gte=sub_periods_from_datetime(now, nr_of_periods, period_name))
def check_for_thresholds(qs, now, thresholds, add_for_current=0):
    # thresholds :: [(period_name, nr_of_periods, gte_threshold), ...]
    # returns [(state, below_threshold_from, check_again_after, (period_name, nr_of_periods, gte_threshold)), ...]
    #
    # For each threshold (results are in the same order as `thresholds`):
    #   state                 True iff the count over the period (plus `add_for_current`) is >= gte_threshold
    #   below_threshold_from  None when state is False; otherwise the first moment the condition no longer applies
    #   check_again_after     gte_threshold minus that count, i.e. how many more items it takes to reach the threshold
    #                         (zero or negative when state is True)
    #
    # On cost: this function aggregates, which makes it reasonably expensive (not measured exactly, but it appears to
    # be at least as expensive per call as all the rest of digestion combined). The answer is to call it rarely, which
    # is what `check_again_after` is for. That mechanism relies on simple counting, and on the fact that a threshold
    # for a given period of time cannot be crossed sooner than the moment at which the number of observations over
    # _any_ period of time exceeds that threshold. The cost is thus amortized over the difference between the threshold
    # and the number of events actually observed in the relevant period. In other words: unless you are, for some
    # weird reason, consistently super-close to the quota without being over it, the amortization happens over a
    # reasonable fraction of the quota; and when the quota is reasonably high (the only case that matters for
    # performance anyway) the cost is amortized away (e.g. a quota of 1_000; a check every 100 events in a bad case).
    # The only relevant cost this mechanism adds is therefore the per-project counting of digested events.

    # Only UTC is allowed; we generally use Django model fields, which are UTC, so this should hold:
    assert_(now.tzinfo == timezone.utc)

    results = []

    for threshold in thresholds:
        period_name, nr_of_periods, gte_threshold = threshold

        # Built once, used for both the count and (when needed) the lookup of the oldest item.
        windowed = _filter_for_periods(qs, period_name, nr_of_periods, now)
        observed = windowed.count() + add_for_current
        reached = observed >= gte_threshold

        if not reached:
            below_threshold_from = None

        elif period_name == "total":
            # A 'total' count never goes down, so once the condition holds it holds forever. We stand in for "never"
            # with an arbitrarily large date, and will deal with that by the end of the myria-annum. Such a value is
            # unlikely to actually end up in the DB (that would imply a quota on the total).
            below_threshold_from = datetime(9999, 12, 31, 23, 59, tzinfo=timezone.utc)

        else:
            # `below_threshold_from` is the first moment in time at which the condition no longer applies. Provided
            # this function is called "often enough" (i.e. it is called for the moment at which the state flips to
            # True, and not after that), the qs holds _exactly_ `gte_threshold` items (possibly counting 1 implied
            # item when `add_for_current` applies). The oldest of those, plus the length of the period, is the point
            # in time at which the condition turns False again.
            #
            # ("Often enough, and no more" holds for us: for quota we stop accepting events once the quota is met; for
            # muted we remove the vbc once unmuted.) In the "overshoot" case (see tests; not really expected) the
            # result is "too old", which means going back to accepting too soon. That is self-correcting, so we do
            # not deal with it here.
            oldest = windowed.aggregate(agg=Min('digested_at'))['agg']

            # `or now` covers the funny `gte_threshold==0` case, in which there may be no items to take the min of.
            below_threshold_from = add_periods_to_datetime(oldest or now, nr_of_periods, period_name)

        check_again_after = gte_threshold - observed

        results.append((
            reached,
            below_threshold_from,
            check_again_after,
            (period_name, nr_of_periods, gte_threshold),
        ))

    return results