From 806bf1a0c5bbe7b244046eaf80204cd6056e66d7 Mon Sep 17 00:00:00 2001
From: Klaas van Schelven
Date: Mon, 18 Dec 2023 23:32:08 +0100
Subject: [PATCH] Tool to test w/ bursty data

---
 performance/bursty_data.py | 92 ++++++++++++++++++++++++++++++++++++++
 performance/some_script.py |  3 ++
 2 files changed, 95 insertions(+)
 create mode 100644 performance/bursty_data.py

diff --git a/performance/bursty_data.py b/performance/bursty_data.py
new file mode 100644
index 0000000..664142d
--- /dev/null
+++ b/performance/bursty_data.py
@@ -0,0 +1,92 @@
+import datetime
+import math
+import random
+
+
+# a way to generate some bursty streams of points-in-time.
+# I'm sure there's a 100 things wrong with this, but at least it's
+#
+# * not simply distributed at random
+# * has some form of periodic pattern in it as real data surely has
+# * has bursts (errors come in bursts!)
+#
+# this will give us at least some base to test in somewhat natural settings.
+
+
+def generate_bursty_data(nr_of_waves=1, base_amplitude=1, expected_nr_of_bursts=1, burst_amplitude=5, num_buckets=1000,
+                         rng=None):
+    """Return `num_buckets` histogram-like buckets: floats denoting the relative busyness of each period.
+
+    Each bucket is a sine wave scaled into [0, base_amplitude], with an occasional random burst added on top.
+
+    * nr_of_waves: number of full sine periods across all buckets (0 gives a flat baseline)
+    * base_amplitude: peak height of the periodic baseline
+    * expected_nr_of_bursts: expected number of buckets that get a burst
+    * burst_amplitude: sigma of the (folded) normal distribution that burst heights are drawn from
+    * num_buckets: length of the returned list; anything <= 0 gives an empty list
+    * rng: provider of `random()` and `gauss()`, e.g. a seeded `random.Random`, for reproducible output; defaults to
+      the global `random` module
+    """
+    if num_buckets <= 0:
+        return []
+
+    if rng is None:
+        rng = random
+
+    burst_prob = expected_nr_of_bursts / num_buckets
+
+    buckets = []
+
+    for i in range(num_buckets):
+        # We pick math.sin as an arbitrary periodic pattern. Normalize for period and >0. Multiplying by nr_of_waves
+        # (instead of dividing by a period derived from it) keeps nr_of_waves=0 valid.
+        phase = 2 * math.pi * nr_of_waves * i / num_buckets
+        value = base_amplitude * (1 + math.sin(phase)) / 2
+
+        # Introduce burst with probability 'burst_prob'
+        if rng.random() < burst_prob:
+            value += abs(rng.gauss(0, burst_amplitude))
+
+        buckets.append(value)
+
+    return buckets
+
+
+def buckets_to_points_in_time(buckets, begin, end, total_points, rng=None):
+    """given:
+
+    * histogram-like list of 'buckets', where each bucket is a float that is the relative busyness of that period
+    * a begin and an end (both datetime)
+    * a total amount of points
+    * optionally an `rng` providing `uniform()`, e.g. a seeded `random.Random`; defaults to the global `random` module
+
+    generates a sorted list of points of length `total_points` that conforms to the distribution denoted by the
+    buckets, and where the points-in-time are distributed at random within the buckets.
+
+    raises ValueError when `buckets` is empty or sums to zero: there is no distribution to follow in that case.
+    """
+    if rng is None:
+        rng = random
+
+    total_weight = sum(buckets)
+    if not buckets or total_weight == 0:
+        raise ValueError("buckets must be non-empty and must not sum to zero")
+
+    bucket_size = (end - begin).total_seconds() / len(buckets)
+
+    points = []
+
+    # Carry each bucket's rounding error over into the next bucket, so that the rounded per-bucket counts keep adding
+    # up to total_points.
+    rounding_difference = 0
+
+    for i, bucket_weight in enumerate(buckets):
+        exact_points = (bucket_weight / total_weight) * total_points + rounding_difference
+        bucket_points = round(exact_points)
+        rounding_difference = exact_points - bucket_points
+
+        points.extend(
+            begin + datetime.timedelta(seconds=bucket_size * (i + rng.uniform(0, 1)))
+            for _ in range(bucket_points))
+
+    return sorted(points)
diff --git a/performance/some_script.py b/performance/some_script.py
index d897289..a4cd1b7 100644
--- a/performance/some_script.py
+++ b/performance/some_script.py
@@ -30,4 +30,7 @@
 slow results for a check in.
 """)
 
 
+def print_thoughts_about_inc():
+
+    print_thoughts_about_prev_tup()