From 5559fba7542c32f78aa65dd54f6edfedd31ce963 Mon Sep 17 00:00:00 2001
From: Klaas van Schelven
Date: Wed, 12 Feb 2025 17:11:24 +0100
Subject: [PATCH] Introduce FileEventStorage

An (optional) way to store the `event_data` (full event as JSON) outside the
DB. This is expected to be useful for larger setups, because it gives you:

* A more portable database (e.g. backups); depending on event size, the
  impact on your DB size is ~50x.
* Fewer worries about hitting "physical" limits (e.g. disk size, max file
  size) for your DB.

Presumably (more testing will happen going forward) it will:

* Speed up migrations (especially on sqlite, which does full table copies)
* Speed up event ingestion(?)

Further improvements in this commit:

* `delete_with_limit` was removed; this removes one tie-in to MySQL/Sqlite
  (See #21 for this bullet)
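
Configuration goes through the new EVENT_STORAGES setting. A minimal example
(storage name and basepath are illustrative; the development.py change below
shows the variant actually used in development):

    "EVENT_STORAGES": {
        "local_flat_files": {
            "STORAGE": "events.storage.FileEventStorage",
            "OPTIONS": {"basepath": "/path/to/filestorage"},
            "USE_FOR_WRITE": True,  # at most one storage may be marked for writes
        },
    }

Events digested while a USE_FOR_WRITE storage is configured get its name
recorded in the new Event.storage_backend field; events with a NULL
storage_backend keep their data in the DB as before.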
---
 .gitignore                                   |  3 +
 bugsink/app_settings.py                      |  1 +
 bugsink/settings/development.py              |  9 +++
 .../migrations/0019_event_storage_backend.py | 20 +++++++
 events/models.py                             | 34 ++++++++++-
 events/retention.py                          | 53 +++++++++++------
 events/storage.py                            | 58 +++++++++++++++++++
 events/storage_registry.py                   | 50 ++++++++++++++++
 8 files changed, 207 insertions(+), 21 deletions(-)
 create mode 100644 events/migrations/0019_event_storage_backend.py
 create mode 100644 events/storage.py
 create mode 100644 events/storage_registry.py
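
Side note on extensibility. The EventStorage base class (events/storage.py
below) is deliberately minimal: a name, keyword OPTIONS, and
open()/exists()/delete() keyed by the internal event id. A hypothetical second
backend only needs those three methods; the sketch below (class and module
names made up) uses an in-memory dict in place of real storage:

    import contextlib
    import io

    from events.storage import EventStorage


    class InMemoryEventStorage(EventStorage):
        def __init__(self, name):
            super().__init__(name)
            self._blobs = {}  # str(event_id) -> JSON text

        @contextlib.contextmanager
        def open(self, event_id, mode="r"):
            if mode == "w":
                buf = io.StringIO()
                yield buf
                self._blobs[str(event_id)] = buf.getvalue()
            else:
                yield io.StringIO(self._blobs[str(event_id)])

        def exists(self, event_id):
            return str(event_id) in self._blobs

        def delete(self, event_id):
            self._blobs.pop(str(event_id), None)

Such a backend would be wired up through EVENT_STORAGES just like
FileEventStorage, with "STORAGE" pointing at its dotted path (resolution
happens in events/storage_registry.py below).
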
diff --git a/.gitignore b/.gitignore
index cabf0c8..2661ff5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,9 @@ __pycache__
 /bugsink/_version.py
 /dist/
 
+# This matches the filestorage mentioned in development.py
+/filestorage
+
 # sqlite
 /db.sqlite3
 
diff --git a/bugsink/app_settings.py b/bugsink/app_settings.py
index 2eecdc1..a79a094 100644
--- a/bugsink/app_settings.py
+++ b/bugsink/app_settings.py
@@ -62,6 +62,7 @@ DEFAULTS = {
 
     # Locations of files & directories:
     "INGEST_STORE_BASE_DIR": "/tmp/bugsink/ingestion",
+    "EVENT_STORAGES": {},
 
     # Security:
     "MINIMIZE_INFORMATION_EXPOSURE": False,
diff --git a/bugsink/settings/development.py b/bugsink/settings/development.py
index a47dfb6..1fe7bef 100644
--- a/bugsink/settings/development.py
+++ b/bugsink/settings/development.py
@@ -95,6 +95,15 @@ BUGSINK = {
     "MAX_EVENTS_PER_PROJECT_PER_5_MINUTES": 1_000_000,
     "MAX_EVENTS_PER_PROJECT_PER_HOUR": 50_000_000,
 
+    "EVENT_STORAGES": {
+        "local_flat_files": {
+            "STORAGE": "events.storage.FileEventStorage",
+            "OPTIONS": {
+                "basepath": os.path.join(BASE_DIR, "filestorage"),
+            },
+            "USE_FOR_WRITE": True,
+        },
+    }
 }
diff --git a/events/migrations/0019_event_storage_backend.py b/events/migrations/0019_event_storage_backend.py
new file mode 100644
index 0000000..bfb04be
--- /dev/null
+++ b/events/migrations/0019_event_storage_backend.py
@@ -0,0 +1,20 @@
+# Generated by Django 4.2.19 on 2025-02-11 20:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("events", "0018_remove_event_has_exception_remove_event_has_logentry"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="event",
+            name="storage_backend",
+            field=models.CharField(
+                blank=True, default=None, editable=False, max_length=255, null=True
+            ),
+        ),
+    ]
diff --git a/events/models.py b/events/models.py
index 11b6d89..f070847 100644
--- a/events/models.py
+++ b/events/models.py
@@ -12,6 +12,7 @@ from compat.timestamp import parse_timestamp
 from issues.utils import get_title_for_exception_type_and_value
 
 from .retention import get_random_irrelevance
+from .storage_registry import get_write_storage, get_storage
 
 
 class Platform(models.TextChoices):
@@ -48,6 +49,15 @@ def maybe_empty(s):
     return "" if not s else s
 
 
+def write_to_storage(event_id, parsed_data):
+    """
+    event_id means event.id, i.e. the internal one. This saves us from thinking about the security implications of
+    using an externally provided ID across storage backends.
+    """
+    with get_write_storage().open(event_id, "w") as f:
+        json.dump(parsed_data, f)
+
+
 class Event(models.Model):
     # Lines quotes with ">" are from the following to resources:
     # https://develop.sentry.dev/sdk/event-payloads/ (supposedly more human-readable)
@@ -139,6 +149,8 @@ class Event(models.Model):
     irrelevance_for_retention = models.PositiveIntegerField(blank=False, null=False)
     never_evict = models.BooleanField(blank=False, null=False, default=False)
 
+    storage_backend = models.CharField(max_length=255, blank=True, null=True, default=None, editable=False)
+
     # The following list of attributes are mentioned in the docs but are not attrs on our model (because we don't need
     # them to be [yet]):
     #
@@ -168,10 +180,20 @@ class Event(models.Model):
     ]
 
     def get_raw_data(self):
-        return self.data
+        if self.storage_backend is None:
+            return self.data
+
+        storage = get_storage(self.storage_backend)
+        with storage.open(self.id, "r") as f:
+            return f.read()
 
     def get_parsed_data(self):
-        return json.loads(self.data)
+        if self.storage_backend is None:
+            return json.loads(self.data)
+
+        storage = get_storage(self.storage_backend)
+        with storage.open(self.id, "r") as f:
+            return json.load(f)
 
     def get_absolute_url(self):
         return f"/issues/issue/{ self.issue_id }/event/{ self.id }/"
@@ -197,6 +219,8 @@ class Event(models.Model):
 
         irrelevance_for_retention = get_random_irrelevance(stored_event_count)
 
+        write_storage = get_write_storage()
+
         # A note on truncation (max_length): the fields we truncate here are directly from the SDK, so they "should have
         # been" truncated already. But we err on the side of caution: this is the kind of SDK error that we can, and
         # just want to, paper over (it's not worth dropping the event for).
@@ -208,7 +232,8 @@ class Event(models.Model):
             grouping=grouping,
             ingested_at=event_metadata["ingested_at"],
             digested_at=digested_at,
-            data=json.dumps(parsed_data),
+            data=json.dumps(parsed_data) if write_storage is None else "",
+            storage_backend=None if write_storage is None else write_storage.name,
 
             timestamp=parse_timestamp(parsed_data["timestamp"]),
             platform=parsed_data["platform"][:64],
@@ -235,6 +260,9 @@ class Event(models.Model):
             )
             created = True
 
+            if write_storage is not None:
+                write_to_storage(event.id, parsed_data)
+
             return event, created
         except IntegrityError as e:
             if not re.match(
diff --git a/events/retention.py b/events/retention.py
index 167d174..520bc20 100644
--- a/events/retention.py
+++ b/events/retention.py
@@ -4,12 +4,12 @@ from django.db.models import Q, Min, Max
 from random import random
 from datetime import timezone, datetime
 
-from django.db.models.sql.compiler import SQLDeleteCompiler
-from django.db import connection
-
 from bugsink.moreiterutils import pairwise, map_N_until
 from performance.context_managers import time_and_query_count
 
+from .storage_registry import get_storage
+
+bugsink_logger = logging.getLogger("bugsink")
 performance_logger = logging.getLogger("bugsink.performance.retention")
 
 
@@ -269,20 +269,6 @@ def evict_for_irrelevance(
     return evicted
 
 
-def delete_with_limit(qs, limit):
-    # Django does not support this out of the box (i.e. it does not support LIMIT in DELETE queries). Both Sqlite and
-    # MySQL do in fact support it (whereas many other DBs do not), so we just reach down into Django's internals.
-    sql, params = SQLDeleteCompiler(qs.query, connection, 'default').as_sql()
-    limited_sql = sql + " LIMIT %s"
-    limited_params = params + (limit,)
-
-    with connection.cursor() as cursor:
-        cursor.execute(limited_sql, limited_params)
-        nr_of_deletions = cursor.rowcount
-
-    return nr_of_deletions
-
-
 def evict_for_epoch_and_irrelevance(project, max_epoch, max_irrelevance, max_event_count, include_never_evict):
     from issues.models import TurningPoint
     from .models import Event
@@ -317,5 +303,36 @@ def evict_for_epoch_and_irrelevance(project, max_epoch, max_irrelevance, max_eve
         # we need to manually ensure that no FKs to the deleted items exist:
         TurningPoint.objects.filter(triggering_event__in=qs).update(triggering_event=None)
 
-    nr_of_deletions = delete_with_limit(qs, max_event_count)
+    # We generate the list of events-to-delete (including the LIMIT) before proceeding; this allows us:
+    # A. to have a portable delete_with_limit (e.g. Django does not support that, nor does Postgres).
+    # B. to apply both deletion and cleanup_events_on_storage() on the same list.
+    #
+    # Implementation notes:
+    # 1. We force evaluation here with a `list()`; this means subsequent usages do _not_ try to "just use an inner
+    #    query". Although inner queries are attractive in the abstract, the literature suggests that performance may be
+    #    unpredictable (e.g. on MySQL). By using a list, we lift the (max 500) ids-to-match to the actual query, which
+    #    is quite ugly, but predictable and (at least on sqlite where I tested this) lightning-fast.
+    # 2. order_by: "pick something" to ensure the 2 usages of the "subquery" point to the same thing. (somewhat
+    #    superseded by [1] above; but I like to be defensive and predictable). tie-breaking on digest_order seems
+    #    consistent with the semantics of eviction.
+    pks_to_delete = list(qs.order_by("digest_order")[:max_event_count].values_list("pk", flat=True))
+
+    if len(pks_to_delete) > 0:
+        cleanup_events_on_storage(
+            Event.objects.filter(pk__in=pks_to_delete).exclude(storage_backend=None)
+            .values_list("id", "storage_backend")
+        )
+        nr_of_deletions = Event.objects.filter(pk__in=pks_to_delete).delete()[1]["events.Event"]
+    else:
+        nr_of_deletions = 0
+
     return nr_of_deletions
+
+
+def cleanup_events_on_storage(todos):
+    for event_id, storage_backend in todos:
+        try:
+            get_storage(storage_backend).delete(event_id)
+        except Exception as e:
+            # in a try/except such that we can continue with the rest of the batch
+            bugsink_logger.error("Error during cleanup of %s on %s: %s", event_id, storage_backend, e)
diff --git a/events/storage.py b/events/storage.py
new file mode 100644
index 0000000..2ba1d98
--- /dev/null
+++ b/events/storage.py
@@ -0,0 +1,58 @@
+import contextlib
+import os.path
+
+
+class EventStorage(object):
+
+    def __init__(self, name, **options):
+        self.name = name
+
+    def save(self):
+        raise NotImplementedError()
+
+    def exists(self, event_id):
+        raise NotImplementedError()
+
+    def delete(self, event_id):
+        raise NotImplementedError()
+
+    def open(self, event_id):
+        raise NotImplementedError()
+
+    # one might imagine contexts where something like "url" is useful (e.g. S3, and pointing the end-user straight at
+    # the event file) but such a model means you'll need to think about the security implications of that URL, which is
+    # not worth it, so we only support "pass through application layer" (where the auth stuff is) models of usage.
+
+
+class FileEventStorage(EventStorage):
+
+    def __init__(self, name, basepath=None):
+        super().__init__(name)
+
+        if basepath is None:
+            raise ValueError("Basepath must be provided")
+
+        self.basepath = basepath
+
+    def _event_path(self, event_id):
+        # the dashes in uuid are preserved in the filename for readability; since their location is consistent, this is
+        # not a problem.
+        return os.path.join(self.basepath, str(event_id) + ".json")
+
+    @contextlib.contextmanager
+    def open(self, event_id, mode='r'):
+        if mode not in ('r', 'w'):
+            # EventStorage's API is generally _very_ limited (unique IDs, write-once) so we can (and should) be very
+            # strict about what we allow; we further imply "text mode" and "utf-8 encoding" given the JSON context.
+            raise ValueError("EventStorage.open() mode must be 'r' or 'w'")
+
+        # We open with utf-8 encoding explicitly to pre-empt the future of pep-0686 (it's also the only thing that makes
+        # sense in the context of JSON)
+        with open(self._event_path(event_id), mode, encoding="utf-8") as f:
+            yield f
+
+    def exists(self, event_id):
+        return os.path.exists(self._event_path(event_id))
+
+    def delete(self, event_id):
+        os.remove(self._event_path(event_id))
diff --git a/events/storage_registry.py b/events/storage_registry.py
new file mode 100644
index 0000000..c60c464
--- /dev/null
+++ b/events/storage_registry.py
@@ -0,0 +1,50 @@
+import importlib
+
+from bugsink.app_settings import get_settings
+
+
+_storages = None
+_write_storage = None
+
+
+def _ensure_storages():
+    global _storages
+    global _write_storage
+
+    if _storages is not None:
+        return
+
+    _storages = {name: _resolve(name, conf) for name, conf in get_settings().EVENT_STORAGES.items()}
+
+    matching = [name for name, conf in get_settings().EVENT_STORAGES.items() if conf.get("USE_FOR_WRITE", False)]
+
+    if len(matching) == 1:
+        _write_storage = _storages[matching[0]]
+
+    elif len(matching) > 1:
+        raise ValueError("Multiple USE_FOR_WRITE storages found.")
+
+    # else len==0 is implied by the initial value of _write_storage (None)
+
+
+def get_write_storage():
+    """
+    Gets the USE_FOR_WRITE storage. None means "in-database" (which is not shoe-horned in the EventStorage API).
+    """
+    _ensure_storages()
+    return _write_storage
+
+
+def get_storage(storage_name):
+    _ensure_storages()
+    return _storages[storage_name]
+
+
+def _resolve(name, conf):
+    storage_name = conf["STORAGE"]
+
+    module_name, class_name = storage_name.rsplit('.', 1)
+    module = importlib.import_module(module_name)
+    clazz = getattr(module, class_name)
+
+    return clazz(name, **conf.get("OPTIONS", {}))
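
For a quick standalone check of the new backend (basepath and event id below
are made-up values; this assumes events.storage is importable), something like
the following exercises the write/read/delete cycle:

    import json
    import os
    import uuid

    from events.storage import FileEventStorage

    basepath = "/tmp/bugsink-filestorage-demo"
    os.makedirs(basepath, exist_ok=True)  # FileEventStorage assumes basepath already exists

    storage = FileEventStorage("local_flat_files", basepath=basepath)

    event_id = uuid.uuid4()  # stands in for the internal Event.id
    with storage.open(event_id, "w") as f:
        json.dump({"platform": "python"}, f)

    assert storage.exists(event_id)
    with storage.open(event_id, "r") as f:
        print(json.load(f))

    storage.delete(event_id)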