add vacuum_files command

Fix #129
This commit is contained in:
Klaas van Schelven
2025-07-17 09:05:16 +02:00
parent 2e32ec78a3
commit 99f782f4e3
8 changed files with 125 additions and 4 deletions

View File

@@ -1,9 +1,15 @@
from datetime import datetime, timezone
from uuid import UUID
import json
import sourcemap
from issues.utils import get_values
from bugsink.transaction import delay_on_commit
from compat.timestamp import format_timestamp
from files.models import FileMetadata
from files.tasks import record_file_accesses
# Dijkstra, Sourcemaps and Python lists start at 0, but editors and our UI show lines starting at 1.
@@ -116,6 +122,9 @@ def apply_sourcemaps(event_data):
debug_id__in=debug_id_for_filename.values(), file_type="source_map").select_related("file")
}
metadata_ids = [metadata_obj.id for metadata_obj in metadata_obj_lookup.values()]
delay_on_commit(record_file_accesses, metadata_ids, format_timestamp(datetime.now(timezone.utc)))
filenames_with_metas = [
(filename, metadata_obj_lookup[debug_id])
for (filename, debug_id) in debug_id_for_filename.items()

View File

@@ -7,14 +7,14 @@ from .models import Chunk, File, FileMetadata
@admin.register(Chunk)
class ChunkAdmin(admin.ModelAdmin):
    # Admin listing for raw upload chunks (checksum-addressed binary pieces).
    # NOTE(review): this is a scraped diff view — both the pre-change and the
    # post-change list_display lines are present below; the second assignment
    # (the one adding 'created_at') is the commit's intended value and is the
    # one that takes effect at class-body execution time.
    list_display = ('checksum', 'size')
    list_display = ('checksum', 'size', 'created_at')
    search_fields = ('checksum',)
    # raw chunk bytes; exposed read-only so the admin never mutates them
    readonly_fields = ('data',)
@admin.register(File)
class FileAdmin(admin.ModelAdmin):
list_display = ('filename', 'checksum', 'size', 'download_link')
list_display = ('filename', 'checksum', 'size', 'download_link', 'created_at', 'accessed_at')
search_fields = ('checksum',)
readonly_fields = ('data', 'download_link')
@@ -27,5 +27,5 @@ class FileAdmin(admin.ModelAdmin):
@admin.register(FileMetadata)
class FileMetadataAdmin(admin.ModelAdmin):
    # Admin listing for per-file metadata (debug_id / file_type lookups).
    # NOTE(review): scraped diff view — both old and new list_display lines
    # appear; the second assignment (adding 'created_at') is the effective one.
    list_display = ('debug_id', 'file_type', 'file')
    list_display = ('debug_id', 'file_type', 'file', 'created_at')
    search_fields = ('file__checksum', 'debug_id', 'file_type')

View File

View File

View File

@@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from files.tasks import vacuum_files
class Command(BaseCommand):
    """Management command that enqueues the vacuum_files background task."""

    help = "Kick off (sourcemaps-)files cleanup by vacuuming old entries."

    def handle(self, *args, **options):
        # The actual deletion work happens in the background task; this command
        # only enqueues it and returns immediately.
        vacuum_files.delay()
        # fix: message said "(snapea)" — the task runner is spelled "snappea"
        # (cf. `from snappea.decorators import shared_task` in files.tasks).
        self.stdout.write("Called vacuum_files.delay(); the task will run in the background (snappea).")

View File

@@ -0,0 +1,44 @@
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
    """Add the created_at/accessed_at timestamp columns used by vacuum_files."""

    dependencies = [
        ("files", "0001_initial"),
    ]

    # All four new columns share one definition: auto_now_add, indexed, and
    # backfilled with "now" for pre-existing rows (hence preserve_default=False).
    operations = [
        migrations.AddField(
            model_name=target_model,
            name=target_field,
            field=models.DateTimeField(
                auto_now_add=True, db_index=True, default=django.utils.timezone.now
            ),
            preserve_default=False,
        )
        for (target_model, target_field) in [
            ("chunk", "created_at"),
            ("file", "accessed_at"),
            ("file", "created_at"),
            ("filemetadata", "created_at"),
        ]
    ]

View File

@@ -5,6 +5,7 @@ class Chunk(models.Model):
checksum = models.CharField(max_length=40, unique=True) # unique implies index, which we also use for lookups
size = models.PositiveIntegerField()
data = models.BinaryField(null=False) # as with Events, we can "eventually" move this out of the database
created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True)
    def __str__(self):
        """Identify a Chunk by its checksum (the unique lookup key above)."""
        return self.checksum
@@ -23,6 +24,8 @@ class File(models.Model):
size = models.PositiveIntegerField()
data = models.BinaryField(null=False) # as with Events, we can "eventually" move this out of the database
created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True)
accessed_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True)
    def __str__(self):
        """Identify a File by its human-readable filename."""
        return self.filename
@@ -36,6 +39,7 @@ class FileMetadata(models.Model):
debug_id = models.UUIDField(max_length=40, null=True, blank=True)
file_type = models.CharField(max_length=255, null=True, blank=True)
data = models.TextField() # we just dump the rest in here; let's see how much we really need.
created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True)
def __str__(self):
# somewhat useless when debug_id is None; but that's not the case we care about ATM

View File

@@ -1,12 +1,15 @@
from datetime import timedelta
from zipfile import ZipFile
import json
from hashlib import sha1
from io import BytesIO
from os.path import basename
from django.utils import timezone
from compat.timestamp import parse_timestamp
from snappea.decorators import shared_task
from bugsink.transaction import immediate_atomic
from bugsink.transaction import immediate_atomic, delay_on_commit
from bugsink.app_settings import get_settings
from .models import Chunk, File, FileMetadata
@@ -92,3 +95,54 @@ def assemble_file(checksum, chunk_checksums, filename):
# is unlikely.
chunks.delete()
return result
@shared_task
def record_file_accesses(metadata_ids, accessed_at):
    """Bump File.accessed_at for the files behind the given FileMetadata ids.

    metadata_ids: list of FileMetadata primary keys that were read.
    accessed_at: the access moment, serialized with compat.timestamp.format_timestamp.
    """
    # implemented as a task to get around the fact that file-access happens in an otherwise read-only view (and the
    # fact that the access happened is a write to the DB).
    # a few thoughts on the context of "doing this as a task": [1] the expected throughput is relatively low (UI) so
    # the task overhead should be OK [2] it's not "absolutely critical" to always record this (99% is enough) and [3]
    # it's not related to the reading transaction _at all_ (all we need to record is the fact that it happened).
    #
    # thought on instead pulling it to the top of the UI's view: code-wise, it's annoying but doable (annoying b/c
    # 'for_request_method' won't work anymore). But this would still make this key UI view depend on the write lock
    # which is such a shame for responsiveness so we'll stick with task-based.
    if not metadata_ids:
        # nothing to record: skip taking the write lock entirely
        return

    # parse before entering the transaction: no reason to hold the write lock for pure parsing work
    parsed_accessed_at = parse_timestamp(accessed_at)

    with immediate_atomic():
        # note: filtering on IDs comes with "robust for deletions" out-of-the-box (and: 2 queries only)
        file_ids = FileMetadata.objects.filter(id__in=metadata_ids).values_list("file_id", flat=True)
        File.objects.filter(id__in=file_ids).update(accessed_at=parsed_accessed_at)
@shared_task
def vacuum_files():
    """Delete stale file-related rows in bounded batches, rescheduling itself
    when the per-run budget is exhausted but more work may remain."""
    now = timezone.now()
    with immediate_atomic():
        # budget is not yet tuned; reasons for high values: we're dealing with "leaves in the model-dep-tree" here;
        # reasons for low values: deletion of files might just be expensive.
        budget = 500
        num_deleted = 0

        for model, field_name, max_days in [
            (Chunk, 'created_at', 1,),  # 1 is already quite long... Chunks are used immediately, or not at all.
            (File, 'accessed_at', 90),
            # for FileMetadata we rely on cascading from File (which will always happen "eventually")
        ]:
            while num_deleted < budget:
                # bugfix: slice by the _remaining_ budget, not the full budget. The previous code sliced
                # [:budget] on every pass, so the total across models could reach ~2x the budget, after
                # which the exact-equality reschedule check below never fired and leftover work was
                # silently dropped until the next external trigger.
                remaining = budget - num_deleted
                ids = (model.objects.filter(**{f"{field_name}__lt": now - timedelta(days=max_days)})[:remaining].
                       values_list('id', flat=True))
                if len(ids) == 0:
                    break
                model.objects.filter(id__in=ids).delete()
                num_deleted += len(ids)

        if num_deleted >= budget:
            # budget exhausted but possibly more to delete, so we re-schedule the task
            delay_on_commit(vacuum_files)