add vacuum_files command

Fix #129
This commit is contained in:
Klaas van Schelven
2025-07-17 09:05:16 +02:00
parent 2e32ec78a3
commit 99f782f4e3
8 changed files with 125 additions and 4 deletions

View File

@@ -1,9 +1,15 @@
from datetime import datetime, timezone
from uuid import UUID
import json
import sourcemap
from issues.utils import get_values
from bugsink.transaction import delay_on_commit
from compat.timestamp import format_timestamp
from files.models import FileMetadata
from files.tasks import record_file_accesses
# Dijkstra, Sourcemaps and Python lists start at 0, but editors and our UI show lines starting at 1.
@@ -116,6 +122,9 @@ def apply_sourcemaps(event_data):
debug_id__in=debug_id_for_filename.values(), file_type="source_map").select_related("file")
}
metadata_ids = [metadata_obj.id for metadata_obj in metadata_obj_lookup.values()]
delay_on_commit(record_file_accesses, metadata_ids, format_timestamp(datetime.now(timezone.utc)))
filenames_with_metas = [
(filename, metadata_obj_lookup[debug_id])
for (filename, debug_id) in debug_id_for_filename.items()

View File

@@ -7,14 +7,14 @@ from .models import Chunk, File, FileMetadata
@admin.register(Chunk)
class ChunkAdmin(admin.ModelAdmin):
    # Admin listing for raw upload chunks (checksum-addressed binary pieces).
    # NOTE(review): this is a scraped diff view — both the pre-change and the
    # post-change list_display lines are present below; the second assignment
    # (the one adding 'created_at') is the commit's intended value and is the
    # one that takes effect at class-body execution time.
    list_display = ('checksum', 'size')
    list_display = ('checksum', 'size', 'created_at')
    search_fields = ('checksum',)
    # raw chunk bytes; exposed read-only so the admin never mutates them
    readonly_fields = ('data',)
@admin.register(File)
class FileAdmin(admin.ModelAdmin):
list_display = ('filename', 'checksum', 'size', 'download_link')
list_display = ('filename', 'checksum', 'size', 'download_link', 'created_at', 'accessed_at')
search_fields = ('checksum',)
readonly_fields = ('data', 'download_link')
@@ -27,5 +27,5 @@ class FileAdmin(admin.ModelAdmin):
@admin.register(FileMetadata)
class FileMetadataAdmin(admin.ModelAdmin):
    # Admin listing for per-file metadata (debug_id / file_type lookups).
    # NOTE(review): scraped diff view — both old and new list_display lines
    # appear; the second assignment (adding 'created_at') is the effective one.
    list_display = ('debug_id', 'file_type', 'file')
    list_display = ('debug_id', 'file_type', 'file', 'created_at')
    search_fields = ('file__checksum', 'debug_id', 'file_type')

View File

View File

View File

@@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from files.tasks import vacuum_files
class Command(BaseCommand):
    """Management command that enqueues the vacuum_files background task."""

    help = "Kick off (sourcemaps-)files cleanup by vacuuming old entries."

    def handle(self, *args, **options):
        # The actual deletion work happens in the background task; this command
        # only enqueues it and returns immediately.
        vacuum_files.delay()
        # fix: message said "(snapea)" — the task runner is spelled "snappea"
        # (cf. `from snappea.decorators import shared_task` in files.tasks).
        self.stdout.write("Called vacuum_files.delay(); the task will run in the background (snappea).")

View File

@@ -0,0 +1,44 @@
from django.db import migrations, models
import django.utils.timezone
class Migration(migrations.Migration):
    """Add the created_at/accessed_at timestamp columns used by vacuum_files."""

    dependencies = [
        ("files", "0001_initial"),
    ]

    # All four new columns share one definition: auto_now_add, indexed, and
    # backfilled with "now" for pre-existing rows (hence preserve_default=False).
    operations = [
        migrations.AddField(
            model_name=target_model,
            name=target_field,
            field=models.DateTimeField(
                auto_now_add=True, db_index=True, default=django.utils.timezone.now
            ),
            preserve_default=False,
        )
        for (target_model, target_field) in [
            ("chunk", "created_at"),
            ("file", "accessed_at"),
            ("file", "created_at"),
            ("filemetadata", "created_at"),
        ]
    ]

View File

@@ -5,6 +5,7 @@ class Chunk(models.Model):
checksum = models.CharField(max_length=40, unique=True) # unique implies index, which we also use for lookups
size = models.PositiveIntegerField()
data = models.BinaryField(null=False) # as with Events, we can "eventually" move this out of the database
created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True)
    def __str__(self):
        """Identify a Chunk by its checksum (the unique lookup key above)."""
        return self.checksum
@@ -23,6 +24,8 @@ class File(models.Model):
size = models.PositiveIntegerField()
data = models.BinaryField(null=False) # as with Events, we can "eventually" move this out of the database
created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True)
accessed_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True)
    def __str__(self):
        """Identify a File by its human-readable filename."""
        return self.filename
@@ -36,6 +39,7 @@ class FileMetadata(models.Model):
debug_id = models.UUIDField(max_length=40, null=True, blank=True)
file_type = models.CharField(max_length=255, null=True, blank=True)
data = models.TextField() # we just dump the rest in here; let's see how much we really need.
created_at = models.DateTimeField(auto_now_add=True, editable=False, db_index=True)
def __str__(self):
# somewhat useless when debug_id is None; but that's not the case we care about ATM

View File

@@ -1,12 +1,15 @@
from datetime import timedelta
from zipfile import ZipFile
import json
from hashlib import sha1
from io import BytesIO
from os.path import basename
from django.utils import timezone
from compat.timestamp import parse_timestamp
from snappea.decorators import shared_task
from bugsink.transaction import immediate_atomic
from bugsink.transaction import immediate_atomic, delay_on_commit
from bugsink.app_settings import get_settings
from .models import Chunk, File, FileMetadata
@@ -92,3 +95,54 @@ def assemble_file(checksum, chunk_checksums, filename):
# is unlikely.
chunks.delete()
return result
@shared_task
def record_file_accesses(metadata_ids, accessed_at):
    """Bump File.accessed_at for the files behind the given FileMetadata ids.

    metadata_ids: list of FileMetadata primary keys that were read.
    accessed_at: the access moment, serialized with compat.timestamp.format_timestamp.
    """
    # implemented as a task to get around the fact that file-access happens in an otherwise read-only view (and the
    # fact that the access happened is a write to the DB).
    # a few thoughts on the context of "doing this as a task": [1] the expected throughput is relatively low (UI) so
    # the task overhead should be OK [2] it's not "absolutely critical" to always record this (99% is enough) and [3]
    # it's not related to the reading transaction _at all_ (all we need to record is the fact that it happened).
    #
    # thought on instead pulling it to the top of the UI's view: code-wise, it's annoying but doable (annoying b/c
    # 'for_request_method' won't work anymore). But this would still make this key UI view depend on the write lock
    # which is such a shame for responsiveness so we'll stick with task-based.
    if not metadata_ids:
        # nothing to record: skip taking the write lock entirely
        return

    # parse before entering the transaction: no reason to hold the write lock for pure parsing work
    parsed_accessed_at = parse_timestamp(accessed_at)

    with immediate_atomic():
        # note: filtering on IDs comes with "robust for deletions" out-of-the-box (and: 2 queries only)
        file_ids = FileMetadata.objects.filter(id__in=metadata_ids).values_list("file_id", flat=True)
        File.objects.filter(id__in=file_ids).update(accessed_at=parsed_accessed_at)
@shared_task
def vacuum_files():
    """Delete stale file-related rows in bounded batches, rescheduling itself
    when the per-run budget is exhausted but more work may remain."""
    now = timezone.now()
    with immediate_atomic():
        # budget is not yet tuned; reasons for high values: we're dealing with "leaves in the model-dep-tree" here;
        # reasons for low values: deletion of files might just be expensive.
        budget = 500
        num_deleted = 0

        for model, field_name, max_days in [
            (Chunk, 'created_at', 1,),  # 1 is already quite long... Chunks are used immediately, or not at all.
            (File, 'accessed_at', 90),
            # for FileMetadata we rely on cascading from File (which will always happen "eventually")
        ]:
            while num_deleted < budget:
                # bugfix: slice by the _remaining_ budget, not the full budget. The previous code sliced
                # [:budget] on every pass, so the total across models could reach ~2x the budget, after
                # which the exact-equality reschedule check below never fired and leftover work was
                # silently dropped until the next external trigger.
                remaining = budget - num_deleted
                ids = (model.objects.filter(**{f"{field_name}__lt": now - timedelta(days=max_days)})[:remaining].
                       values_list('id', flat=True))
                if len(ids) == 0:
                    break
                model.objects.filter(id__in=ids).delete()
                num_deleted += len(ids)

        if num_deleted >= budget:
            # budget exhausted but possibly more to delete, so we re-schedule the task
            delay_on_commit(vacuum_files)