Files
bugsink/files/tasks.py
Klaas van Schelven 70b8afb842 Factor out assemble_artifact_bundle into an async task
I did not do any research into actual timings (i.e. whether there really is a
need to pull this out of the request/response loop) but it seems somewhat
prudent.  Because Sentry also does this (and does it at this point), the whole
thing of matching Sentry's interface is not being complicated by it (if
anything: the contrary).
2025-04-14 09:37:05 +02:00

80 lines
2.9 KiB
Python

from zipfile import ZipFile
import json
from hashlib import sha1
from io import BytesIO
from os.path import basename
from snappea.decorators import shared_task
from .models import Chunk, File, FileMetadata
@shared_task
def assemble_artifact_bundle(bundle_checksum, chunk_checksums):
    """Assemble an artifact bundle from its chunks and store the files it contains.

    The bundle (a zip) is assembled from the given chunks, after which each file listed under "files" in its
    "manifest.json" is stored as a File, keyed on the sha1 of its contents. For manifest entries that carry both a
    "debug-id" header and a "type", a FileMetadata record pointing at that File is stored as well.

    :param bundle_checksum: sha1 hexdigest that the assembled bundle must match.
    :param chunk_checksums: checksums of the chunks that make up the bundle, in assembly order.
    """
    # NOTE: as it stands we don't store the (optional) extra info of release/dist.

    # NOTE: there's also the concept of an artifact bundle as _tied_ to a release, i.e. without debug_ids. We don't
    # support that, but if we ever were to support it we'd need a separate method/param to distinguish it.

    bundle_file, _ = assemble_file(bundle_checksum, chunk_checksums, filename=f"{bundle_checksum}.zip")

    # NOTE: in-memory handling of zips. The context manager closes the archive once all members have been read.
    with ZipFile(BytesIO(bundle_file.data)) as bundle_zip:
        manifest = json.loads(bundle_zip.read("manifest.json").decode("utf-8"))

        for member_name, manifest_entry in manifest["files"].items():
            file_data = bundle_zip.read(member_name)
            checksum = sha1(file_data).hexdigest()

            # The stored name is the basename of the entry's "url" (falling back to the name inside the zip),
            # cut off at 255 characters.
            filename = basename(manifest_entry.get("url", member_name))[:255]

            # Files are keyed on checksum: when one with this checksum already exists, it is reused as-is.
            file, _ = File.objects.get_or_create(
                checksum=checksum,
                defaults={
                    "filename": filename,
                    "size": len(file_data),
                    "data": file_data,
                })

            debug_id = manifest_entry.get("headers", {}).get("debug-id", None)
            file_type = manifest_entry.get("type", None)
            if debug_id is None or file_type is None:
                # such records exist and we could store them, but we don't, since we don't have a purpose for them.
                continue

            FileMetadata.objects.get_or_create(
                debug_id=debug_id,
                file_type=file_type,
                defaults={
                    "file": file,
                    "data": json.dumps(manifest_entry),
                }
            )

    # NOTE we _could_ get rid of the file at this point (but we don't). Ties in to broader questions of retention.
def assemble_file(checksum, chunk_checksums, filename):
    """Build a File by concatenating stored chunks in the given order.

    The concatenated data must have `checksum` as its sha1 hexdigest; if not, an Exception is raised. Returns the
    `(file, created)` tuple as produced by `File.objects.get_or_create`, looked up on `checksum`.
    """
    # NOTE: unimplemented checks/tricks
    # * total file-size vs. some max
    # * explicit check chunk availability (as it stands, our processing is synchronous, so no need)
    # * skip-on-checksum-exists

    chunk_by_checksum = {}
    for chunk in Chunk.objects.filter(checksum__in=chunk_checksums):
        chunk_by_checksum[chunk.checksum] = chunk

    # Direct indexing doubles as the availability check: a checksum without a stored chunk raises KeyError here.
    ordered_chunks = [chunk_by_checksum[chunk_checksum] for chunk_checksum in chunk_checksums]

    data = b"".join(chunk.data for chunk in ordered_chunks)

    actual_checksum = sha1(data).hexdigest()
    if actual_checksum != checksum:
        raise Exception("checksum mismatch")

    file_defaults = {
        "size": len(data),
        "data": data,
        "filename": filename,
    }
    return File.objects.get_or_create(checksum=checksum, defaults=file_defaults)