PoC for difs_assemble

Works with sentry-cli; no actual handling yet; TODOs are in the code.

See #82
This commit is contained in:
Klaas van Schelven
2025-11-05 21:40:07 +01:00
parent 21297eff2a
commit 7d008da4a1
3 changed files with 85 additions and 9 deletions

View File

@@ -105,8 +105,8 @@ def assemble_file(checksum, chunk_checksums, filename):
# NOTE: unimplemented checks/tricks
# * total file-size vs. some max
# * explicit check chunk availability (as it stands, our processing is synchronous, so no need)
# * skip-on-checksum-exists
# * explicit check chunk availability
# * skip this whole thing when the (whole-file) checksum exists
chunks = Chunk.objects.filter(checksum__in=chunk_checksums)
chunks_dicts = {chunk.checksum: chunk for chunk in chunks}
@@ -117,7 +117,7 @@ def assemble_file(checksum, chunk_checksums, filename):
if sha1(data, usedforsecurity=False).hexdigest() != checksum:
raise Exception("checksum mismatch")
result = File.objects.get_or_create(
file, created = File.objects.get_or_create(
checksum=checksum,
defaults={
"size": len(data),
@@ -129,7 +129,7 @@ def assemble_file(checksum, chunk_checksums, filename):
# be used in multiple files (which are still being assembled) but with chunksizes in the order of 1MiB, I'd say this
# is unlikely.
chunks.delete()
return result
return file, created
@shared_task

View File

@@ -15,8 +15,8 @@ from bugsink.app_settings import get_settings
from bugsink.transaction import durable_atomic, immediate_atomic
from bsmain.models import AuthToken
from .models import Chunk, File
from .tasks import assemble_artifact_bundle
from .models import Chunk, File, FileMetadata
from .tasks import assemble_artifact_bundle, assemble_file
logger = logging.getLogger("bugsink.api")
@@ -86,7 +86,8 @@ def get_chunk_upload_settings(request, organization_slug):
# yet.
"release_files",
# this would seem to be the "javascript sourcemaps" thing, but how exactly I did not check yet.
# on second reading I would say: this is "actual source code", but I did not check yet and "don't touch it"
# (even though we don't actually have an implementation for sources yet)
"sources",
# https://github.com/getsentry/sentry/discussions/46967
@@ -100,7 +101,7 @@ def get_chunk_upload_settings(request, organization_slug):
# "artifact_bundles_v2",
# the rest of the options are below:
# "debug_files",
"debug_files",
# "release_files",
# "pdbs",
# "bcsymbolmaps",
@@ -199,6 +200,78 @@ def artifact_bundle_assemble(request, organization_slug):
return JsonResponse({"state": ChunkFileState.CREATED, "missingChunks": []})
@csrf_exempt  # we're in API context here; this could potentially be pulled up to a higher level though
@requires_auth_token
def difs_assemble(request, organization_slug, project_slug):
    """Report the state of the files described in the request body, assembling those whose chunks are all present.

    The request body is parsed as a JSON object keyed by whole-file checksum. Each value is a dict from which
    "chunks" (optional list of chunk checksums) is read, plus "name" and "debug_id" when the file is actually
    assembled. `organization_slug` and `project_slug` are not used by this view's body.

    Returns a JsonResponse keyed by the same checksums; every entry has a "state" and a "missingChunks" list:

    * OK, no missing chunks: a FileMetadata for that file checksum already existed, or the file was assembled now.
    * NOT_FOUND, no missing chunks: the file is unknown and the request listed no chunks for it.
    * NOT_FOUND, with missing chunks: some of the listed chunks are not (or no longer) stored.

    Exceptions raised by `json.loads`, by `assemble_file`, or by a missing "name"/"debug_id" key are not caught here.
    """
    # TODO move to tasks.something.delay
    # TODO think about the right transaction around this
    data = json.loads(request.body)

    file_checksums = set(data)

    # select_related: `f.file` is dereferenced for every row below; fetch it in the same query rather than doing one
    # extra query per FileMetadata.
    existing_files = {
        f.file.checksum: f
        for f in FileMetadata.objects.filter(file__checksum__in=file_checksums).select_related("file")
    }

    all_requested_chunks = {
        chunk
        for file_info in data.values()
        for chunk in file_info.get("chunks", [])
    }

    available_chunks = set(
        Chunk.objects.filter(checksum__in=all_requested_chunks).values_list("checksum", flat=True)
    )

    response = {}

    for file_checksum, file_info in data.items():
        if file_checksum in existing_files:
            response[file_checksum] = {
                "state": ChunkFileState.OK,
                "missingChunks": [],
                # "dif": serialize(existing_files[file_checksum]),  # TODO: figure out if this is required.
            }
            continue

        file_chunks = file_info.get("chunks", [])

        # the sentry-cli sends an empty "chunks" list when just polling for file existence; since we already handled
        # the case of existing files above, we can simply return NOT_FOUND here.
        if not file_chunks:
            response[file_checksum] = {
                "state": ChunkFileState.NOT_FOUND,
                "missingChunks": [],
            }
            continue

        missing_chunks = [c for c in file_chunks if c not in available_chunks]
        if missing_chunks:
            response[file_checksum] = {
                "state": ChunkFileState.NOT_FOUND,
                "missingChunks": missing_chunks,
            }
            continue

        file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"])

        # assemble_file deletes the chunks it consumed, so they are no longer available to any later file in this same
        # request; keep our snapshot in sync so such a file is reported as having missing chunks instead of failing
        # halfway through assembly.
        available_chunks.difference_update(file_chunks)

        FileMetadata.objects.get_or_create(
            debug_id=file_info["debug_id"],
            file_type="dif",  # I think? check!
            defaults={
                "file": file,
                "data": "{}",  # this is the "catch all" field but I don't think we have anything in this case.
            }
        )

        response[file_checksum] = {
            "state": ChunkFileState.OK,
            "missingChunks": [],
        }

    return JsonResponse(response)
@user_passes_test(lambda u: u.is_superuser)
@durable_atomic
def download_file(request, checksum):