mirror of
https://github.com/jlengrand/bugsink.git
synced 2026-03-09 23:51:20 +00:00
PoC of uploading sourcemap artifact bundles
* debug_id-only
* various TODOs (e.g. auth, async, retention); see #19
This commit is contained in:
@@ -77,6 +77,7 @@ BUGSINK_APPS = [
|
||||
'releases',
|
||||
'ingest',
|
||||
'issues',
|
||||
'files',
|
||||
'events',
|
||||
'tags',
|
||||
'alerts',
|
||||
|
||||
@@ -10,6 +10,7 @@ from teams.views import debug_email as debug_teams_email
|
||||
from bugsink.app_settings import get_settings
|
||||
from users.views import signup, confirm_email, resend_confirmation, request_reset_password, reset_password, preferences
|
||||
from ingest.views import download_envelope
|
||||
from files.views import chunk_upload, artifact_bundle_assemble
|
||||
|
||||
from .views import home, trigger_error, favicon, settings_view, silence_email_system_warning
|
||||
from .debug_views import csrf_debug
|
||||
@@ -38,6 +39,13 @@ urlpatterns = [
|
||||
# many user-related views are directly exposed above (/accounts/), the rest is here:
|
||||
path("users/", include("users.urls")),
|
||||
|
||||
# these are sentry-cli endpoint for uploading; they're unrelated to e.g. the ingestion API.
|
||||
# the /api/0/ is just a hard prefix (for the ingest API, that position indicates the project id, but here it's just
|
||||
# a prefix)
|
||||
path("api/0/organizations/<slug:organization_slug>/chunk-upload/", chunk_upload, name="chunk_upload"),
|
||||
path("api/0/organizations/<slug:organization_slug>/artifactbundle/assemble/", artifact_bundle_assemble,
|
||||
name="artifact_bundle_assemble"),
|
||||
|
||||
path('api/', include('ingest.urls')),
|
||||
|
||||
# not in /api/ because it's not part of the ingest API, but still part of the ingest app
|
||||
|
||||
0
files/__init__.py
Normal file
0
files/__init__.py
Normal file
22
files/admin.py
Normal file
22
files/admin.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from django.contrib import admin
|
||||
from .models import Chunk, File, FileMetadata
|
||||
|
||||
|
||||
@admin.register(Chunk)
class ChunkAdmin(admin.ModelAdmin):
    """Admin for raw uploaded chunks; the binary payload is exposed read-only."""

    list_display = ["checksum", "size"]
    search_fields = ["checksum"]
    readonly_fields = ["data"]
|
||||
|
||||
|
||||
@admin.register(File)
class FileAdmin(admin.ModelAdmin):
    """Admin for assembled files; the binary payload is exposed read-only."""

    list_display = ["checksum", "size"]
    search_fields = ["checksum"]
    readonly_fields = ["data"]
|
||||
|
||||
|
||||
@admin.register(FileMetadata)
class FileMetadataAdmin(admin.ModelAdmin):
    """Admin for per-file metadata records (debug_id / file_type lookups)."""

    list_display = ["debug_id", "file_type", "file"]
    search_fields = ["file__checksum", "debug_id", "file_type"]
|
||||
6
files/apps.py
Normal file
6
files/apps.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class FilesConfig(AppConfig):
    """Django app config for the "files" app (chunk/file storage for uploads)."""

    name = "files"
    default_auto_field = "django.db.models.BigAutoField"
|
||||
76
files/migrations/0001_initial.py
Normal file
76
files/migrations/0001_initial.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# Generated by Django 4.2.19 on 2025-04-10 08:15
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Initial schema for the "files" app (Chunk, File, FileMetadata).
    # Auto-generated from files/models.py — prefer regenerating over hand-editing.

    initial = True

    dependencies = []

    operations = [
        # Chunk: a raw uploaded blob, addressed by the sha1 checksum of its contents.
        migrations.CreateModel(
            name="Chunk",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("checksum", models.CharField(max_length=40, unique=True)),
                ("size", models.PositiveIntegerField()),
                ("data", models.BinaryField()),
            ],
        ),
        # File: an assembled file; currently structurally identical to Chunk.
        migrations.CreateModel(
            name="File",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("checksum", models.CharField(max_length=40, unique=True)),
                ("size", models.PositiveIntegerField()),
                ("data", models.BinaryField()),
            ],
        ),
        # FileMetadata: per-(debug_id, file_type) record pointing at the File with the data.
        migrations.CreateModel(
            name="FileMetadata",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("debug_id", models.UUIDField(blank=True, null=True)),
                ("file_type", models.CharField(blank=True, max_length=255, null=True)),
                ("data", models.TextField()),
                (
                    "file",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="metadatas",
                        to="files.file",
                    ),
                ),
            ],
            options={
                "unique_together": {("debug_id", "file_type")},
            },
        ),
    ]
|
||||
0
files/migrations/__init__.py
Normal file
0
files/migrations/__init__.py
Normal file
42
files/models.py
Normal file
42
files/models.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Chunk(models.Model):
    """A single uploaded chunk, addressed by the sha1 checksum of its contents."""

    # unique implies an index, which doubles as the index we use for lookups
    checksum = models.CharField(max_length=40, unique=True)
    size = models.PositiveIntegerField()
    # as with Events, the blob can "eventually" be moved out of the database
    data = models.BinaryField(null=False)

    def __str__(self):
        return self.checksum
|
||||
|
||||
|
||||
class File(models.Model):
    """An assembled file, addressed by the sha1 checksum of its contents."""

    # NOTE: as it stands, this is exactly the same thing as Chunk; and since we do single-chunk
    # uploads, optimizations are imaginable. Make it work first though.

    # unique implies an index, which doubles as the index we use for lookups
    checksum = models.CharField(max_length=40, unique=True)
    size = models.PositiveIntegerField()
    # as with Events, the blob can "eventually" be moved out of the database
    data = models.BinaryField(null=False)

    def __str__(self):
        return self.checksum
|
||||
|
||||
|
||||
class FileMetadata(models.Model):
    """Per-(debug_id, file_type) metadata record pointing at the File that holds the data."""

    file = models.ForeignKey(File, null=False, on_delete=models.CASCADE, related_name="metadatas")

    # debug_id & file_type nullability: such data exists in manifest.json; we are future-proof for it although we
    # currently don't store it as such.
    # NOTE: UUIDField forces max_length to 32 internally, so the max_length=40 that used to be passed here was a
    # silent no-op (the generated migration has no max_length on this field either); dropped to avoid confusion.
    debug_id = models.UUIDField(null=True, blank=True)
    file_type = models.CharField(max_length=255, null=True, blank=True)
    data = models.TextField()  # we just dump the rest in here; let's see how much we really need.

    def __str__(self):
        # somewhat useless when debug_id is None; but that's not the case we care about ATM
        return f"debug_id: {self.debug_id} ({self.file_type})"

    class Meta:
        # it's _imaginable_ that the below does not actually hold (we just trust the CLI, after all), but that wouldn't
        # make any sense, so we just enforce a property that makes sense. Pro: lookups work. Con: if the client sends
        # garbage, this is not exposed.
        unique_together = (("debug_id", "file_type"),)
|
||||
1
files/tests.py
Normal file
1
files/tests.py
Normal file
@@ -0,0 +1 @@
|
||||
# from django.test import TestCase
|
||||
224
files/views.py
Normal file
224
files/views.py
Normal file
@@ -0,0 +1,224 @@
|
||||
from zipfile import ZipFile
|
||||
import json
|
||||
from hashlib import sha1
|
||||
from gzip import GzipFile
|
||||
from io import BytesIO
|
||||
|
||||
from django.http import JsonResponse, HttpResponse
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
|
||||
from sentry.assemble import ChunkFileState
|
||||
|
||||
from bugsink.app_settings import get_settings
|
||||
|
||||
from .models import Chunk, File, FileMetadata
|
||||
|
||||
|
||||
_KIBIBYTE = 1024
|
||||
_MEBIBYTE = 1024 * _KIBIBYTE
|
||||
|
||||
|
||||
class NamedBytesIO(BytesIO):
    """An in-memory byte buffer that also carries a ``name``, mimicking uploaded-file objects."""

    def __init__(self, data, name):
        # record the name first; BytesIO's initializer does not touch it
        self.name = name
        super().__init__(data)
|
||||
|
||||
|
||||
def get_chunk_upload_settings(request, organization_slug):
    """Return (as JSON) the parameters sentry-cli reads before uploading chunks.

    Sentry / Sentry-CLI has a whole bunch of logic surrounding URLs, which I do not understand and which presumably
    doesn't make it past Bugsink's cost/benefit-analysis. For now, we just return our own URL, which seems to
    "just work". Starting points if we ever want to go down this path:

    * https://github.com/getsentry/sentry/pull/7095/files <= upload-url-prefix: introduced, rationale not explained
    * https://github.com/getsentry/sentry-cli/issues/839
    * https://github.com/getsentry/sentry/pull/29347
    """
    url = f"{get_settings().BASE_URL}/api/0/organizations/{organization_slug}/chunk-upload/"

    # Our "chunk_upload" is chunked in name only; i.e. we only "speak chunked" for API-compatibility with
    # sentry-cli; the params below are chosen such that that cli will only ever send a single chunk.
    payload = {
        "url": url,

        # For now, staying close to the default MAX_ENVELOPE_COMPRESSED_SIZE, which is 20MiB;
        # I _think_ I saw a note somewhere on (one of) these values having to be a power of 2; hence 32 here.
        #
        # When implementing uploading, it was done to support sourcemaps. It seems that over at Sentry, the
        # reason they went so complicated in the first place was to enable DIF support (hundreds of MiB regularly).
        "chunkSize": 32 * _MEBIBYTE,
        "maxRequestSize": 32 * _MEBIBYTE,

        # I didn't check the supposed relationship between maxRequestSize and maxFileSize, but assume something
        # similar to what happens w/ envelopes; hence harmonizing with MAX_ENVELOPE_SIZE (rounded up to a power
        # of 2) here.
        "maxFileSize": 128 * _MEBIBYTE,

        # force single-chunk by setting these to 1.
        "concurrency": 1,
        "chunksPerRequest": 1,

        "hashAlgorithm": "sha1",
        "compression": ["gzip"],

        "accept": [
            # I don't claim to fully understand how the sentry-cli switches based on these advertised
            # capabilities, but the list below works for now. For a full list of types we _could_ accept, see
            # src/sentry/api/endpoints/chunk.py

            # If the below is off, sentry-cli complains "A release slug is required". Because release-less
            # artifacts are actually the simpler thing, that's undesirable. Other consequences of turning it on
            # have not been charted yet.
            "release_files",

            # this would seem to be the "javascript sourcemaps" thing, but how exactly I did not check yet.
            "sources",

            # https://github.com/getsentry/sentry/discussions/46967
            # artifact_bundles is a concept originating from sentry that uses debug_ids to link maps & sources.
            # Despite it being relatively new, it's my _first_ target for getting sourcemaps to work, because
            # it's actually the most simple and reliable thing (uuid, bidirectional mapping)
            "artifact_bundles",

            # AFAIU the only thing _v2 would signify is the ability to "Implement de-duplication with chunking
            # in the assemble endpoint for artifact bundles (#51224)". Which is needlessly complex from my point
            # of view.
            # "artifact_bundles_v2",

            # the rest of the options are:
            # "debug_files", "pdbs", "bcsymbolmaps", "il2cpp", "portablepdbs", "proguard",
        ],
    }

    return JsonResponse(payload)
|
||||
|
||||
|
||||
@csrf_exempt
def chunk_upload(request, organization_slug):
    """sentry-cli chunk-upload endpoint: GET returns upload settings, POST stores chunks.

    TODO authenticate
    """
    # Bugsink has a single-organization model; we simply ignore organization_slug
    # NOTE: we don't check against chunkSize, maxRequestSize and chunksPerRequest (yet), we expect the CLI to behave.

    if request.method == "GET":
        # a GET at this endpoint returns a dict of settings that the CLI takes into account when uploading
        return get_chunk_upload_settings(request, organization_slug)

    # POST: upload (full-size) "chunks" and store them as Chunk objects; file.name should be the sha1 of the content.
    chunks = []
    if request.FILES:
        chunks = request.FILES.getlist("file")

    # NOTE: we read the whole unzipped file into memory; we _could_ take an approach like bugsink/streams.py.
    # (Note that, because of the auth layer in front, we're slightly less worried about adverserial scenarios)
    chunks += [
        NamedBytesIO(GzipFile(fileobj=file_gzip, mode="rb").read(), name=file_gzip.name)
        for file_gzip in request.FILES.getlist("file_gzip")]

    for chunk in chunks:
        data = chunk.getvalue()

        if sha1(data).hexdigest() != chunk.name:
            raise Exception("checksum mismatch")

        Chunk.objects.get_or_create(
            checksum=chunk.name,
            defaults={
                "size": len(data),
                "data": data,  # NOTE: further possible optimization: don't even read the file when already existing
            })

        # (removed: a leftover debug write of the chunk data to /tmp/chunk.zip, which was marked for removal)

    return HttpResponse()
|
||||
|
||||
|
||||
def assemble_artifact_bundle(bundle_checksum, chunk_checksums):
    """Assemble the uploaded chunks into an artifact-bundle zip and index its contents.

    NOTE: as it stands we don't store the (optional) extra info of release/dist.

    NOTE: there's also the concept of an artifact bundle as _tied_ to a release, i.e. without debug_ids. We don't
    support that, but if we ever were to support it we'd need a separate method/param to distinguish it.
    """
    bundle_file, _ = assemble_file(bundle_checksum, chunk_checksums)

    bundle_zip = ZipFile(BytesIO(bundle_file.data))  # NOTE: in-memory handling of zips.
    manifest = json.loads(bundle_zip.read("manifest.json").decode("utf-8"))

    for filename, manifest_entry in manifest["files"].items():
        file_data = bundle_zip.read(filename)

        file, _ = File.objects.get_or_create(
            checksum=sha1(file_data).hexdigest(),
            defaults={
                "size": len(file_data),
                "data": file_data,
            })

        debug_id = manifest_entry.get("headers", {}).get("debug-id", None)
        file_type = manifest_entry.get("type", None)

        if debug_id is None or file_type is None:
            # such records exist and we could store them, but we don't, since we don't have a purpose for them.
            continue

        FileMetadata.objects.get_or_create(
            debug_id=debug_id,
            file_type=file_type,
            defaults={
                "file": file,
                "data": json.dumps(manifest_entry),
            })

    # NOTE we _could_ get rid of the file at this point (but we don't). Ties in to broader questions of retention.
|
||||
|
||||
|
||||
def assemble_file(checksum, chunk_checksums):
    """Concatenate the named chunks (in order), verify the sha1, and store the result as a File.

    Returns the (File, created) pair from get_or_create.

    NOTE: unimplemented checks/tricks
    * total file-size v.s. some max
    * explicit check chunk availability (as it stands, our processing is synchronous, so no need)
    * skip-on-checksum-exists
    """
    by_checksum = {
        chunk.checksum: chunk
        for chunk in Chunk.objects.filter(checksum__in=chunk_checksums)
    }

    # the lookup below implicitly checks chunk availability (KeyError on a missing chunk)
    data = b"".join(by_checksum[chunk_checksum].data for chunk_checksum in chunk_checksums)

    if sha1(data).hexdigest() != checksum:
        raise Exception("checksum mismatch")

    return File.objects.get_or_create(
        checksum=checksum,
        defaults={
            "size": len(data),
            "data": data,
        })
|
||||
|
||||
|
||||
@csrf_exempt  # we're in API context here; this could potentially be pulled up to a higher level though
def artifact_bundle_assemble(request, organization_slug):
    """sentry-cli endpoint: assemble previously uploaded chunks into an artifact bundle.

    TODO authenticate
    Bugsink has a single-organization model; we simply ignore organization_slug.
    """
    # NOTE a JSON-schema for this endpoint is available under Apache 2 license (2 year anniversary rule) at
    # https://github.com/getsentry/sentry/blob/8df7543848b4/src/sentry/api/endpoints/organization_artifactbundle_assemble.py#L24
    # (not worth the trouble of extracting right now, since our /sentry dir contains BSD-3 licensed code (2019 version)

    body = json.loads(request.body)
    assemble_artifact_bundle(body["checksum"], body["chunks"])

    # NOTE sentry & glitchtip _always_ return an empty list for "missingChunks" in this view; I don't really understand
    # what's being achieved with that, but it seems to be the expected behavior. Working hypothesis: this was introduced
    # for DIF uploads, and the present endpoint doesn't use it at all. Not even for "v2", surprisingly.

    # NOTE: as it stands, we process the bundle inline, so arguably we could return "OK" here too; "CREATED" is what
    # sentry returns though, so for faithful mimicking it's the safest bet.
    return JsonResponse({"state": ChunkFileState.CREATED, "missingChunks": []})
|
||||
@@ -46,6 +46,7 @@ include = [
|
||||
"ee*",
|
||||
"ingest*",
|
||||
"issues*",
|
||||
"files*",
|
||||
"performance*",
|
||||
"phonehome*",
|
||||
"projects*",
|
||||
|
||||
16
sentry/assemble.py
Normal file
16
sentry/assemble.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# from src/sentry/tasks/assemble.py
|
||||
|
||||
|
||||
def enum(**named_values):
    """Build an ad-hoc enum-like type whose class attributes are the given named values."""
    return type("Enum", (), dict(named_values))


# lifecycle states reported back to sentry-cli for an uploaded/assembled file
ChunkFileState = enum(
    OK="ok",  # File in database
    NOT_FOUND="not_found",  # File not found in database
    CREATED="created",  # File was created in the request and sent to the worker for assembling
    ASSEMBLING="assembling",  # File still being processed by worker
    ERROR="error",  # Error happened during assembling
)
|
||||
|
||||
Reference in New Issue
Block a user