diff --git a/bugsink/settings/default.py b/bugsink/settings/default.py
index b7b45bb..6e3e06a 100644
--- a/bugsink/settings/default.py
+++ b/bugsink/settings/default.py
@@ -77,6 +77,7 @@ BUGSINK_APPS = [
     'releases',
     'ingest',
     'issues',
+    'files',
     'events',
     'tags',
     'alerts',
diff --git a/bugsink/urls.py b/bugsink/urls.py
index 84243c1..80a32f5 100644
--- a/bugsink/urls.py
+++ b/bugsink/urls.py
@@ -10,6 +10,7 @@ from teams.views import debug_email as debug_teams_email
 from bugsink.app_settings import get_settings
 from users.views import signup, confirm_email, resend_confirmation, request_reset_password, reset_password, preferences
 from ingest.views import download_envelope
+from files.views import chunk_upload, artifact_bundle_assemble
 from .views import home, trigger_error, favicon, settings_view, silence_email_system_warning
 from .debug_views import csrf_debug
 
@@ -38,6 +39,13 @@ urlpatterns = [
     # many user-related views are directly exposed above (/accounts/), the rest is here:
     path("users/", include("users.urls")),
+    # these are sentry-cli endpoints for uploading; they're unrelated to e.g. the ingestion API.
+    # the /api/0/ is just a hard prefix (for the ingest API, that position indicates the project id, but here it's just
+    # a prefix)
+    path("api/0/organizations/<slug:organization_slug>/chunk-upload/", chunk_upload, name="chunk_upload"),
+    path("api/0/organizations/<slug:organization_slug>/artifactbundle/assemble/", artifact_bundle_assemble,
+         name="artifact_bundle_assemble"),
+
     path('api/', include('ingest.urls')),  # not in /api/ because it's not part of the ingest API, but still part of the ingest app
diff --git a/files/__init__.py b/files/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/files/admin.py b/files/admin.py
new file mode 100644
index 0000000..532296e
--- /dev/null
+++ b/files/admin.py
@@ -0,0 +1,22 @@
+from django.contrib import admin
+from .models import Chunk, File, FileMetadata
+
+
+@admin.register(Chunk)
+class ChunkAdmin(admin.ModelAdmin):
+    list_display = ('checksum', 'size')
+    search_fields = ('checksum',)
+    readonly_fields = ('data',)
+
+
+@admin.register(File)
+class FileAdmin(admin.ModelAdmin):
+    list_display = ('checksum', 'size')
+    search_fields = ('checksum',)
+    readonly_fields = ('data',)
+
+
+@admin.register(FileMetadata)
+class FileMetadataAdmin(admin.ModelAdmin):
+    list_display = ('debug_id', 'file_type', 'file')
+    search_fields = ('file__checksum', 'debug_id', 'file_type')
diff --git a/files/apps.py b/files/apps.py
new file mode 100644
index 0000000..a5b5712
--- /dev/null
+++ b/files/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class FilesConfig(AppConfig):
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "files"
diff --git a/files/migrations/0001_initial.py b/files/migrations/0001_initial.py
new file mode 100644
index 0000000..969dda4
--- /dev/null
+++ b/files/migrations/0001_initial.py
@@ -0,0 +1,76 @@
+# Generated by Django 4.2.19 on 2025-04-10 08:15
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = []
+
+    operations = [
+        migrations.CreateModel(
+            name="Chunk",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("checksum", models.CharField(max_length=40, unique=True)),
+                ("size", models.PositiveIntegerField()),
+                ("data", models.BinaryField()),
+            ],
+        ),
+        migrations.CreateModel(
+            name="File",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("checksum", models.CharField(max_length=40, unique=True)),
+                ("size", models.PositiveIntegerField()),
+                ("data", models.BinaryField()),
+            ],
+        ),
+        migrations.CreateModel(
+            name="FileMetadata",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("debug_id", models.UUIDField(blank=True, null=True)),
+                ("file_type", models.CharField(blank=True, max_length=255, null=True)),
+                ("data", models.TextField()),
+                (
+                    "file",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        related_name="metadatas",
+                        to="files.file",
+                    ),
+                ),
+            ],
+            options={
+                "unique_together": {("debug_id", "file_type")},
+            },
+        ),
+    ]
diff --git a/files/migrations/__init__.py b/files/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/files/models.py b/files/models.py
new file mode 100644
index 0000000..6c58219
--- /dev/null
+++ b/files/models.py
@@ -0,0 +1,42 @@
+from django.db import models
+
+
+class Chunk(models.Model):
+    checksum = models.CharField(max_length=40, unique=True)  # unique implies index, which we also use for lookups
+    size = models.PositiveIntegerField()
+    data = models.BinaryField(null=False)  # as with Events, we can "eventually" move this out of the database
+
+    def __str__(self):
+        return self.checksum
+
+
+class File(models.Model):
+    # NOTE: as it stands, this is exactly the same thing as Chunk; and since we do single-chunk uploads, optimizations
+    # are imaginable. Make it work first though
+
+    checksum = models.CharField(max_length=40, unique=True)  # unique implies index, which we also use for lookups
+    size = models.PositiveIntegerField()
+    data = models.BinaryField(null=False)  # as with Events, we can "eventually" move this out of the database
+
+    def __str__(self):
+        return self.checksum
+
+
+class FileMetadata(models.Model):
+    file = models.ForeignKey(File, null=False, on_delete=models.CASCADE, related_name="metadatas")
+
+    # debug_id & file_type nullability: such data exists in manifest.json; we are future-proof for it although we
+    # currently don't store it as such.
+    # NOTE: no max_length here: Django's UUIDField hard-codes max_length=32 internally and silently overrides any
+    # value passed in (the generated migration confirms this — it carries no max_length).
+    debug_id = models.UUIDField(null=True, blank=True)
+    file_type = models.CharField(max_length=255, null=True, blank=True)
+    data = models.TextField()  # we just dump the rest in here; let's see how much we really need.
+
+    def __str__(self):
+        # somewhat useless when debug_id is None; but that's not the case we care about ATM
+        return f"debug_id: {self.debug_id} ({self.file_type})"
+
+    class Meta:
+        # it's _imaginable_ that the below does not actually hold (we just trust the CLI, after all), but that wouldn't
+        # make any sense, so we just enforce a property that makes sense. Pro: lookups work. Con: if the client sends
+        # garbage, this is not exposed.
+        unique_together = (("debug_id", "file_type"),)
diff --git a/files/tests.py b/files/tests.py
new file mode 100644
index 0000000..e4defab
--- /dev/null
+++ b/files/tests.py
@@ -0,0 +1 @@
+# from django.test import TestCase
diff --git a/files/views.py b/files/views.py
new file mode 100644
index 0000000..02854f5
--- /dev/null
+++ b/files/views.py
@@ -0,0 +1,224 @@
+from zipfile import ZipFile
+import json
+from hashlib import sha1
+from gzip import GzipFile
+from io import BytesIO
+
+from django.http import JsonResponse, HttpResponse
+from django.views.decorators.csrf import csrf_exempt
+
+from sentry.assemble import ChunkFileState
+
+from bugsink.app_settings import get_settings
+
+from .models import Chunk, File, FileMetadata
+
+
+_KIBIBYTE = 1024
+_MEBIBYTE = 1024 * _KIBIBYTE
+
+
+class NamedBytesIO(BytesIO):
+    # a BytesIO that also carries a filename, mirroring the interface of Django's UploadedFile where we need it
+    def __init__(self, data, name):
+        super().__init__(data)
+        self.name = name
+
+
+def get_chunk_upload_settings(request, organization_slug):
+    # Sentry / Sentry-CLI has a whole bunch of logic surrounding URLs, which I do not understand and which presumably
+    # doesn't make it past Bugsink's cost/benefit-analysis. feature-completeness. For now, we just return our own URL
+    # which seems to "just work". If we ever want to go down this path:
+    #
+    # https://github.com/getsentry/sentry/pull/7095/files <= upload-url-prefix: introduced, but rationale not explained
+    #
+    # 2 more starting points for the whole "relative" idea
+    # * https://github.com/getsentry/sentry-cli/issues/839
+    # * https://github.com/getsentry/sentry/pull/29347
+    url = get_settings().BASE_URL + "/api/0/organizations/" + organization_slug + "/chunk-upload/"
+
+    # Our "chunk_upload" is chunked in name only; i.e. we only "speak chunked" for the purpose of API-compatibility
+    # with sentry-cli, but we provide params here such that that cli will only send a single chunk.
+
+    return JsonResponse({
+        "url": url,
+
+        # For now, staying close to the default MAX_ENVELOPE_COMPRESSED_SIZE, which is 20MiB;
+        # I _think_ I saw a note somewhere on (one of) these values having to be a power of 2; hence 32 here.
+        #
+        # When implementing uploading, it was done to support sourcemaps. It seems that over at Sentry, the reason they
+        # went so complicated in the first place was to enable DIF support (hundreds of MiB regularly).
+        "chunkSize": 32 * _MEBIBYTE,
+        "maxRequestSize": 32 * _MEBIBYTE,
+
+        # I didn't check the supposed relationship between maxRequestSize and maxFileSize, but assume something similar
+        # to what happens w/ envelopes; hence harmonizing with MAX_ENVELOPE_SIZE (and rounding up to a power of 2) here
+        "maxFileSize": 128 * _MEBIBYTE,
+
+        # force single-chunk by setting these to 1.
+        "concurrency": 1,
+        "chunksPerRequest": 1,
+
+        "hashAlgorithm": "sha1",
+        "compression": ["gzip"],
+
+        "accept": [
+            # I don't claim to fully understand how the sentry-cli switches based on these advertised capabilities, but
+            # the list below works for now. Any understanding that I did gain is documented.
+            # for a full list of types we _could_ accept, see src/sentry/api/endpoints/chunk.py
+            #
+
+            # If the below is off, sentry-cli complains "A release slug is required". Because release-less artifacts are
+            # actually the simpler thing, that's undesirable. Other consequences of turning it on have not been charted
+            # yet.
+            "release_files",
+
+            # this would seem to be the "javascript sourcemaps" thing, but how exactly I did not check yet.
+            "sources",
+
+            # https://github.com/getsentry/sentry/discussions/46967
+            # artifact_bundles is a concept originating from sentry that uses debug_ids to link maps & sources. Despite
+            # it being relatively new, it's my _first_ target for getting sourcemaps to work, because it's actually the
+            # most simple and reliable thing (uuid, bidirectional mapping)
+            "artifact_bundles",
+
+            # AFAIU the only thing _v2 would signify is the ability to "Implement de-duplication with chunking in the
+            # assemble endpoint for artifact bundles (#51224)". Which is needlessly complex from my point of view.
+            # "artifact_bundles_v2",
+
+            # the rest of the options are below:
+            # "debug_files",
+            # "release_files",
+            # "pdbs",
+            # "bcsymbolmaps",
+            # "il2cpp",
+            # "portablepdbs",
+            # "artifact_bundles",
+            # "proguard",
+        ]
+    })
+
+
+@csrf_exempt
+def chunk_upload(request, organization_slug):
+    # TODO authenticate
+    # Bugsink has a single-organization model; we simply ignore organization_slug
+    # NOTE: we don't check against chunkSize, maxRequestSize and chunksPerRequest (yet), we expect the CLI to behave.
+
+    if request.method == "GET":
+        # a GET at this endpoint returns a dict of settings that the CLI takes into account when uploading
+        return get_chunk_upload_settings(request, organization_slug)
+
+    # POST: upload (full-size) "chunks" and store them as Chunk objects; file.name should be the sha1 of the content.
+    chunks = []
+    if request.FILES:
+        chunks = request.FILES.getlist("file")
+
+    # NOTE: we read the whole unzipped file into memory; we _could_ take an approach like bugsink/streams.py.
+    # (Note that, because of the auth layer in front, we're slightly less worried about adversarial scenarios)
+    chunks += [
+        NamedBytesIO(GzipFile(fileobj=file_gzip, mode="rb").read(), name=file_gzip.name)
+        for file_gzip in request.FILES.getlist("file_gzip")]
+
+    for chunk in chunks:
+        # .read() is the interface shared by both kinds of object in `chunks`: Django's UploadedFile (plain uploads,
+        # which has no .getvalue()) and our freshly-constructed NamedBytesIO (gzipped uploads, position still 0).
+        data = chunk.read()
+
+        if sha1(data).hexdigest() != chunk.name:
+            raise Exception("checksum mismatch")
+
+        _, _ = Chunk.objects.get_or_create(
+            checksum=chunk.name,
+            defaults={
+                "size": len(data),
+                "data": data,  # NOTE: further possible optimization: don't even read the file when already existing
+            })
+
+    return HttpResponse()
+
+
+def assemble_artifact_bundle(bundle_checksum, chunk_checksums):
+    # NOTE: as it stands we don't store the (optional) extra info of release/dist.
+
+    # NOTE: there's also the concept of an artifact bundle as _tied_ to a release, i.e. without debug_ids. We don't
+    # support that, but if we ever were to support it we'd need a separate method/param to distinguish it.
+
+    bundle_file, _ = assemble_file(bundle_checksum, chunk_checksums)
+
+    bundle_zip = ZipFile(BytesIO(bundle_file.data))  # NOTE: in-memory handling of zips.
+    manifest_bytes = bundle_zip.read("manifest.json")
+    manifest = json.loads(manifest_bytes.decode("utf-8"))
+
+    for filename, manifest_entry in manifest["files"].items():
+        file_data = bundle_zip.read(filename)
+
+        checksum = sha1(file_data).hexdigest()
+
+        file, _ = File.objects.get_or_create(
+            checksum=checksum,
+            defaults={
+                "size": len(file_data),
+                "data": file_data,
+            })
+
+        debug_id = manifest_entry.get("headers", {}).get("debug-id", None)
+        file_type = manifest_entry.get("type", None)
+        if debug_id is None or file_type is None:
+            # such records exist and we could store them, but we don't, since we don't have a purpose for them.
+            continue
+
+        FileMetadata.objects.get_or_create(
+            debug_id=debug_id,
+            file_type=file_type,
+            defaults={
+                "file": file,
+                "data": json.dumps(manifest_entry),
+            }
+        )
+
+    # NOTE we _could_ get rid of the file at this point (but we don't). Ties in to broader questions of retention.
+
+
+def assemble_file(checksum, chunk_checksums):
+    """Assembles a file from chunks"""
+
+    # NOTE: unimplemented checks/tricks
+    # * total file-size v.s. some max
+    # * explicit check chunk availability (as it stands, our processing is synchronous, so no need)
+    # * skip-on-checksum-exists
+
+    chunks = Chunk.objects.filter(checksum__in=chunk_checksums)
+    chunks_dicts = {chunk.checksum: chunk for chunk in chunks}
+    chunks_in_order = [chunks_dicts[checksum] for checksum in chunk_checksums]  # implicitly checks chunk availability
+    data = b"".join([chunk.data for chunk in chunks_in_order])
+
+    if sha1(data).hexdigest() != checksum:
+        raise Exception("checksum mismatch")
+
+    return File.objects.get_or_create(
+        checksum=checksum,
+        defaults={
+            "size": len(data),
+            "data": data,
+        })
+
+
+@csrf_exempt  # we're in API context here; this could potentially be pulled up to a higher level though
+def artifact_bundle_assemble(request, organization_slug):
+    # TODO authenticate
+    # Bugsink has a single-organization model; we simply ignore organization_slug
+
+    # NOTE a JSON-schema for this endpoint is available under Apache 2 license (2 year anniversary rule) at
+    # https://github.com/getsentry/sentry/blob/8df7543848b4/src/sentry/api/endpoints/organization_artifactbundle_assemble.py#L24
+    # (not worth the trouble of extracting right now, since our /sentry dir contains BSD-3 licensed code (2019 version)
+
+    data = json.loads(request.body)
+    assemble_artifact_bundle(data["checksum"], data["chunks"])
+
+    # NOTE sentry & glitchtip _always_ return an empty list for "missingChunks" in this view; I don't really understand
+    # what's being achieved with that, but it seems to be the expected behavior. Working hypothesis: this was introduced
+    # for DIF uploads, and the present endpoint doesn't use it at all. Not even for "v2", surprisingly.
+
+    # NOTE: as it stands, we process the bundle inline, so arguably we could return "OK" here too; "CREATED" is what
+    # sentry returns though, so for faithful mimicking it's the safest bet.
+    return JsonResponse({"state": ChunkFileState.CREATED, "missingChunks": []})
diff --git a/pyproject.toml b/pyproject.toml
index 9df83b2..d0e7dbe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ include = [
     "ee*",
     "ingest*",
     "issues*",
+    "files*",
    "performance*",
     "phonehome*",
     "projects*",
diff --git a/sentry/assemble.py b/sentry/assemble.py
new file mode 100644
index 0000000..ee8c10f
--- /dev/null
+++ b/sentry/assemble.py
@@ -0,0 +1,16 @@
+# from src/sentry/tasks/assemble.py
+
+
+def enum(**named_values):
+    """Creates an enum type."""
+    return type("Enum", (), named_values)
+
+
+ChunkFileState = enum(
+    OK="ok",  # File in database
+    NOT_FOUND="not_found",  # File not found in database
+    CREATED="created",  # File was created in the request and sent to the worker for assembling
+    ASSEMBLING="assembling",  # File still being processed by worker
+    ERROR="error",  # Error happened during assembling
+)