mirror of
https://github.com/jlengrand/bugsink.git
synced 2026-03-09 23:51:20 +00:00
PoC of uploading sourcemap artifact bundles
* debug_id-only
* various TODOs (e.g. auth, async, retention); see #19
This commit is contained in:
@@ -77,6 +77,7 @@ BUGSINK_APPS = [
|
||||
'releases',
|
||||
'ingest',
|
||||
'issues',
|
||||
'files',
|
||||
'events',
|
||||
'tags',
|
||||
'alerts',
|
||||
|
||||
@@ -10,6 +10,7 @@ from teams.views import debug_email as debug_teams_email
|
||||
from bugsink.app_settings import get_settings
|
||||
from users.views import signup, confirm_email, resend_confirmation, request_reset_password, reset_password, preferences
|
||||
from ingest.views import download_envelope
|
||||
from files.views import chunk_upload, artifact_bundle_assemble
|
||||
|
||||
from .views import home, trigger_error, favicon, settings_view, silence_email_system_warning
|
||||
from .debug_views import csrf_debug
|
||||
@@ -38,6 +39,13 @@ urlpatterns = [
|
||||
# many user-related views are directly exposed above (/accounts/), the rest is here:
|
||||
path("users/", include("users.urls")),
|
||||
|
||||
# these are sentry-cli endpoint for uploading; they're unrelated to e.g. the ingestion API.
|
||||
# the /api/0/ is just a hard prefix (for the ingest API, that position indicates the project id, but here it's just
|
||||
# a prefix)
|
||||
path("api/0/organizations/<slug:organization_slug>/chunk-upload/", chunk_upload, name="chunk_upload"),
|
||||
path("api/0/organizations/<slug:organization_slug>/artifactbundle/assemble/", artifact_bundle_assemble,
|
||||
name="artifact_bundle_assemble"),
|
||||
|
||||
path('api/', include('ingest.urls')),
|
||||
|
||||
# not in /api/ because it's not part of the ingest API, but still part of the ingest app
|
||||
|
||||
0
files/__init__.py
Normal file
0
files/__init__.py
Normal file
22
files/admin.py
Normal file
22
files/admin.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from django.contrib import admin
|
||||
from .models import Chunk, File, FileMetadata
|
||||
|
||||
|
||||
@admin.register(Chunk)
class ChunkAdmin(admin.ModelAdmin):
    """Admin for raw uploaded chunks; the binary payload is exposed read-only."""

    list_display = ["checksum", "size"]
    search_fields = ["checksum"]
    readonly_fields = ["data"]
|
||||
|
||||
|
||||
@admin.register(File)
class FileAdmin(admin.ModelAdmin):
    """Admin for assembled files; the binary payload is exposed read-only."""

    list_display = ["checksum", "size"]
    search_fields = ["checksum"]
    readonly_fields = ["data"]
|
||||
|
||||
|
||||
@admin.register(FileMetadata)
class FileMetadataAdmin(admin.ModelAdmin):
    """Admin for per-file metadata records (debug_id / file_type lookups)."""

    list_display = ["debug_id", "file_type", "file"]
    search_fields = ["file__checksum", "debug_id", "file_type"]
|
||||
6
files/apps.py
Normal file
6
files/apps.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class FilesConfig(AppConfig):
    """Django app config for the "files" app (chunk/file storage for uploads)."""

    name = "files"
    default_auto_field = "django.db.models.BigAutoField"
|
||||
76
files/migrations/0001_initial.py
Normal file
76
files/migrations/0001_initial.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# Generated by Django 4.2.19 on 2025-04-10 08:15
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Initial schema for the "files" app (Chunk, File, FileMetadata).
    # Auto-generated from files/models.py — prefer regenerating over hand-editing.

    initial = True

    dependencies = []

    operations = [
        # Chunk: a raw uploaded blob, addressed by the sha1 checksum of its contents.
        migrations.CreateModel(
            name="Chunk",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("checksum", models.CharField(max_length=40, unique=True)),
                ("size", models.PositiveIntegerField()),
                ("data", models.BinaryField()),
            ],
        ),
        # File: an assembled file; currently structurally identical to Chunk.
        migrations.CreateModel(
            name="File",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("checksum", models.CharField(max_length=40, unique=True)),
                ("size", models.PositiveIntegerField()),
                ("data", models.BinaryField()),
            ],
        ),
        # FileMetadata: per-(debug_id, file_type) record pointing at the File with the data.
        migrations.CreateModel(
            name="FileMetadata",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("debug_id", models.UUIDField(blank=True, null=True)),
                ("file_type", models.CharField(blank=True, max_length=255, null=True)),
                ("data", models.TextField()),
                (
                    "file",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="metadatas",
                        to="files.file",
                    ),
                ),
            ],
            options={
                "unique_together": {("debug_id", "file_type")},
            },
        ),
    ]
|
||||
0
files/migrations/__init__.py
Normal file
0
files/migrations/__init__.py
Normal file
42
files/models.py
Normal file
42
files/models.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Chunk(models.Model):
    """A single uploaded chunk, addressed by the sha1 checksum of its contents."""

    # unique implies an index, which doubles as the index we use for lookups
    checksum = models.CharField(max_length=40, unique=True)
    size = models.PositiveIntegerField()
    # as with Events, the blob can "eventually" be moved out of the database
    data = models.BinaryField(null=False)

    def __str__(self):
        return self.checksum
|
||||
|
||||
|
||||
class File(models.Model):
    """An assembled file, addressed by the sha1 checksum of its contents."""

    # NOTE: as it stands, this is exactly the same thing as Chunk; and since we do single-chunk
    # uploads, optimizations are imaginable. Make it work first though.

    # unique implies an index, which doubles as the index we use for lookups
    checksum = models.CharField(max_length=40, unique=True)
    size = models.PositiveIntegerField()
    # as with Events, the blob can "eventually" be moved out of the database
    data = models.BinaryField(null=False)

    def __str__(self):
        return self.checksum
|
||||
|
||||
|
||||
class FileMetadata(models.Model):
    """Per-(debug_id, file_type) metadata record pointing at the File that holds the data."""

    file = models.ForeignKey(File, null=False, on_delete=models.CASCADE, related_name="metadatas")

    # debug_id & file_type nullability: such data exists in manifest.json; we are future-proof for it although we
    # currently don't store it as such.
    # NOTE: UUIDField forces max_length to 32 internally, so the max_length=40 that used to be passed here was a
    # silent no-op (the generated migration has no max_length on this field either); dropped to avoid confusion.
    debug_id = models.UUIDField(null=True, blank=True)
    file_type = models.CharField(max_length=255, null=True, blank=True)
    data = models.TextField()  # we just dump the rest in here; let's see how much we really need.

    def __str__(self):
        # somewhat useless when debug_id is None; but that's not the case we care about ATM
        return f"debug_id: {self.debug_id} ({self.file_type})"

    class Meta:
        # it's _imaginable_ that the below does not actually hold (we just trust the CLI, after all), but that wouldn't
        # make any sense, so we just enforce a property that makes sense. Pro: lookups work. Con: if the client sends
        # garbage, this is not exposed.
        unique_together = (("debug_id", "file_type"),)
|
||||
1
files/tests.py
Normal file
1
files/tests.py
Normal file
@@ -0,0 +1 @@
|
||||
# from django.test import TestCase
|
||||
224
files/views.py
Normal file
224
files/views.py
Normal file
@@ -0,0 +1,224 @@
|
||||
from zipfile import ZipFile
|
||||
import json
|
||||
from hashlib import sha1
|
||||
from gzip import GzipFile
|
||||
from io import BytesIO
|
||||
|
||||
from django.http import JsonResponse, HttpResponse
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
|
||||
from sentry.assemble import ChunkFileState
|
||||
|
||||
from bugsink.app_settings import get_settings
|
||||
|
||||
from .models import Chunk, File, FileMetadata
|
||||
|
||||
|
||||
_KIBIBYTE = 1024
|
||||
_MEBIBYTE = 1024 * _KIBIBYTE
|
||||
|
||||
|
||||
class NamedBytesIO(BytesIO):
    """An in-memory byte buffer that also carries a ``name``, mimicking uploaded-file objects."""

    def __init__(self, data, name):
        # record the name first; BytesIO's initializer does not touch it
        self.name = name
        super().__init__(data)
|
||||
|
||||
|
||||
def get_chunk_upload_settings(request, organization_slug):
    """Return (as JSON) the parameters sentry-cli reads before uploading chunks.

    Sentry / Sentry-CLI has a whole bunch of logic surrounding URLs, which I do not understand and which presumably
    doesn't make it past Bugsink's cost/benefit-analysis. For now, we just return our own URL, which seems to
    "just work". Starting points if we ever want to go down this path:

    * https://github.com/getsentry/sentry/pull/7095/files <= upload-url-prefix: introduced, rationale not explained
    * https://github.com/getsentry/sentry-cli/issues/839
    * https://github.com/getsentry/sentry/pull/29347
    """
    url = f"{get_settings().BASE_URL}/api/0/organizations/{organization_slug}/chunk-upload/"

    # Our "chunk_upload" is chunked in name only; i.e. we only "speak chunked" for API-compatibility with
    # sentry-cli; the params below are chosen such that that cli will only ever send a single chunk.
    payload = {
        "url": url,

        # For now, staying close to the default MAX_ENVELOPE_COMPRESSED_SIZE, which is 20MiB;
        # I _think_ I saw a note somewhere on (one of) these values having to be a power of 2; hence 32 here.
        #
        # When implementing uploading, it was done to support sourcemaps. It seems that over at Sentry, the
        # reason they went so complicated in the first place was to enable DIF support (hundreds of MiB regularly).
        "chunkSize": 32 * _MEBIBYTE,
        "maxRequestSize": 32 * _MEBIBYTE,

        # I didn't check the supposed relationship between maxRequestSize and maxFileSize, but assume something
        # similar to what happens w/ envelopes; hence harmonizing with MAX_ENVELOPE_SIZE (rounded up to a power
        # of 2) here.
        "maxFileSize": 128 * _MEBIBYTE,

        # force single-chunk by setting these to 1.
        "concurrency": 1,
        "chunksPerRequest": 1,

        "hashAlgorithm": "sha1",
        "compression": ["gzip"],

        "accept": [
            # I don't claim to fully understand how the sentry-cli switches based on these advertised
            # capabilities, but the list below works for now. For a full list of types we _could_ accept, see
            # src/sentry/api/endpoints/chunk.py

            # If the below is off, sentry-cli complains "A release slug is required". Because release-less
            # artifacts are actually the simpler thing, that's undesirable. Other consequences of turning it on
            # have not been charted yet.
            "release_files",

            # this would seem to be the "javascript sourcemaps" thing, but how exactly I did not check yet.
            "sources",

            # https://github.com/getsentry/sentry/discussions/46967
            # artifact_bundles is a concept originating from sentry that uses debug_ids to link maps & sources.
            # Despite it being relatively new, it's my _first_ target for getting sourcemaps to work, because
            # it's actually the most simple and reliable thing (uuid, bidirectional mapping)
            "artifact_bundles",

            # AFAIU the only thing _v2 would signify is the ability to "Implement de-duplication with chunking
            # in the assemble endpoint for artifact bundles (#51224)". Which is needlessly complex from my point
            # of view.
            # "artifact_bundles_v2",

            # the rest of the options are:
            # "debug_files", "pdbs", "bcsymbolmaps", "il2cpp", "portablepdbs", "proguard",
        ],
    }

    return JsonResponse(payload)
|
||||
|
||||
|
||||
@csrf_exempt
def chunk_upload(request, organization_slug):
    """sentry-cli chunk-upload endpoint: GET returns upload settings, POST stores chunks.

    TODO authenticate
    """
    # Bugsink has a single-organization model; we simply ignore organization_slug
    # NOTE: we don't check against chunkSize, maxRequestSize and chunksPerRequest (yet), we expect the CLI to behave.

    if request.method == "GET":
        # a GET at this endpoint returns a dict of settings that the CLI takes into account when uploading
        return get_chunk_upload_settings(request, organization_slug)

    # POST: upload (full-size) "chunks" and store them as Chunk objects; file.name should be the sha1 of the content.
    chunks = []
    if request.FILES:
        chunks = request.FILES.getlist("file")

    # NOTE: we read the whole unzipped file into memory; we _could_ take an approach like bugsink/streams.py.
    # (Note that, because of the auth layer in front, we're slightly less worried about adverserial scenarios)
    chunks += [
        NamedBytesIO(GzipFile(fileobj=file_gzip, mode="rb").read(), name=file_gzip.name)
        for file_gzip in request.FILES.getlist("file_gzip")]

    for chunk in chunks:
        data = chunk.getvalue()

        if sha1(data).hexdigest() != chunk.name:
            raise Exception("checksum mismatch")

        Chunk.objects.get_or_create(
            checksum=chunk.name,
            defaults={
                "size": len(data),
                "data": data,  # NOTE: further possible optimization: don't even read the file when already existing
            })

        # (removed: a leftover debug write of the chunk data to /tmp/chunk.zip, which was marked for removal)

    return HttpResponse()
|
||||
|
||||
|
||||
def assemble_artifact_bundle(bundle_checksum, chunk_checksums):
    """Assemble the uploaded chunks into an artifact-bundle zip and index its contents.

    NOTE: as it stands we don't store the (optional) extra info of release/dist.

    NOTE: there's also the concept of an artifact bundle as _tied_ to a release, i.e. without debug_ids. We don't
    support that, but if we ever were to support it we'd need a separate method/param to distinguish it.
    """
    bundle_file, _ = assemble_file(bundle_checksum, chunk_checksums)

    bundle_zip = ZipFile(BytesIO(bundle_file.data))  # NOTE: in-memory handling of zips.
    manifest = json.loads(bundle_zip.read("manifest.json").decode("utf-8"))

    for filename, manifest_entry in manifest["files"].items():
        file_data = bundle_zip.read(filename)

        file, _ = File.objects.get_or_create(
            checksum=sha1(file_data).hexdigest(),
            defaults={
                "size": len(file_data),
                "data": file_data,
            })

        debug_id = manifest_entry.get("headers", {}).get("debug-id", None)
        file_type = manifest_entry.get("type", None)

        if debug_id is None or file_type is None:
            # such records exist and we could store them, but we don't, since we don't have a purpose for them.
            continue

        FileMetadata.objects.get_or_create(
            debug_id=debug_id,
            file_type=file_type,
            defaults={
                "file": file,
                "data": json.dumps(manifest_entry),
            })

    # NOTE we _could_ get rid of the file at this point (but we don't). Ties in to broader questions of retention.
|
||||
|
||||
|
||||
def assemble_file(checksum, chunk_checksums):
    """Concatenate the named chunks (in order), verify the sha1, and store the result as a File.

    Returns the (File, created) pair from get_or_create.

    NOTE: unimplemented checks/tricks
    * total file-size v.s. some max
    * explicit check chunk availability (as it stands, our processing is synchronous, so no need)
    * skip-on-checksum-exists
    """
    by_checksum = {
        chunk.checksum: chunk
        for chunk in Chunk.objects.filter(checksum__in=chunk_checksums)
    }

    # the lookup below implicitly checks chunk availability (KeyError on a missing chunk)
    data = b"".join(by_checksum[chunk_checksum].data for chunk_checksum in chunk_checksums)

    if sha1(data).hexdigest() != checksum:
        raise Exception("checksum mismatch")

    return File.objects.get_or_create(
        checksum=checksum,
        defaults={
            "size": len(data),
            "data": data,
        })
|
||||
|
||||
|
||||
@csrf_exempt  # we're in API context here; this could potentially be pulled up to a higher level though
def artifact_bundle_assemble(request, organization_slug):
    """sentry-cli endpoint: assemble previously uploaded chunks into an artifact bundle.

    TODO authenticate
    Bugsink has a single-organization model; we simply ignore organization_slug.
    """
    # NOTE a JSON-schema for this endpoint is available under Apache 2 license (2 year anniversary rule) at
    # https://github.com/getsentry/sentry/blob/8df7543848b4/src/sentry/api/endpoints/organization_artifactbundle_assemble.py#L24
    # (not worth the trouble of extracting right now, since our /sentry dir contains BSD-3 licensed code (2019 version)

    body = json.loads(request.body)
    assemble_artifact_bundle(body["checksum"], body["chunks"])

    # NOTE sentry & glitchtip _always_ return an empty list for "missingChunks" in this view; I don't really understand
    # what's being achieved with that, but it seems to be the expected behavior. Working hypothesis: this was introduced
    # for DIF uploads, and the present endpoint doesn't use it at all. Not even for "v2", surprisingly.

    # NOTE: as it stands, we process the bundle inline, so arguably we could return "OK" here too; "CREATED" is what
    # sentry returns though, so for faithful mimicking it's the safest bet.
    return JsonResponse({"state": ChunkFileState.CREATED, "missingChunks": []})
|
||||
@@ -46,6 +46,7 @@ include = [
|
||||
"ee*",
|
||||
"ingest*",
|
||||
"issues*",
|
||||
"files*",
|
||||
"performance*",
|
||||
"phonehome*",
|
||||
"projects*",
|
||||
|
||||
16
sentry/assemble.py
Normal file
16
sentry/assemble.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# from src/sentry/tasks/assemble.py
|
||||
|
||||
|
||||
def enum(**named_values):
    """Build an ad-hoc enum-like type whose class attributes are the given named values."""
    return type("Enum", (), dict(named_values))


# lifecycle states reported back to sentry-cli for an uploaded/assembled file
ChunkFileState = enum(
    OK="ok",  # File in database
    NOT_FOUND="not_found",  # File not found in database
    CREATED="created",  # File was created in the request and sent to the worker for assembling
    ASSEMBLING="assembling",  # File still being processed by worker
    ERROR="error",  # Error happened during assembling
)
|
||||
|
||||
Reference in New Issue
Block a user