# mirror of https://github.com/jlengrand/bugsink.git
# synced 2026-03-10 08:01:17 +00:00
# 264 lines, 11 KiB, Python
import json
from functools import wraps
from gzip import GzipFile
from hashlib import sha1
from io import BytesIO
from os.path import basename
from zipfile import ZipFile

from django.contrib.auth.decorators import user_passes_test
from django.http import Http404, HttpResponse, JsonResponse
from django.views.decorators.csrf import csrf_exempt

from sentry.assemble import ChunkFileState

from bugsink.app_settings import get_settings
from bsmain.models import AuthToken

from .models import Chunk, File, FileMetadata
|
|
|
|
|
|
# Binary (IEC) size units; the upload limits advertised to sentry-cli are expressed as multiples of these.
_KIBIBYTE = 2 ** 10
_MEBIBYTE = _KIBIBYTE * 1024
|
|
|
|
|
|
class NamedBytesIO(BytesIO):
    """An in-memory bytes buffer that additionally carries a ``name`` attribute.

    ``data`` becomes the initial content of the buffer; ``name`` is stored as-is on the instance.
    """

    def __init__(self, data, name):
        BytesIO.__init__(self, data)
        self.name = name
|
|
|
|
|
|
def get_chunk_upload_settings(request, organization_slug):
    """Return, as a JsonResponse, the upload settings that the CLI takes into account when uploading.

    ``request`` is not used; ``organization_slug`` only ends up in the advertised upload URL.
    """
    # Sentry / sentry-cli has a whole bunch of logic surrounding URLs, which I do not understand and which presumably
    # doesn't make it past Bugsink's cost/benefit analysis. For now we just return our own URL, which seems to
    # "just work". Starting points, should we ever want to go down this path:
    #
    # * https://github.com/getsentry/sentry/pull/7095/files  <= upload-url-prefix: introduced, rationale not explained
    # * https://github.com/getsentry/sentry-cli/issues/839   <= the whole "relative" idea
    # * https://github.com/getsentry/sentry/pull/29347       <= idem
    base_url = get_settings().BASE_URL
    upload_url = base_url + "/api/0/organizations/" + organization_slug + "/chunk-upload/"

    # I don't claim to fully understand how sentry-cli switches based on these advertised capabilities, but the list
    # below works for now. Any understanding that I did gain is documented. For a full list of types we _could_
    # accept, see src/sentry/api/endpoints/chunk.py (in Sentry).
    accepted_capabilities = [
        # Without this one, sentry-cli complains "A release slug is required". Because release-less artifacts are
        # actually the simpler thing, that's undesirable. Other consequences of turning it on have not been charted
        # yet.
        "release_files",

        # This would seem to be the "javascript sourcemaps" thing, but how exactly I did not check yet.
        "sources",

        # https://github.com/getsentry/sentry/discussions/46967
        # artifact_bundles is a concept originating from Sentry that uses debug_ids to link maps & sources. Despite
        # it being relatively new, it's my _first_ target for getting sourcemaps to work, because it's actually the
        # most simple and reliable thing (uuid, bidirectional mapping).
        "artifact_bundles",

        # AFAIU the only thing _v2 would signify is the ability to "Implement de-duplication with chunking in the
        # assemble endpoint for artifact bundles (#51224)". Which is needlessly complex from my point of view.
        # "artifact_bundles_v2",

        # The full set of options, for reference:
        # "debug_files",
        # "release_files",
        # "pdbs",
        # "bcsymbolmaps",
        # "il2cpp",
        # "portablepdbs",
        # "artifact_bundles",
        # "proguard",
    ]

    # Our "chunk_upload" is chunked in name only; i.e. we only "speak chunked" for the purpose of API-compatibility
    # with sentry-cli, but we provide params here such that the CLI will only send a single chunk.
    upload_settings = {
        "url": upload_url,

        # For now, staying close to the default MAX_ENVELOPE_COMPRESSED_SIZE, which is 20MiB; I _think_ I saw a note
        # somewhere on (one of) these values having to be a power of 2; hence 32 here.
        #
        # Uploading was implemented to support sourcemaps. It seems that over at Sentry, the reason they went so
        # complicated in the first place was to enable DIF support (hundreds of MiB regularly).
        "chunkSize": 32 * _MEBIBYTE,
        "maxRequestSize": 32 * _MEBIBYTE,

        # I didn't check the supposed relationship between maxRequestSize and maxFileSize, but assume something
        # similar to what happens w/ envelopes; hence harmonizing with MAX_ENVELOPE_SIZE (rounded up to a power of 2).
        "maxFileSize": 128 * _MEBIBYTE,

        # Force single-chunk by setting these to 1.
        "concurrency": 1,
        "chunksPerRequest": 1,

        "hashAlgorithm": "sha1",
        "compression": ["gzip"],

        "accept": accepted_capabilities,
    }

    return JsonResponse(upload_settings)
|
|
|
|
|
|
def requires_auth_token(view_function):
    """Decorate a view such that it only runs for requests that carry a known auth token.

    The ``Authorization`` header must consist of exactly two whitespace-separated parts, the second of which must
    match the ``token`` of an existing ``AuthToken``. If so, the wrapped view is called with the original arguments
    and its result is returned. Otherwise a JSON ``{"error": "..."}`` response with status 401 is returned and the
    view is not called.
    """
    # {"error": "..."} (status=401) response is API-compatible; for that to work we need the present function to be a
    # decorator (so we can return, rather than raise, which plain-Django doesn't support for 401)

    @wraps(view_function)  # carries over __name__, __doc__, __module__, __qualname__ and __dict__ of the view
    def first_require_auth_token(request, *args, **kwargs):
        header_value = request.META.get("HTTP_AUTHORIZATION")
        if not header_value:
            return JsonResponse({"error": "Authorization header not found"}, status=401)

        header_values = header_value.split()

        if len(header_values) != 2:
            return JsonResponse(
                {"error": "Expecting 'Authorization: Token abc123...' but got '%s'" % header_value}, status=401)

        # Only the token itself is checked; the first part (the auth scheme word) is not inspected.
        _the_word_bearer, token = header_values

        if not AuthToken.objects.filter(token=token).exists():
            return JsonResponse({"error": "Invalid token"}, status=401)

        return view_function(request, *args, **kwargs)

    return first_require_auth_token
|
|
|
|
|
|
@csrf_exempt
@requires_auth_token
def chunk_upload(request, organization_slug):
    """Serve the upload settings (GET) or store uploaded chunks as ``Chunk`` objects (any other method).

    Uploads are taken from the multipart fields ``file`` (stored as-is) and ``file_gzip`` (gunzipped first). The
    name of each uploaded file is expected to be the sha1 hex digest of its (uncompressed) content.

    Returns the settings JsonResponse for GET, and an empty HttpResponse after storing otherwise.
    Raises Exception("checksum mismatch") when a chunk's content does not hash to its name.
    """
    # Bugsink has a single-organization model; we simply ignore organization_slug
    # NOTE: we don't check against chunkSize, maxRequestSize and chunksPerRequest (yet), we expect the CLI to behave.

    if request.method == "GET":
        # a GET at this endpoint returns a dict of settings that the CLI takes into account when uploading
        return get_chunk_upload_settings(request, organization_slug)

    # POST: upload (full-size) "chunks" and store them as Chunk objects; file.name should be the sha1 of the content.
    chunks = []
    if request.FILES:
        chunks = list(request.FILES.getlist("file"))

        # NOTE: we read the whole unzipped file into memory; we _could_ take an approach like bugsink/streams.py.
        # (Note that, because of the auth layer in front, we're slightly less worried about adversarial scenarios)
        for file_gzip in request.FILES.getlist("file_gzip"):
            with GzipFile(fileobj=file_gzip, mode="rb") as gunzipped:
                chunks.append(NamedBytesIO(gunzipped.read(), name=file_gzip.name))

    for chunk in chunks:
        # read() rather than getvalue(): entries from the "file" field are Django uploaded-file objects, which offer
        # read() but not BytesIO's getvalue(); read() works for those and for our NamedBytesIO alike.
        data = chunk.read()

        if sha1(data).hexdigest() != chunk.name:
            raise Exception("checksum mismatch")

        Chunk.objects.get_or_create(
            checksum=chunk.name,
            defaults={
                "size": len(data),
                "data": data,  # NOTE: further possible optimization: don't even read the file when already existing
            })

    return HttpResponse()
|
|
|
|
|
|
def assemble_artifact_bundle(bundle_checksum, chunk_checksums):
    """Assemble an artifact bundle (a zip) from previously uploaded chunks and store what it contains.

    The bundle itself is assembled and stored through ``assemble_file``. Each file listed under ``"files"`` in the
    bundle's ``manifest.json`` is then stored as a ``File`` (keyed on the sha1 of its content), and for entries that
    have both a ``debug-id`` header and a ``type``, a ``FileMetadata`` record (keyed on that pair) is stored too.
    Records that already exist are left untouched.

    Returns nothing; exceptions from ``assemble_file``, the zip handling and the manifest parsing propagate.
    """
    # NOTE: as it stands we don't store the (optional) extra info of release/dist.

    # NOTE: there's also the concept of an artifact bundle as _tied_ to a release, i.e. without debug_ids. We don't
    # support that, but if we ever were to support it we'd need a separate method/param to distinguish it.

    bundle_file, _ = assemble_file(bundle_checksum, chunk_checksums, filename=f"{bundle_checksum}.zip")

    # NOTE: in-memory handling of zips. The context manager makes sure the ZipFile is closed when we're done.
    with ZipFile(BytesIO(bundle_file.data)) as bundle_zip:
        manifest = json.loads(bundle_zip.read("manifest.json").decode("utf-8"))

        for archive_name, manifest_entry in manifest["files"].items():
            file_data = bundle_zip.read(archive_name)
            checksum = sha1(file_data).hexdigest()

            # the stored filename is the last path component of the entry's "url" (falling back to the name in the
            # archive), cut off at 255 characters
            filename = basename(manifest_entry.get("url", archive_name))[:255]

            file, _ = File.objects.get_or_create(
                checksum=checksum,
                defaults={
                    "filename": filename,
                    "size": len(file_data),
                    "data": file_data,
                })

            debug_id = manifest_entry.get("headers", {}).get("debug-id", None)
            file_type = manifest_entry.get("type", None)
            if debug_id is None or file_type is None:
                # such records exist and we could store them, but we don't, since we don't have a purpose for them.
                continue

            FileMetadata.objects.get_or_create(
                debug_id=debug_id,
                file_type=file_type,
                defaults={
                    "file": file,
                    "data": json.dumps(manifest_entry),
                })

    # NOTE we _could_ get rid of the file at this point (but we don't). Ties in to broader questions of retention.
|
|
|
|
|
|
def assemble_file(checksum, chunk_checksums, filename):
    """Assembles a file from chunks.

    The data of the chunks named in ``chunk_checksums`` is concatenated in that order; the sha1 of the result must
    equal ``checksum``. Returns the ``(file, created)`` pair of ``File.objects.get_or_create`` for that checksum.

    Raises KeyError when one of the chunks does not exist, and Exception("checksum mismatch") on a sha1 mismatch.
    """
    # NOTE: unimplemented checks/tricks
    # * total file-size v.s. some max
    # * explicit check chunk availability (as it stands, our processing is synchronous, so no need)
    # * skip-on-checksum-exists

    chunk_by_checksum = {}
    for stored_chunk in Chunk.objects.filter(checksum__in=chunk_checksums):
        chunk_by_checksum[stored_chunk.checksum] = stored_chunk

    # the lookups below implicitly check chunk availability (KeyError when one is missing)
    ordered_chunks = [chunk_by_checksum[chunk_checksum] for chunk_checksum in chunk_checksums]
    data = b"".join([ordered_chunk.data for ordered_chunk in ordered_chunks])

    actual_checksum = sha1(data).hexdigest()
    if actual_checksum != checksum:
        raise Exception("checksum mismatch")

    file_defaults = {
        "size": len(data),
        "data": data,
        "filename": filename,
    }
    return File.objects.get_or_create(checksum=checksum, defaults=file_defaults)
|
|
|
|
|
|
@csrf_exempt  # we're in API context here; this could potentially be pulled up to a higher level though
@requires_auth_token
def artifact_bundle_assemble(request, organization_slug):
    """Assemble the artifact bundle described by the JSON request body (keys: "checksum" and "chunks").

    The bundle is processed inline through ``assemble_artifact_bundle``; the JSON response always has the state
    ``ChunkFileState.CREATED`` and an empty "missingChunks" list.
    """
    # Bugsink has a single-organization model; we simply ignore organization_slug

    # NOTE a JSON-schema for this endpoint is available under Apache 2 license (2 year anniversary rule) at
    # https://github.com/getsentry/sentry/blob/8df7543848b4/src/sentry/api/endpoints/organization_artifactbundle_assemble.py#L24
    # (not worth the trouble of extracting right now, since our /sentry dir contains BSD-3 licensed code (2019 version)

    payload = json.loads(request.body)
    bundle_checksum = payload["checksum"]
    chunk_checksums = payload["chunks"]
    assemble_artifact_bundle(bundle_checksum, chunk_checksums)

    # NOTE sentry & glitchtip _always_ return an empty list for "missingChunks" in this view; I don't really
    # understand what's being achieved with that, but it seems to be the expected behavior. Working hypothesis: this
    # was introduced for DIF uploads, and the present endpoint doesn't use it at all. Not even for "v2", surprisingly.

    # NOTE: as it stands, we process the bundle inline, so arguably we could return "OK" here too; "CREATED" is what
    # sentry returns though, so for faithful mimicking it's the safest bet.
    response_body = {"state": ChunkFileState.CREATED, "missingChunks": []}
    return JsonResponse(response_body)
|
|
|
|
|
|
@user_passes_test(lambda u: u.is_superuser)
def download_file(request, checksum):
    """Serve the stored ``File`` with the given checksum as a download (attachment); superusers only.

    Returns an ``application/octet-stream`` response carrying the file's data, with the stored filename in the
    ``Content-Disposition`` header. Raises Http404 when no file with that checksum exists.
    """
    try:
        file = File.objects.get(checksum=checksum)
    except File.DoesNotExist:
        # an unknown checksum is the client's mistake: answer 404 rather than letting it surface as a server error
        raise Http404("File not found") from None

    response = HttpResponse(file.data, content_type="application/octet-stream")

    # Filenames come from uploaded manifests and may contain spaces, semicolons or quotes; send the name as a
    # quoted-string (with backslashes and double quotes escaped) such that the header stays well-formed.
    quoted_filename = str(file.filename).replace("\\", "\\\\").replace('"', '\\"')
    response["Content-Disposition"] = f'attachment; filename="{quoted_filename}"'
    return response
|