"""Minidump symbolication helpers (PoC, see #82).

Given an uploaded minidump plus previously uploaded debug information files
(DIFs, ``file_type="dbg"``) and source bundles (``file_type="src"``), recover
file names, function names, line numbers and source context for stacktraces.
"""
import io
import zipfile

import symbolic
from sentry_sdk_extensions import capture_or_log_exception

from bugsink.utils import assert_
from .models import FileMetadata


def get_single_object(archive):
    """Return the single object contained in `archive`.

    Our understanding: sentry-cli uploads single-object archives; we need to
    get the single object out of it... but this does raise the question of
    why archives exist at all... hence the assert.
    """
    objects = list(archive.iter_objects())
    assert_(len(objects) == 1)
    return objects[0]


def build_cfi_map_from_minidump_bytes(minidump_bytes):
    """Build a FrameInfoMap (CFI) for the modules referenced in the minidump.

    For every module with a debug_id for which we have an uploaded debug file,
    derive Call Frame Information so the stackwalker can unwind properly
    instead of falling back to low-quality stack scanning.
    """
    process_state = symbolic.minidump.ProcessState.from_minidump_buffer(minidump_bytes)

    frame_info_map = symbolic.minidump.FrameInfoMap.new()

    for module in process_state.modules():
        if not module.debug_id:
            continue

        dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)

        # a single .first() rather than .count() followed by .get(): one query
        # instead of two, and consistent with the lookup style used in
        # event_threads_for_process_state below.
        file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first()
        if file_metadata is None:
            continue

        archive = symbolic.debuginfo.Archive.from_bytes(file_metadata.file.data)
        debug_object = get_single_object(archive)

        cfi = symbolic.minidump.CfiCache.from_object(debug_object)
        frame_info_map.add(module.debug_id, cfi)

    return frame_info_map


def extract_dif_metadata(dif_bytes):
    """Parse a DIF and return its {"kind", "code_id", "debug_id"} metadata.

    Raises on unparsable input (after recording the exception). TODO stabilize
    what we do on errors later; previously the capture-and-return-{} path was
    unreachable dead code behind a bare `raise`.
    """
    try:
        archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
        debug_object = get_single_object(archive)
        return {
            "kind": debug_object.kind,  # "dbg", "lib", "src"
            "code_id": debug_object.code_id,
            "debug_id": debug_object.debug_id,
            # "file_format": debug_object.file_format,  # "elf", "macho", "pe", "sourcebundle"
        }
    except Exception as e:
        # record the failure, but still propagate: callers (difs_assemble)
        # currently rely on metadata["kind"] being present.
        capture_or_log_exception(e)
        raise


def extract_source_context(src_bytes, filename, center_line, context=5):
    """Return (pre_context, context_line, post_context) around the 1-based
    `center_line` of `filename`, looked up inside the sourcebundle zip
    `src_bytes`; empty results when the file cannot be found.
    """
    # TODO the usual worries about zip bombs/memory usage apply here.

    if center_line < 1:
        # guard: a zero/negative line number would make the slices below wrap
        # around and return nonsense (e.g. almost the whole file as pre_context)
        return [], None, []

    with zipfile.ZipFile(io.BytesIO(src_bytes)) as zf:
        # sourcebundle entries use relative paths like "src/main.c" or so says ChatGPT
        candidates = [n for n in zf.namelist() if n.endswith(filename)]

        if not candidates:
            return [], None, []

        with zf.open(candidates[0]) as f:
            lines = f.read().decode("utf-8").splitlines()

    # Clamp line range to valid indices
    start = max(center_line - context - 1, 0)
    end = min(center_line + context, len(lines))

    pre_context = lines[start:center_line - 1]
    context_line = lines[center_line - 1] if 0 <= center_line - 1 < len(lines) else None
    post_context = lines[center_line:end]

    return pre_context, context_line, post_context


def _find_module_for_address(process_state, abs_addr: int):
    """Return the loaded module whose [addr, addr+size) range contains
    `abs_addr`, or None if no module matches."""
    for m in process_state.modules():
        if m.addr and m.size and m.addr <= abs_addr < (m.addr + m.size):
            return m
    return None


def event_threads_for_process_state(process_state):
    """Build the event "threads" payload (symbolicated stacktraces) for a
    stackwalked minidump ProcessState."""
    # dashed debug_id -> symcache (or None when no debug file is available);
    # hoisted out of the per-frame loop because parsing the archive and
    # building a symcache is expensive and identical for every frame that
    # falls into the same module.
    symcache_by_debug_id = {}

    def _symcache_for(dashed_debug_id):
        if dashed_debug_id not in symcache_by_debug_id:
            file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first()
            if file_metadata is None:
                symcache_by_debug_id[dashed_debug_id] = None
            else:
                archive = symbolic.debuginfo.Archive.from_bytes(file_metadata.file.data)
                obj = get_single_object(archive)  # was: inline list() + assert
                symcache_by_debug_id[dashed_debug_id] = obj.make_symcache()
        return symcache_by_debug_id[dashed_debug_id]

    threads = []
    for thread_i, thread in enumerate(process_state.threads()):
        thread_frames = []

        for frame in thread.frames():
            module = _find_module_for_address(process_state, frame.instruction)
            function_name = filename = None
            lineno = 0
            # initialized here: the original left these unbound (NameError)
            # for any frame without debug info or without a source bundle.
            pre_ctx, ctx_line, post_ctx = [], None, []

            if module and module.debug_id:
                dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)
                symcache = _symcache_for(dashed_debug_id)

                if symcache is not None:
                    rel = frame.instruction - module.addr
                    infos = symcache.lookup(rel) or symcache.lookup(rel - 1)  # "or -1" from ChatGPT... should we do it?
                    if infos:
                        li = infos[0]
                        function_name = li.function_name
                        filename = li.filename
                        lineno = li.line

                    # if we have line info, try the source bundle
                    src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first()
                    if src_meta and filename and lineno:
                        pre_ctx, ctx_line, post_ctx = extract_source_context(src_meta.file.data, filename, lineno)

            thread_frames.append({
                "instruction_addr": f"0x{frame.instruction:x}",
                "function": function_name or "",
                "filename": filename,
                "lineno": lineno,
                "pre_context": pre_ctx,
                "context_line": ctx_line,
                "post_context": post_ctx,
            })

        threads.append({
            "id": thread.thread_id,
            # NOTE(review): requesting_thread is an *index* into threads()
            # (merge_minidump_event does threads[process_state.requesting_thread]),
            # so compare against the enumeration index, not thread_id as the
            # original did.
            "crashed": (thread_i == process_state.requesting_thread),
            "stacktrace": {"frames": thread_frames},
        })

    return threads
diff --git a/sentry/minidump.py b/sentry/minidump.py index 8a126c3..a798505 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -2,27 +2,28 @@ # https://github.com/getsentry/sentry/blob/f0ac91f2ec6b45ad18e5eea6df72c5c72573e964/src/sentry/models/minidump.py#L53 # with (as it stands) minor modifications. -import logging -from symbolic import ProcessState +import symbolic +from files.minidump import build_cfi_map_from_minidump_bytes, event_threads_for_process_state def merge_minidump_event(data, minidump_bytes): - state = ProcessState.from_minidump_buffer(minidump_bytes) + frame_info_map = build_cfi_map_from_minidump_bytes(minidump_bytes) + process_state = symbolic.ProcessState.from_minidump_buffer(minidump_bytes, frame_infos=frame_info_map) - data['level'] = 'fatal' if state.crashed else 'info' + data['level'] = 'fatal' if process_state.crashed else 'info' - exception_value = 'Assertion Error: %s' % state.assertion if state.assertion \ - else 'Fatal Error: %s' % state.crash_reason + exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ + else 'Fatal Error: %s' % process_state.crash_reason # NO_BANANA: data['message'] is not the right target # data['message'] = exception_value - if state.timestamp: - data['timestamp'] = float(state.timestamp) + if process_state.timestamp: + data['timestamp'] = float(process_state.timestamp) # Extract as much system information as we can. TODO: We should create # a custom context and implement a specific minidump view in the event # UI. 
- info = state.system_info + info = process_state.system_info context = data.setdefault('contexts', {}) os = context.setdefault('os', {}) device = context.setdefault('device', {}) @@ -30,46 +31,42 @@ def merge_minidump_event(data, minidump_bytes): os['version'] = info.os_version device['arch'] = info.cpu_family - # We can extract stack traces here already but since CFI is not - # available yet (without debug symbols), the stackwalker will - # resort to stack scanning which yields low-quality results. If - # the user provides us with debug symbols, we will reprocess this - # minidump and add improved stacktraces later. - threads = [{ - 'id': thread.thread_id, - 'crashed': False, - 'stacktrace': { - 'frames': [{ - 'instruction_addr': '0x%x' % frame.instruction, - 'function': '', # Required by interface - } for frame in thread.frames()], - }, - } for thread in state.threads()] - data.setdefault('threads', {})['values'] = threads + threads = event_threads_for_process_state(process_state) + data.setdefault("threads", {})["values"] = threads # Mark the crashed thread and add its stacktrace to the exception - crashed_thread = threads[state.requesting_thread] + crashed_thread = threads[process_state.requesting_thread] crashed_thread['crashed'] = True # Extract the crash reason and infos exception = { 'value': exception_value, 'thread_id': crashed_thread['id'], - 'type': state.crash_reason, + 'type': process_state.crash_reason, # Move stacktrace here from crashed_thread (mutating!) 'stacktrace': crashed_thread.pop('stacktrace'), } + for frame in exception['stacktrace']['frames']: + frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app + + exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest." + # TODO we don't have display-info for threads yet, I think? 
+ # we may need to revert the per-thread stacktraces above as well then + data.setdefault('exception', {}) \ .setdefault('values', []) \ .append(exception) # Extract referenced (not all loaded) images images = [{ - 'type': 'apple', # Required by interface - # 'uuid': module.uuid, NO_BANANA + 'type': 'elf', # TODO not sure what this should _actually_ be 'image_addr': module.addr, 'image_size': module.size, - # 'name': module.name, NO_BANANA - } for module in state.modules()] + 'code_file': module.code_file, + 'code_id': module.code_id, + 'debug_file': module.debug_file, + 'debug_id': symbolic.debuginfo.id_from_breakpad(module.debug_id) if module.debug_id else None, + } for module in process_state.modules()] + data.setdefault('debug_meta', {})['images'] = images