PoC: Minidumps w/ symbolification

Plenty of TODOs left; but this proves we can find: * file names * function names * line nos * source context See #82
2026-03-10 08:01:17 +00:00 · 2025-11-12 14:57:02 +01:00
parent ab065a6329
commit b60980c8f3
3 changed files with 177 additions and 33 deletions
--- a/files/minidump.py
+++ b/files/minidump.py
@@ -0,0 +1,143 @@
+import io
+import zipfile
+import symbolic
+from sentry_sdk_extensions import capture_or_log_exception
+
+from bugsink.utils import assert_
+from .models import FileMetadata
+
+
+def get_single_object(archive):
+    # our understanding: sentry-cli uploads single-object archives; we need to get the single object out of it...
+    # ...but this does raise the question of why archives exist at all... hence the assert
+    objects = list(archive.iter_objects())
+    assert_(len(objects) == 1)
+    return objects[0]
+
+
+def build_cfi_map_from_minidump_bytes(minidump_bytes):
+    process_state = symbolic.minidump.ProcessState.from_minidump_buffer(minidump_bytes)
+
+    frame_info_map = symbolic.minidump.FrameInfoMap.new()
+
+    for module in process_state.modules():
+        if not module.debug_id:
+            continue
+
+        dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)
+        if FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").count() == 0:
+            continue
+
+        dif_bytes = FileMetadata.objects.get(debug_id=dashed_debug_id, file_type="dbg").file.data
+        archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
+
+        debug_object = get_single_object(archive)
+
+        cfi = symbolic.minidump.CfiCache.from_object(debug_object)
+        frame_info_map.add(module.debug_id, cfi)
+
+    return frame_info_map
+
+
+def extract_dif_metadata(dif_bytes):
+    try:
+        archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
+        debug_object = get_single_object(archive)
+        return {
+            "kind": debug_object.kind,  # "dbg", "lib", "src"
+            "code_id": debug_object.code_id,
+            "debug_id": debug_object.debug_id,
+            # "file_format": debug_object.file_format,  # "elf", "macho", "pe", "sourcebundle"
+        }
+    except Exception as e:
+        raise  # TODO stabalize what we do later
+        capture_or_log_exception(e)
+        return {}
+
+
+def extract_source_context(src_bytes, filename, center_line, context=5):
+
+    # TODO the usual worries about zip bombs/memory usage apply here.
+    with zipfile.ZipFile(io.BytesIO(src_bytes)) as zf:
+        # sourcebundle entries use relative paths like "src/main.c" or so says ChatGPT
+        candidates = [n for n in zf.namelist() if n.endswith(filename)]
+
+        if not candidates:
+            return [], None, []
+
+        with zf.open(candidates[0]) as f:
+            lines = f.read().decode("utf-8").splitlines()
+
+        # Clamp line range to valid indices
+        start = max(center_line - context - 1, 0)
+        end = min(center_line + context, len(lines))
+
+        pre_context = lines[start:center_line - 1]
+        context_line = lines[center_line - 1] if 0 <= center_line - 1 < len(lines) else None
+        post_context = lines[center_line:end]
+
+        return pre_context, context_line, post_context
+
+
+def _find_module_for_address(process_state, abs_addr: int):
+    for m in process_state.modules():
+        if m.addr and m.size and m.addr <= abs_addr < (m.addr + m.size):
+            return m
+    return None
+
+
+def event_threads_for_process_state(process_state):
+    threads = []
+    for thread in process_state.threads():
+        thread_frames = []
+
+        for frame in thread.frames():
+            module = _find_module_for_address(process_state, frame.instruction)
+            fn = file = None
+            line = 0
+
+            if module and module.debug_id:
+                dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)
+
+                file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first()
+                if file_metadata:
+                    dif_bytes = file_metadata.file.data
+
+                    archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
+                    objects = list(archive.iter_objects())
+                    assert len(objects) == 1
+                    obj = objects[0]
+
+                    symcache = obj.make_symcache()
+
+                    rel = frame.instruction - module.addr
+                    infos = symcache.lookup(rel) or symcache.lookup(rel - 1)  # "or -1" from ChatGPT... should we do it?
+                    if infos:
+                        li = infos[0]
+                        fn = li.function_name
+                        file = li.filename
+                        line = li.line
+
+                        # if we have line info, try source bundle
+                        src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first()
+                        if src_meta and file and line:
+                            src_bytes = src_meta.file.data
+                            pre_ctx, ctx_line, post_ctx = extract_source_context(src_bytes, file, line)
+
+            thread_frames.append({
+                "instruction_addr": f"0x{frame.instruction:x}",
+                "function": fn or "<unknown>",
+                "filename": file,
+                "lineno": line,
+                "pre_context": pre_ctx,
+                "context_line": ctx_line,
+                "post_context": post_ctx,
+            })
+
+        threads.append({
+            "id": thread.thread_id,
+            "crashed": (thread.thread_id == process_state.requesting_thread),
+            "stacktrace": {"frames": thread_frames},
+        })
+
+    return threads
--- a/files/views.py
+++ b/files/views.py
@@ -18,6 +18,7 @@ from bsmain.models import AuthToken

 from .models import Chunk, File, FileMetadata
 from .tasks import assemble_artifact_bundle, assemble_file
+from .minidump import extract_dif_metadata

 logger = logging.getLogger("bugsink.api")

@@ -256,9 +257,12 @@ def difs_assemble(request, organization_slug, project_slug):
            continue

        file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"])
+
+        symbolic_metadata = extract_dif_metadata(file.data)
+
        FileMetadata.objects.get_or_create(
-            debug_id=file_info.get("debug_id"),
-            file_type="dif",  # I think? check!
+            debug_id=file_info.get("debug_id"),  # TODO : .get implies "no debug_id", but in that case it's useless
+            file_type=symbolic_metadata["kind"],  # NOTE: symbolic's kind goes into file_type...
            defaults={
                "file": file,
                "data": "{}",  # this is the "catch all" field but I don't think we have anything in this case.
--- a/sentry/minidump.py
+++ b/sentry/minidump.py
@@ -2,27 +2,28 @@
 # https://github.com/getsentry/sentry/blob/f0ac91f2ec6b45ad18e5eea6df72c5c72573e964/src/sentry/models/minidump.py#L53
 # with (as it stands) minor modifications.

-import logging
-from symbolic import ProcessState
+import symbolic
+from files.minidump import build_cfi_map_from_minidump_bytes, event_threads_for_process_state


 def merge_minidump_event(data, minidump_bytes):
-    state = ProcessState.from_minidump_buffer(minidump_bytes)
+    frame_info_map = build_cfi_map_from_minidump_bytes(minidump_bytes)
+    process_state = symbolic.ProcessState.from_minidump_buffer(minidump_bytes, frame_infos=frame_info_map)

-    data['level'] = 'fatal' if state.crashed else 'info'
+    data['level'] = 'fatal' if process_state.crashed else 'info'

-    exception_value =  'Assertion Error: %s' % state.assertion if state.assertion \
-        else 'Fatal Error: %s' % state.crash_reason
+    exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \
+        else 'Fatal Error: %s' % process_state.crash_reason
    # NO_BANANA: data['message'] is not the right target
    # data['message'] = exception_value

-    if state.timestamp:
-        data['timestamp'] = float(state.timestamp)
+    if process_state.timestamp:
+        data['timestamp'] = float(process_state.timestamp)

    # Extract as much system information as we can. TODO: We should create
    # a custom context and implement a specific minidump view in the event
    # UI.
-    info = state.system_info
+    info = process_state.system_info
    context = data.setdefault('contexts', {})
    os = context.setdefault('os', {})
    device = context.setdefault('device', {})
@@ -30,46 +31,42 @@ def merge_minidump_event(data, minidump_bytes):
    os['version'] = info.os_version
    device['arch'] = info.cpu_family

-    # We can extract stack traces here already but since CFI is not
-    # available yet (without debug symbols), the stackwalker will
-    # resort to stack scanning which yields low-quality results. If
-    # the user provides us with debug symbols, we will reprocess this
-    # minidump and add improved stacktraces later.
-    threads = [{
-        'id': thread.thread_id,
-        'crashed': False,
-        'stacktrace': {
-            'frames': [{
-                'instruction_addr': '0x%x' % frame.instruction,
-                'function': '<unknown>',  # Required by interface
-            } for frame in thread.frames()],
-        },
-    } for thread in state.threads()]
-    data.setdefault('threads', {})['values'] = threads
+    threads = event_threads_for_process_state(process_state)
+    data.setdefault("threads", {})["values"] = threads

    # Mark the crashed thread and add its stacktrace to the exception
-    crashed_thread = threads[state.requesting_thread]
+    crashed_thread = threads[process_state.requesting_thread]
    crashed_thread['crashed'] = True

    # Extract the crash reason and infos
    exception = {
        'value': exception_value,
        'thread_id': crashed_thread['id'],
-        'type': state.crash_reason,
+        'type': process_state.crash_reason,
        # Move stacktrace here from crashed_thread (mutating!)
        'stacktrace': crashed_thread.pop('stacktrace'),
    }

+    for frame in exception['stacktrace']['frames']:
+        frame['in_app'] = True  # minidumps don't distinguish in_app frames; assume all are in_app
+
+    exception['stacktrace']['frames'].reverse()  # "Frames should be sorted from oldest to newest."
+    # TODO we don't have display-info for threads yet, I think?
+    # we may need to revert the per-thread stacktraces above as well then
+
    data.setdefault('exception', {}) \
        .setdefault('values', []) \
        .append(exception)

    # Extract referenced (not all loaded) images
    images = [{
-        'type': 'apple',  # Required by interface
-        # 'uuid': module.uuid, NO_BANANA
+        'type': 'elf',                    # TODO not sure what this should _actually_ be
        'image_addr': module.addr,
        'image_size': module.size,
-        # 'name': module.name, NO_BANANA
-    } for module in state.modules()]
+        'code_file': module.code_file,
+        'code_id': module.code_id,
+        'debug_file': module.debug_file,
+        'debug_id': symbolic.debuginfo.id_from_breakpad(module.debug_id) if module.debug_id else None,
+    } for module in process_state.modules()]
+
    data.setdefault('debug_meta', {})['images'] = images