PoC: Minidumps w/ symbolification

Plenty of TODOs left; but this proves we can find: * file names * function names * line nos * source context See #82
2026-03-10 08:01:17 +00:00 · 2025-11-12 14:57:02 +01:00
parent ab065a6329
commit b60980c8f3
3 changed files with 177 additions and 33 deletions
--- a/files/minidump.py
+++ b/files/minidump.py
@@ -0,0 +1,143 @@
+import io
+import zipfile
+import symbolic
+from sentry_sdk_extensions import capture_or_log_exception
+
+from bugsink.utils import assert_
+from .models import FileMetadata
+
+
+def get_single_object(archive):
+    # our understanding: sentry-cli uploads single-object archives; we need to get the single object out of it...
+    # ...but this does raise the question of why archives exist at all... hence the assert
+    objects = list(archive.iter_objects())
+    assert_(len(objects) == 1)
+    return objects[0]
+
+
+def build_cfi_map_from_minidump_bytes(minidump_bytes):
+    process_state = symbolic.minidump.ProcessState.from_minidump_buffer(minidump_bytes)
+
+    frame_info_map = symbolic.minidump.FrameInfoMap.new()
+
+    for module in process_state.modules():
+        if not module.debug_id:
+            continue
+
+        dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)
+        if FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").count() == 0:
+            continue
+
+        dif_bytes = FileMetadata.objects.get(debug_id=dashed_debug_id, file_type="dbg").file.data
+        archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
+
+        debug_object = get_single_object(archive)
+
+        cfi = symbolic.minidump.CfiCache.from_object(debug_object)
+        frame_info_map.add(module.debug_id, cfi)
+
+    return frame_info_map
+
+
+def extract_dif_metadata(dif_bytes):
+    try:
+        archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
+        debug_object = get_single_object(archive)
+        return {
+            "kind": debug_object.kind,  # "dbg", "lib", "src"
+            "code_id": debug_object.code_id,
+            "debug_id": debug_object.debug_id,
+            # "file_format": debug_object.file_format,  # "elf", "macho", "pe", "sourcebundle"
+        }
+    except Exception as e:
+        raise  # TODO stabalize what we do later
+        capture_or_log_exception(e)
+        return {}
+
+
+def extract_source_context(src_bytes, filename, center_line, context=5):
+
+    # TODO the usual worries about zip bombs/memory usage apply here.
+    with zipfile.ZipFile(io.BytesIO(src_bytes)) as zf:
+        # sourcebundle entries use relative paths like "src/main.c" or so says ChatGPT
+        candidates = [n for n in zf.namelist() if n.endswith(filename)]
+
+        if not candidates:
+            return [], None, []
+
+        with zf.open(candidates[0]) as f:
+            lines = f.read().decode("utf-8").splitlines()
+
+        # Clamp line range to valid indices
+        start = max(center_line - context - 1, 0)
+        end = min(center_line + context, len(lines))
+
+        pre_context = lines[start:center_line - 1]
+        context_line = lines[center_line - 1] if 0 <= center_line - 1 < len(lines) else None
+        post_context = lines[center_line:end]
+
+        return pre_context, context_line, post_context
+
+
+def _find_module_for_address(process_state, abs_addr: int):
+    for m in process_state.modules():
+        if m.addr and m.size and m.addr <= abs_addr < (m.addr + m.size):
+            return m
+    return None
+
+
+def event_threads_for_process_state(process_state):
+    threads = []
+    for thread in process_state.threads():
+        thread_frames = []
+
+        for frame in thread.frames():
+            module = _find_module_for_address(process_state, frame.instruction)
+            fn = file = None
+            line = 0
+
+            if module and module.debug_id:
+                dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)
+
+                file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first()
+                if file_metadata:
+                    dif_bytes = file_metadata.file.data
+
+                    archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
+                    objects = list(archive.iter_objects())
+                    assert len(objects) == 1
+                    obj = objects[0]
+
+                    symcache = obj.make_symcache()
+
+                    rel = frame.instruction - module.addr
+                    infos = symcache.lookup(rel) or symcache.lookup(rel - 1)  # "or -1" from ChatGPT... should we do it?
+                    if infos:
+                        li = infos[0]
+                        fn = li.function_name
+                        file = li.filename
+                        line = li.line
+
+                        # if we have line info, try source bundle
+                        src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first()
+                        if src_meta and file and line:
+                            src_bytes = src_meta.file.data
+                            pre_ctx, ctx_line, post_ctx = extract_source_context(src_bytes, file, line)
+
+            thread_frames.append({
+                "instruction_addr": f"0x{frame.instruction:x}",
+                "function": fn or "<unknown>",
+                "filename": file,
+                "lineno": line,
+                "pre_context": pre_ctx,
+                "context_line": ctx_line,
+                "post_context": post_ctx,
+            })
+
+        threads.append({
+            "id": thread.thread_id,
+            "crashed": (thread.thread_id == process_state.requesting_thread),
+            "stacktrace": {"frames": thread_frames},
+        })
+
+    return threads
--- a/files/views.py
+++ b/files/views.py
@@ -18,6 +18,7 @@ from bsmain.models import AuthToken

 from .models import Chunk, File, FileMetadata
 from .tasks import assemble_artifact_bundle, assemble_file
+from .minidump import extract_dif_metadata

 logger = logging.getLogger("bugsink.api")

@@ -256,9 +257,12 @@ def difs_assemble(request, organization_slug, project_slug):
            continue

        file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"])
+
+        symbolic_metadata = extract_dif_metadata(file.data)
+
        FileMetadata.objects.get_or_create(
-            debug_id=file_info.get("debug_id"),
-            file_type="dif",  # I think? check!
+            debug_id=file_info.get("debug_id"),  # TODO : .get implies "no debug_id", but in that case it's useless
+            file_type=symbolic_metadata["kind"],  # NOTE: symbolic's kind goes into file_type...
            defaults={
                "file": file,
                "data": "{}",  # this is the "catch all" field but I don't think we have anything in this case.