"""Minidump symbolication helpers (PoC, see #82).

Given an uploaded minidump plus previously uploaded debug information files
(DIFs, ``file_type="dbg"``) and source bundles (``file_type="src"``), recover
file names, function names, line numbers and source context for stacktraces.
"""
import io
import zipfile

import symbolic
from sentry_sdk_extensions import capture_or_log_exception

from bugsink.utils import assert_
from .models import FileMetadata


def get_single_object(archive):
    """Return the single object contained in `archive`.

    Our understanding: sentry-cli uploads single-object archives; we need to
    get the single object out of it... but this does raise the question of
    why archives exist at all... hence the assert.
    """
    objects = list(archive.iter_objects())
    assert_(len(objects) == 1)
    return objects[0]


def build_cfi_map_from_minidump_bytes(minidump_bytes):
    """Build a FrameInfoMap (CFI) for the modules referenced in the minidump.

    For every module with a debug_id for which we have an uploaded debug file,
    derive Call Frame Information so the stackwalker can unwind properly
    instead of falling back to low-quality stack scanning.
    """
    process_state = symbolic.minidump.ProcessState.from_minidump_buffer(minidump_bytes)

    frame_info_map = symbolic.minidump.FrameInfoMap.new()

    for module in process_state.modules():
        if not module.debug_id:
            continue

        dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)

        # a single .first() rather than .count() followed by .get(): one query
        # instead of two, and consistent with the lookup style used in
        # event_threads_for_process_state below.
        file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first()
        if file_metadata is None:
            continue

        archive = symbolic.debuginfo.Archive.from_bytes(file_metadata.file.data)
        debug_object = get_single_object(archive)

        cfi = symbolic.minidump.CfiCache.from_object(debug_object)
        frame_info_map.add(module.debug_id, cfi)

    return frame_info_map


def extract_dif_metadata(dif_bytes):
    """Parse a DIF and return its {"kind", "code_id", "debug_id"} metadata.

    Raises on unparsable input (after recording the exception). TODO stabilize
    what we do on errors later; previously the capture-and-return-{} path was
    unreachable dead code behind a bare `raise`.
    """
    try:
        archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
        debug_object = get_single_object(archive)
        return {
            "kind": debug_object.kind,  # "dbg", "lib", "src"
            "code_id": debug_object.code_id,
            "debug_id": debug_object.debug_id,
            # "file_format": debug_object.file_format,  # "elf", "macho", "pe", "sourcebundle"
        }
    except Exception as e:
        # record the failure, but still propagate: callers (difs_assemble)
        # currently rely on metadata["kind"] being present.
        capture_or_log_exception(e)
        raise


def extract_source_context(src_bytes, filename, center_line, context=5):
    """Return (pre_context, context_line, post_context) around the 1-based
    `center_line` of `filename`, looked up inside the sourcebundle zip
    `src_bytes`; empty results when the file cannot be found.
    """
    # TODO the usual worries about zip bombs/memory usage apply here.

    if center_line < 1:
        # guard: a zero/negative line number would make the slices below wrap
        # around and return nonsense (e.g. almost the whole file as pre_context)
        return [], None, []

    with zipfile.ZipFile(io.BytesIO(src_bytes)) as zf:
        # sourcebundle entries use relative paths like "src/main.c" or so says ChatGPT
        candidates = [n for n in zf.namelist() if n.endswith(filename)]

        if not candidates:
            return [], None, []

        with zf.open(candidates[0]) as f:
            lines = f.read().decode("utf-8").splitlines()

    # Clamp line range to valid indices
    start = max(center_line - context - 1, 0)
    end = min(center_line + context, len(lines))

    pre_context = lines[start:center_line - 1]
    context_line = lines[center_line - 1] if 0 <= center_line - 1 < len(lines) else None
    post_context = lines[center_line:end]

    return pre_context, context_line, post_context


def _find_module_for_address(process_state, abs_addr: int):
    """Return the loaded module whose [addr, addr+size) range contains
    `abs_addr`, or None if no module matches."""
    for m in process_state.modules():
        if m.addr and m.size and m.addr <= abs_addr < (m.addr + m.size):
            return m
    return None


def event_threads_for_process_state(process_state):
    """Build the event "threads" payload (symbolicated stacktraces) for a
    stackwalked minidump ProcessState."""
    # dashed debug_id -> symcache (or None when no debug file is available);
    # hoisted out of the per-frame loop because parsing the archive and
    # building a symcache is expensive and identical for every frame that
    # falls into the same module.
    symcache_by_debug_id = {}

    def _symcache_for(dashed_debug_id):
        if dashed_debug_id not in symcache_by_debug_id:
            file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first()
            if file_metadata is None:
                symcache_by_debug_id[dashed_debug_id] = None
            else:
                archive = symbolic.debuginfo.Archive.from_bytes(file_metadata.file.data)
                obj = get_single_object(archive)  # was: inline list() + assert
                symcache_by_debug_id[dashed_debug_id] = obj.make_symcache()
        return symcache_by_debug_id[dashed_debug_id]

    threads = []
    for thread_i, thread in enumerate(process_state.threads()):
        thread_frames = []

        for frame in thread.frames():
            module = _find_module_for_address(process_state, frame.instruction)
            function_name = filename = None
            lineno = 0
            # initialized here: the original left these unbound (NameError)
            # for any frame without debug info or without a source bundle.
            pre_ctx, ctx_line, post_ctx = [], None, []

            if module and module.debug_id:
                dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)
                symcache = _symcache_for(dashed_debug_id)

                if symcache is not None:
                    rel = frame.instruction - module.addr
                    infos = symcache.lookup(rel) or symcache.lookup(rel - 1)  # "or -1" from ChatGPT... should we do it?
                    if infos:
                        li = infos[0]
                        function_name = li.function_name
                        filename = li.filename
                        lineno = li.line

                    # if we have line info, try the source bundle
                    src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first()
                    if src_meta and filename and lineno:
                        pre_ctx, ctx_line, post_ctx = extract_source_context(src_meta.file.data, filename, lineno)

            thread_frames.append({
                "instruction_addr": f"0x{frame.instruction:x}",
                "function": function_name or "",
                "filename": filename,
                "lineno": lineno,
                "pre_context": pre_ctx,
                "context_line": ctx_line,
                "post_context": post_ctx,
            })

        threads.append({
            "id": thread.thread_id,
            # NOTE(review): requesting_thread is an *index* into threads()
            # (merge_minidump_event does threads[process_state.requesting_thread]),
            # so compare against the enumeration index, not thread_id as the
            # original did.
            "crashed": (thread_i == process_state.requesting_thread),
            "stacktrace": {"frames": thread_frames},
        })

    return threads
diff --git a/sentry/minidump.py b/sentry/minidump.py index 8a126c3..a798505 100644 --- a/sentry/minidump.py +++ b/sentry/minidump.py @@ -2,27 +2,28 @@ # https://github.com/getsentry/sentry/blob/f0ac91f2ec6b45ad18e5eea6df72c5c72573e964/src/sentry/models/minidump.py#L53 # with (as it stands) minor modifications. -import logging -from symbolic import ProcessState +import symbolic +from files.minidump import build_cfi_map_from_minidump_bytes, event_threads_for_process_state def merge_minidump_event(data, minidump_bytes): - state = ProcessState.from_minidump_buffer(minidump_bytes) + frame_info_map = build_cfi_map_from_minidump_bytes(minidump_bytes) + process_state = symbolic.ProcessState.from_minidump_buffer(minidump_bytes, frame_infos=frame_info_map) - data['level'] = 'fatal' if state.crashed else 'info' + data['level'] = 'fatal' if process_state.crashed else 'info' - exception_value = 'Assertion Error: %s' % state.assertion if state.assertion \ - else 'Fatal Error: %s' % state.crash_reason + exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \ + else 'Fatal Error: %s' % process_state.crash_reason # NO_BANANA: data['message'] is not the right target # data['message'] = exception_value - if state.timestamp: - data['timestamp'] = float(state.timestamp) + if process_state.timestamp: + data['timestamp'] = float(process_state.timestamp) # Extract as much system information as we can. TODO: We should create # a custom context and implement a specific minidump view in the event # UI. 
- info = state.system_info + info = process_state.system_info context = data.setdefault('contexts', {}) os = context.setdefault('os', {}) device = context.setdefault('device', {}) @@ -30,46 +31,42 @@ def merge_minidump_event(data, minidump_bytes): os['version'] = info.os_version device['arch'] = info.cpu_family - # We can extract stack traces here already but since CFI is not - # available yet (without debug symbols), the stackwalker will - # resort to stack scanning which yields low-quality results. If - # the user provides us with debug symbols, we will reprocess this - # minidump and add improved stacktraces later. - threads = [{ - 'id': thread.thread_id, - 'crashed': False, - 'stacktrace': { - 'frames': [{ - 'instruction_addr': '0x%x' % frame.instruction, - 'function': '', # Required by interface - } for frame in thread.frames()], - }, - } for thread in state.threads()] - data.setdefault('threads', {})['values'] = threads + threads = event_threads_for_process_state(process_state) + data.setdefault("threads", {})["values"] = threads # Mark the crashed thread and add its stacktrace to the exception - crashed_thread = threads[state.requesting_thread] + crashed_thread = threads[process_state.requesting_thread] crashed_thread['crashed'] = True # Extract the crash reason and infos exception = { 'value': exception_value, 'thread_id': crashed_thread['id'], - 'type': state.crash_reason, + 'type': process_state.crash_reason, # Move stacktrace here from crashed_thread (mutating!) 'stacktrace': crashed_thread.pop('stacktrace'), } + for frame in exception['stacktrace']['frames']: + frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app + + exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest." + # TODO we don't have display-info for threads yet, I think? 
+ # we may need to revert the per-thread stacktraces above as well then + data.setdefault('exception', {}) \ .setdefault('values', []) \ .append(exception) # Extract referenced (not all loaded) images images = [{ - 'type': 'apple', # Required by interface - # 'uuid': module.uuid, NO_BANANA + 'type': 'elf', # TODO not sure what this should _actually_ be 'image_addr': module.addr, 'image_size': module.size, - # 'name': module.name, NO_BANANA - } for module in state.modules()] + 'code_file': module.code_file, + 'code_id': module.code_id, + 'debug_file': module.debug_file, + 'debug_id': symbolic.debuginfo.id_from_breakpad(module.debug_id) if module.debug_id else None, + } for module in process_state.modules()] + data.setdefault('debug_meta', {})['images'] = images