mirror of
https://github.com/jlengrand/bugsink.git
synced 2026-03-10 08:01:17 +00:00
PoC: Minidumps w/ symbolication
Plenty of TODOs left, but this proves we can find: file names, function names, line numbers, and source context. See #82.
This commit is contained in:
143
files/minidump.py
Normal file
143
files/minidump.py
Normal file
@@ -0,0 +1,143 @@
|
||||
import io
|
||||
import zipfile
|
||||
import symbolic
|
||||
from sentry_sdk_extensions import capture_or_log_exception
|
||||
|
||||
from bugsink.utils import assert_
|
||||
from .models import FileMetadata
|
||||
|
||||
|
||||
def get_single_object(archive):
    """Return the single object contained in *archive*.

    Our understanding: sentry-cli uploads single-object archives, so we just
    pull the one object out. That does raise the question of why archives
    exist at all... hence the assert.
    """
    found = list(archive.iter_objects())
    assert_(len(found) == 1)
    return found[0]
||||
|
||||
|
||||
def build_cfi_map_from_minidump_bytes(minidump_bytes):
    """Build a symbolic FrameInfoMap (CFI) for the modules referenced by a minidump.

    For every module in the minidump that has a debug id and for which a "dbg"
    FileMetadata record exists, the debug object's call frame information is
    added to the map, keyed by the module's (breakpad-format) debug id.

    Returns the (possibly empty) FrameInfoMap.
    """
    process_state = symbolic.minidump.ProcessState.from_minidump_buffer(minidump_bytes)

    frame_info_map = symbolic.minidump.FrameInfoMap.new()

    for module in process_state.modules():
        if not module.debug_id:
            continue

        # module.debug_id is breakpad-formatted; FileMetadata stores the dashed form.
        dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)

        # single .first() lookup instead of the previous count()-then-get()
        # double query (and consistent with event_threads_for_process_state)
        file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first()
        if file_metadata is None:
            continue

        archive = symbolic.debuginfo.Archive.from_bytes(file_metadata.file.data)

        debug_object = get_single_object(archive)

        cfi = symbolic.minidump.CfiCache.from_object(debug_object)
        frame_info_map.add(module.debug_id, cfi)

    return frame_info_map
|
||||
|
||||
|
||||
def extract_dif_metadata(dif_bytes):
    """Extract metadata from a debug information file (DIF).

    Returns a dict with:
      "kind"     -- symbolic's object kind, e.g. "dbg", "lib", "src"
      "code_id"  -- the object's code id
      "debug_id" -- the object's debug id

    Error handling is not stabilized yet: for now any failure is re-raised so
    problems are loud during development. The previous code had unreachable
    `capture_or_log_exception(e); return {}` statements after the bare
    `raise`; that best-effort behavior is recorded below as the eventual TODO.
    """
    try:
        archive = symbolic.debuginfo.Archive.from_bytes(dif_bytes)
        debug_object = get_single_object(archive)
        return {
            "kind": debug_object.kind,  # "dbg", "lib", "src"
            "code_id": debug_object.code_id,
            "debug_id": debug_object.debug_id,
            # "file_format": debug_object.file_format,  # "elf", "macho", "pe", "sourcebundle"
        }
    except Exception:
        # TODO stabilize later: intended eventual behavior is
        # capture_or_log_exception(e); return {}
        raise
|
||||
|
||||
|
||||
def extract_source_context(src_bytes, filename, center_line, context=5):
|
||||
|
||||
# TODO the usual worries about zip bombs/memory usage apply here.
|
||||
with zipfile.ZipFile(io.BytesIO(src_bytes)) as zf:
|
||||
# sourcebundle entries use relative paths like "src/main.c" or so says ChatGPT
|
||||
candidates = [n for n in zf.namelist() if n.endswith(filename)]
|
||||
|
||||
if not candidates:
|
||||
return [], None, []
|
||||
|
||||
with zf.open(candidates[0]) as f:
|
||||
lines = f.read().decode("utf-8").splitlines()
|
||||
|
||||
# Clamp line range to valid indices
|
||||
start = max(center_line - context - 1, 0)
|
||||
end = min(center_line + context, len(lines))
|
||||
|
||||
pre_context = lines[start:center_line - 1]
|
||||
context_line = lines[center_line - 1] if 0 <= center_line - 1 < len(lines) else None
|
||||
post_context = lines[center_line:end]
|
||||
|
||||
return pre_context, context_line, post_context
|
||||
|
||||
|
||||
def _find_module_for_address(process_state, abs_addr: int):
    """Return the loaded module whose address range contains *abs_addr*, or None."""
    for candidate in process_state.modules():
        base, size = candidate.addr, candidate.size
        # skip modules without a usable (truthy) base address or size
        if not base or not size:
            continue
        if base <= abs_addr < base + size:
            return candidate
    return None
|
||||
|
||||
|
||||
def event_threads_for_process_state(process_state):
    """Convert a symbolic ProcessState into event "threads" data.

    For each frame, we try to symbolicate (function name, filename, line
    number) via an uploaded "dbg" file, and to attach source context from an
    uploaded "src" sourcebundle, both looked up by the module's debug id.

    Returns a list of thread dicts ({"id", "crashed", "stacktrace"}).
    """
    # Per-call cache: dashed debug_id -> symcache (or None). Previously the
    # archive and symcache were rebuilt for every frame of the same module.
    symcache_by_debug_id = {}

    def _get_symcache(dashed_debug_id):
        # load-or-build (and cache) the symcache for a debug id; None when no
        # "dbg" file is on record
        if dashed_debug_id not in symcache_by_debug_id:
            symcache = None
            file_metadata = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="dbg").first()
            if file_metadata:
                archive = symbolic.debuginfo.Archive.from_bytes(file_metadata.file.data)
                symcache = get_single_object(archive).make_symcache()
            symcache_by_debug_id[dashed_debug_id] = symcache
        return symcache_by_debug_id[dashed_debug_id]

    threads = []
    for thread in process_state.threads():
        thread_frames = []

        for frame in thread.frames():
            module = _find_module_for_address(process_state, frame.instruction)
            fn = file = None
            line = 0
            # initialize per frame: previously these could be unbound
            # (NameError on the first unsymbolicated frame) or carry stale
            # context from an earlier frame into this frame's dict
            pre_ctx, ctx_line, post_ctx = [], None, []

            if module and module.debug_id:
                dashed_debug_id = symbolic.debuginfo.id_from_breakpad(module.debug_id)

                symcache = _get_symcache(dashed_debug_id)
                if symcache is not None:
                    rel = frame.instruction - module.addr
                    infos = symcache.lookup(rel) or symcache.lookup(rel - 1)  # "or -1" from ChatGPT... should we do it?
                    if infos:
                        li = infos[0]
                        fn = li.function_name
                        file = li.filename
                        line = li.line

                    # if we have line info, try source bundle
                    src_meta = FileMetadata.objects.filter(debug_id=dashed_debug_id, file_type="src").first()
                    if src_meta and file and line:
                        pre_ctx, ctx_line, post_ctx = extract_source_context(src_meta.file.data, file, line)

            thread_frames.append({
                "instruction_addr": f"0x{frame.instruction:x}",
                "function": fn or "<unknown>",
                "filename": file,
                "lineno": line,
                "pre_context": pre_ctx,
                "context_line": ctx_line,
                "post_context": post_ctx,
            })

        threads.append({
            "id": thread.thread_id,
            "crashed": (thread.thread_id == process_state.requesting_thread),
            "stacktrace": {"frames": thread_frames},
        })

    return threads
|
||||
@@ -18,6 +18,7 @@ from bsmain.models import AuthToken
|
||||
|
||||
from .models import Chunk, File, FileMetadata
|
||||
from .tasks import assemble_artifact_bundle, assemble_file
|
||||
from .minidump import extract_dif_metadata
|
||||
|
||||
logger = logging.getLogger("bugsink.api")
|
||||
|
||||
@@ -256,9 +257,12 @@ def difs_assemble(request, organization_slug, project_slug):
|
||||
continue
|
||||
|
||||
file, _ = assemble_file(file_checksum, file_chunks, filename=file_info["name"])
|
||||
|
||||
symbolic_metadata = extract_dif_metadata(file.data)
|
||||
|
||||
FileMetadata.objects.get_or_create(
|
||||
debug_id=file_info.get("debug_id"),
|
||||
file_type="dif", # I think? check!
|
||||
debug_id=file_info.get("debug_id"), # TODO : .get implies "no debug_id", but in that case it's useless
|
||||
file_type=symbolic_metadata["kind"], # NOTE: symbolic's kind goes into file_type...
|
||||
defaults={
|
||||
"file": file,
|
||||
"data": "{}", # this is the "catch all" field but I don't think we have anything in this case.
|
||||
|
||||
@@ -2,27 +2,28 @@
|
||||
# https://github.com/getsentry/sentry/blob/f0ac91f2ec6b45ad18e5eea6df72c5c72573e964/src/sentry/models/minidump.py#L53
|
||||
# with (as it stands) minor modifications.
|
||||
|
||||
import logging
|
||||
from symbolic import ProcessState
|
||||
import symbolic
|
||||
from files.minidump import build_cfi_map_from_minidump_bytes, event_threads_for_process_state
|
||||
|
||||
|
||||
def merge_minidump_event(data, minidump_bytes):
|
||||
state = ProcessState.from_minidump_buffer(minidump_bytes)
|
||||
frame_info_map = build_cfi_map_from_minidump_bytes(minidump_bytes)
|
||||
process_state = symbolic.ProcessState.from_minidump_buffer(minidump_bytes, frame_infos=frame_info_map)
|
||||
|
||||
data['level'] = 'fatal' if state.crashed else 'info'
|
||||
data['level'] = 'fatal' if process_state.crashed else 'info'
|
||||
|
||||
exception_value = 'Assertion Error: %s' % state.assertion if state.assertion \
|
||||
else 'Fatal Error: %s' % state.crash_reason
|
||||
exception_value = 'Assertion Error: %s' % process_state.assertion if process_state.assertion \
|
||||
else 'Fatal Error: %s' % process_state.crash_reason
|
||||
# NO_BANANA: data['message'] is not the right target
|
||||
# data['message'] = exception_value
|
||||
|
||||
if state.timestamp:
|
||||
data['timestamp'] = float(state.timestamp)
|
||||
if process_state.timestamp:
|
||||
data['timestamp'] = float(process_state.timestamp)
|
||||
|
||||
# Extract as much system information as we can. TODO: We should create
|
||||
# a custom context and implement a specific minidump view in the event
|
||||
# UI.
|
||||
info = state.system_info
|
||||
info = process_state.system_info
|
||||
context = data.setdefault('contexts', {})
|
||||
os = context.setdefault('os', {})
|
||||
device = context.setdefault('device', {})
|
||||
@@ -30,46 +31,42 @@ def merge_minidump_event(data, minidump_bytes):
|
||||
os['version'] = info.os_version
|
||||
device['arch'] = info.cpu_family
|
||||
|
||||
# We can extract stack traces here already but since CFI is not
|
||||
# available yet (without debug symbols), the stackwalker will
|
||||
# resort to stack scanning which yields low-quality results. If
|
||||
# the user provides us with debug symbols, we will reprocess this
|
||||
# minidump and add improved stacktraces later.
|
||||
threads = [{
|
||||
'id': thread.thread_id,
|
||||
'crashed': False,
|
||||
'stacktrace': {
|
||||
'frames': [{
|
||||
'instruction_addr': '0x%x' % frame.instruction,
|
||||
'function': '<unknown>', # Required by interface
|
||||
} for frame in thread.frames()],
|
||||
},
|
||||
} for thread in state.threads()]
|
||||
data.setdefault('threads', {})['values'] = threads
|
||||
threads = event_threads_for_process_state(process_state)
|
||||
data.setdefault("threads", {})["values"] = threads
|
||||
|
||||
# Mark the crashed thread and add its stacktrace to the exception
|
||||
crashed_thread = threads[state.requesting_thread]
|
||||
crashed_thread = threads[process_state.requesting_thread]
|
||||
crashed_thread['crashed'] = True
|
||||
|
||||
# Extract the crash reason and infos
|
||||
exception = {
|
||||
'value': exception_value,
|
||||
'thread_id': crashed_thread['id'],
|
||||
'type': state.crash_reason,
|
||||
'type': process_state.crash_reason,
|
||||
# Move stacktrace here from crashed_thread (mutating!)
|
||||
'stacktrace': crashed_thread.pop('stacktrace'),
|
||||
}
|
||||
|
||||
for frame in exception['stacktrace']['frames']:
|
||||
frame['in_app'] = True # minidumps don't distinguish in_app frames; assume all are in_app
|
||||
|
||||
exception['stacktrace']['frames'].reverse() # "Frames should be sorted from oldest to newest."
|
||||
# TODO we don't have display-info for threads yet, I think?
|
||||
# we may need to revert the per-thread stacktraces above as well then
|
||||
|
||||
data.setdefault('exception', {}) \
|
||||
.setdefault('values', []) \
|
||||
.append(exception)
|
||||
|
||||
# Extract referenced (not all loaded) images
|
||||
images = [{
|
||||
'type': 'apple', # Required by interface
|
||||
# 'uuid': module.uuid, NO_BANANA
|
||||
'type': 'elf', # TODO not sure what this should _actually_ be
|
||||
'image_addr': module.addr,
|
||||
'image_size': module.size,
|
||||
# 'name': module.name, NO_BANANA
|
||||
} for module in state.modules()]
|
||||
'code_file': module.code_file,
|
||||
'code_id': module.code_id,
|
||||
'debug_file': module.debug_file,
|
||||
'debug_id': symbolic.debuginfo.id_from_breakpad(module.debug_id) if module.debug_id else None,
|
||||
} for module in process_state.modules()]
|
||||
|
||||
data.setdefault('debug_meta', {})['images'] = images
|
||||
|
||||
Reference in New Issue
Block a user