From abb84172bb14158242c7087fa8c2bafdfe0a3a80 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Fri, 1 Aug 2025 09:48:40 +0200 Subject: [PATCH] header and envelope parsers: filter, don't fail on validation failures See #179 --- bsmain/management/commands/send_json.py | 9 ++++++-- ingest/header_validators.py | 30 +++++++++++++++++++++++++ ingest/parsers.py | 6 ++--- ingest/tests.py | 15 +++++++++++++ 4 files changed, 55 insertions(+), 5 deletions(-) diff --git a/bsmain/management/commands/send_json.py b/bsmain/management/commands/send_json.py index 1b671ab..e67c3b5 100644 --- a/bsmain/management/commands/send_json.py +++ b/bsmain/management/commands/send_json.py @@ -33,6 +33,7 @@ class Command(BaseCommand): parser.add_argument( "--x-forwarded-for", action="store", help="Set the X-Forwarded-For header to test whether your setup is properly ignoring it") + parser.add_argument("--sent-at", action="store", default=None, help="Set the sent_at header to this value") parser.add_argument("kind", action="store", help="The kind of object (filename, project, issue, event)") parser.add_argument("identifiers", nargs="+") @@ -153,10 +154,14 @@ class Command(BaseCommand): data_bytes = json.dumps(data).encode("utf-8") if use_envelope: - # the smallest possible envelope: event_id = data.get("event_id", uuid.uuid4().hex) - data_bytes = (b'{"event_id": "%s"}\n{"type": "event"}\n' % event_id.encode("utf-8") + + sent_at_snip = (b',"sent_at":"%s"' % options["sent_at"].encode("utf-8")) if options["sent_at"] else b"" + + # the smallest possible envelope: + data_bytes = (b'{"event_id": "%s"' % event_id.encode("utf-8") + + sent_at_snip + + b'}\n{"type": "event"}\n' + data_bytes) if compress in ["gzip", "deflate"]: diff --git a/ingest/header_validators.py b/ingest/header_validators.py index c82caaa..a8f95b8 100644 --- a/ingest/header_validators.py +++ b/ingest/header_validators.py @@ -62,6 +62,20 @@ def validate_envelope_headers(headers): envelope_validators[key](val) +def filter_valid_envelope_headers(headers): + result = {} + + for key, val in headers.items(): + if key in envelope_validators: # this implies that only known headers remain + try: + envelope_validators[key](val) + result[key] = val + except Exception: + pass + + return result + + ALLOWED_TYPES = { "event", "transaction", "attachment", "session", "sessions", "feedback", "user_report", "client_report", "replay_event", "replay_recording", "profile", "profile_chunk", "check_in", "log", "otel_log" @@ -99,3 +113,19 @@ def validate_item_headers(headers): for key, val in headers.items(): if key in item_validators: item_validators[key](val) + + +def filter_valid_item_headers(headers): + if headers.get("type") != "event": + return headers # or {} if you want to remove all item headers + + result = {} + for key, val in headers.items(): + if key in item_validators: # this implies that only known headers remain + try: + item_validators[key](val) + result[key] = val + except Exception: + pass + + return result diff --git a/ingest/parsers.py b/ingest/parsers.py index f60f8bc..3cd419d 100644 --- a/ingest/parsers.py +++ b/ingest/parsers.py @@ -4,7 +4,7 @@ import io from bugsink.streams import MaxDataWriter from .exceptions import ParseError -from .header_validators import validate_envelope_headers, validate_item_headers +from .header_validators import filter_valid_envelope_headers, filter_valid_item_headers class NewlineFinder: @@ -149,7 +149,7 @@ class StreamingEnvelopeParser: if self.envelope_headers is None: # see test_eof_after_envelope_headers for why we don't error on EOF-after-header here self.envelope_headers = self._parse_headers(empty_is_error=True, eof_after_header_is_error=False) - validate_envelope_headers(self.envelope_headers) + self.envelope_headers = filter_valid_envelope_headers(self.envelope_headers) return self.envelope_headers @@ -166,7 +166,7 @@ class StreamingEnvelopeParser: self.at_eof = True break - validate_item_headers(item_headers) + item_headers = filter_valid_item_headers(item_headers) if "length" in item_headers: length = item_headers["length"] diff --git a/ingest/tests.py b/ingest/tests.py index 1ebbfe9..eeb3f56 100644 --- a/ingest/tests.py +++ b/ingest/tests.py @@ -32,6 +32,7 @@ from bsmain.management.commands.send_json import Command as SendJsonCommand from .views import BaseIngestAPIView from .parsers import readuntil, NewlineFinder, ParseError, LengthFinder, StreamingEnvelopeParser from .event_counter import check_for_thresholds + from bugsink.exceptions import ViolatedExpectation @@ -884,3 +885,17 @@ class TestParser(RegularTestCase): with self.assertRaises(StopIteration): header, item = next(items) + + def test_garbage_sent_at(self): + # based on test_envelope_with_2_items_last_newline_omitted, but with a garbage sent_at value + parser = StreamingEnvelopeParser(io.BytesIO(b"""{"event_id":"9ec79c33ec9942ab8353589fcb2e04dc","dsn":"https://e12d836b15bb49d7bbf99e64295d995b:@sentry.io/42","sent_at":"garbage"}\n{"type":"attachment","length":10,"content_type":"text/plain","filename":"hello.txt"}\n\xef\xbb\xbfHello\r\n\n{"type":"event","length":41,"content_type":"application/json","filename":"application.log"}\n{"message":"hello world","level":"error"}\n""")) # noqa + + envelope_headers = parser.get_envelope_headers() + + # note: sent_at filtered out + self.assertEqual( + {"event_id": "9ec79c33ec9942ab8353589fcb2e04dc", + "dsn": "https://e12d836b15bb49d7bbf99e64295d995b:@sentry.io/42"}, + envelope_headers) + + # the rest of the test is not repeated here