event.schema.json: put back in code, make test fail on invalid samples, and related fixes

This reverts course on 4201fbd778, and restores event.schema.json from that
commit.  In that commit we said: 'this is not used'. Not true: it's used in a
test, though that test used the validity check to silently skip invalid samples.

In this commit:

1. Do _not_ just silently skip invalid samples. Since we have a way of properly
   validating, let's use that so that we know how useful the samples that we have
   actually are.

2. Deal with "_meta", a field that we sometimes see in the "private samples" (data
   that ultimately comes from running a somewhat recent python-sdk against my
   actual codebase). The need for this was exposed by [1]

3. Add a test for the up-to-date-ness of event.schema.json

4. Remove the special-cased attribute checks in `is_valid`. `send_json` was, at
   the time, an opportunistic way to just get my hands on some sample data. The
   approach to validation reflected that: I just did some tests on the existence
   of certain attributes to determine which json files were even events. But in
   the end I did a full validation using an API schema, which kinda made those
   attribute checks redundant. This commit cleans up the individual checks.
This commit is contained in:
Klaas van Schelven
2024-09-16 11:28:05 +02:00
parent 359953cee2
commit f1b75aab81
4 changed files with 3810 additions and 35 deletions

10
api/LICENSE Normal file
View File

@@ -0,0 +1,10 @@
This licence applies to the file: event.schema.json
The source of this file is: https://raw.githubusercontent.com/getsentry/sentry-data-schemas/main/LICENSE
Copyright (c) 2020 Sentry (https://sentry.io) and individual contributors.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

3773
api/event.schema.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -30,46 +30,21 @@ class Command(BaseCommand):
parser.add_argument("identifiers", nargs="+")
def is_valid(self, data, identifier):
if "event_id" not in data:
self.stderr.write("%s %s" % ("Probably not a (single) event", identifier))
return False
if "platform" not in data:
# in a few cases this value isn't set either in the sentry test data but I'd rather ignore those...
# because 'platform' is such a valuable piece of info while getting a sense of the shape of the data
self.stderr.write("%s %s" % ("Platform not set", identifier))
return False
if data.get("type", "") == "transaction":
# kinda weird that this is in the "type" field rather than endpoint/envelope but who cares, that's
# where the info lives and we use it as an indicator to skip
self.stderr.write("%s %s" % ("We don't do transactions", identifier))
return False
if data.get('profile'):
# yet another case of undocumented behavior that I don't care about
# ../sentry-current/static/app/utils/profiling/profile/formats/node/trace.json
self.stderr.write("%s %s" % ("124", identifier))
return False
if data.get('message'):
# yet another case of undocumented behavior that I don't care about (top-level "message")
# ../glitchtip/events/test_data/py_hi_event.json
self.stderr.write("%s %s" % ("asdf", identifier))
return False
# In our (private) samples we often have this "_meta" field. I can't (quickly) find any documentation for it,
# nor do I have any use for it myself (i.e. I don't display this info in templates). The quickest way to get
# something to work is to just remove the info from the json. This comes with the drawback of changing data
# on-validation, but for now that's an OK trade-off.
if "_meta" in data:
del data["_meta"]
try:
schema_filename = settings.BASE_DIR / 'api/event.schema.json'
if not schema_filename.exists():
# see api/README.md for more info
self.stderr.write("%s %s" % ("No schema file, exiting", identifier))
exit()
with open(schema_filename, 'r') as f:
schema = json.loads(f.read())
jsonschema.validate(data, schema)
except jsonschema.ValidationError as e:
self.stderr.write("%s %s %s" % ("still not ok at", repr(e), identifier))
self.stderr.write("%s %s" % (repr(e), identifier))
return False
return True

View File

@@ -1,3 +1,4 @@
import requests
import os
import inspect
import uuid
@@ -12,6 +13,7 @@ from datetime import datetime, timezone
from django.test import TestCase as DjangoTestCase, TransactionTestCase
from django.contrib.auth import get_user_model
from django.test import tag
from django.conf import settings
from projects.models import Project, ProjectMembership
from releases.models import create_release_if_needed
@@ -457,6 +459,20 @@ class IntegrationTest(TransactionTestCase):
if self.verbosity > 1:
print(f"Found {len(event_samples)} event samples and {len(event_samples_private)} private event samples")
try:
github_result = requests.get(
"https://raw.githubusercontent.com/getsentry/sentry-data-schemas/main/relay/event.schema.json")
github_result.raise_for_status()
with open(settings.BASE_DIR / "api/event.schema.json", "r") as f:
my_contents = f.read()
self.assertEqual(my_contents, github_result.content.decode("utf-8"), "event.schema.json is not up-to-date")
except requests.RequestException:
# getting the latest schema "once in a while" is nice so that we can be sure we're not falling behind;
# but we don't want that to introduce a point-of-failure in our tests. So print-and-continue.
print("Could not fetch the latest event schema from GitHub; I will not fail the tests for this")
for filename in event_samples + event_samples_private:
with open(filename) as f:
data = json.loads(f.read())
@@ -468,7 +484,7 @@ class IntegrationTest(TransactionTestCase):
data["timestamp"] = time.time()
if not command.is_valid(data, filename):
continue
raise Exception("validatity check in %s: %s" % filename, command.stderr.getvalue())
response = self.client.post(
f"/api/{ project.id }/store/",
@@ -480,7 +496,8 @@ class IntegrationTest(TransactionTestCase):
},
)
self.assertEqual(
200, response.status_code, response.content if response.status_code != 302 else response.url)
200, response.status_code, "Error in %s: %s" % (
filename, response.content if response.status_code != 302 else response.url))
for event in Event.objects.all():
urls = [