From 7a19e2d277cc21d5338682c00ca5a200736c7634 Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Thu, 27 Feb 2025 13:12:49 +0100 Subject: [PATCH] Tags; deducing tags; search on tags; WIP --- bugsink/moreiterutils.py | 13 +++ bugsink/settings/default.py | 1 + bugsink/utils.py | 1 + events/models.py | 6 +- ingest/views.py | 5 + issues/views.py | 68 +++++++++++-- pyproject.toml | 1 + tags/__init__.py | 0 tags/admin.py | 0 tags/apps.py | 6 ++ tags/migrations/0001_initial.py | 169 ++++++++++++++++++++++++++++++++ tags/migrations/__init__.py | 0 tags/models.py | 103 +++++++++++++++++++ tags/tests.py | 1 + tags/utils.py | 81 +++++++++++++++ 15 files changed, 447 insertions(+), 8 deletions(-) create mode 100644 tags/__init__.py create mode 100644 tags/admin.py create mode 100644 tags/apps.py create mode 100644 tags/migrations/0001_initial.py create mode 100644 tags/migrations/__init__.py create mode 100644 tags/models.py create mode 100644 tags/tests.py create mode 100644 tags/utils.py diff --git a/bugsink/moreiterutils.py b/bugsink/moreiterutils.py index 30202dc..1d5ac92 100644 --- a/bugsink/moreiterutils.py +++ b/bugsink/moreiterutils.py @@ -39,6 +39,19 @@ def pairwise(it): prev = current +def tuplewise(iterable): + """ + >>> list(tuplewise(range(4))) + [(0, 1), (2, 3)] + """ + i = iter(iterable) + while True: + try: + yield next(i), next(i) + except StopIteration: + return + + def batched(iterable, n): # itertools.batched was introduced in Python 3.12, but it's "roughly equivalent" to this: diff --git a/bugsink/settings/default.py b/bugsink/settings/default.py index e4d693e..0e4f04d 100644 --- a/bugsink/settings/default.py +++ b/bugsink/settings/default.py @@ -78,6 +78,7 @@ BUGSINK_APPS = [ 'ingest', 'issues', 'events', + 'tags', 'alerts', 'performance', diff --git a/bugsink/utils.py b/bugsink/utils.py index a41c080..4fa97e5 100644 --- a/bugsink/utils.py +++ b/bugsink/utils.py @@ -132,6 +132,7 @@ def eat_your_own_dogfood(sentry_dsn, **kwargs): "sentry", "sentry_sdk_extensions", "snappea", + "tags", "teams", "theme", "users", diff --git a/events/models.py b/events/models.py index f070847..24ca152 100644 --- a/events/models.py +++ b/events/models.py @@ -59,6 +59,10 @@ def write_to_storage(event_id, parsed_data): class Event(models.Model): + # TODO now that the Tag models are introduced, an number of the below fields are actually already stored as tags. + # At some point we should decide whether to proceed with "just as tags" or "just in the event table". Will depend on + # findings about performance (and how generic our solution with tags really is). + # Lines quotes with ">" are from the following to resources: # https://develop.sentry.dev/sdk/event-payloads/ (supposedly more human-readable) # https://develop.sentry.dev/sdk/event-payloads/types/ (more up-to-date and complete) @@ -155,7 +159,7 @@ class Event(models.Model): # them to be [yet]): # # > Optional. A map or list of tags for this event. Each tag must be less than 200 characters. - # tags + # tags = # implemented in tags/models.py # # > A list of relevant modules and their versions. # modules = diff --git a/ingest/views.py b/ingest/views.py index e4e6938..a509322 100644 --- a/ingest/views.py +++ b/ingest/views.py @@ -35,6 +35,7 @@ from events.retention import evict_for_max_events, should_evict from releases.models import create_release_if_needed from alerts.tasks import send_new_issue_alert, send_regression_alert from compat.timestamp import format_timestamp, parse_timestamp +from tags.models import digest_tags from .parsers import StreamingEnvelopeParser, ParseError from .filestore import get_filename_for_event_id @@ -389,6 +390,10 @@ class BaseIngestAPIView(View): issue.save() + # intentionally at the end: possible future optimization is to push this out of the transaction (or even use + # a separate DB for this) + digest_tags(event_data, event, issue) + @classmethod def count_project_periods_and_act_on_it(cls, project, now): # returns: True if any further processing should be done. diff --git a/issues/views.py b/issues/views.py index 2e8eb78..8b2da71 100644 --- a/issues/views.py +++ b/issues/views.py @@ -1,10 +1,11 @@ from collections import namedtuple import json import sentry_sdk +import re from django.utils import timezone from django.shortcuts import render, get_object_or_404, redirect -from django.db.models import Q +from django.db.models import Q, Subquery from django.http import HttpResponseRedirect, HttpResponseNotAllowed from django.utils.safestring import mark_safe from django.template.defaultfilters import date @@ -13,6 +14,9 @@ from django.core.exceptions import PermissionDenied from django.http import Http404 from django.core.paginator import Paginator +from sentry.utils.safe import get_path + +from bugsink.moreiterutils import tuplewise from bugsink.decorators import project_membership_required, issue_membership_required, atomic_for_request_method from bugsink.transaction import durable_atomic from bugsink.period_utils import add_periods_to_datetime @@ -22,7 +26,7 @@ from events.ua_stuff import get_contexts_enriched_with_ua from compat.timestamp import format_timestamp from projects.models import ProjectMembership -from sentry.utils.safe import get_path +from tags.models import TagValue, IssueTag from .models import Issue, IssueQuerysetStateManager, IssueStateManager, TurningPoint, TurningPointKind from .forms import CommentForm @@ -216,6 +220,60 @@ def _issue_list_pt_1(request, project, state_filter="open"): return project, unapplied_issue_ids +def remove_slices(s, slices_to_remove): + """Returns s with the slices removed.""" + items = [item for tup in slices_to_remove for item in tup] + slices_to_preserve = tuplewise([0] + items + [None]) + return "".join(s[start:stop] for start, stop in slices_to_preserve) + + +def _and_join(q_objects): + if len(q_objects) == 0: + # we _could_ just return Q(), but I'd force the calling location to not do this + raise ValueError("empty list of Q objects") + + result = q_objects[0] + for q in q_objects[1:]: + result &= q + + return result + + +def _search(project, issue_list, q): + # The simplest possible query-language that could have any value: key:value is recognized as such; the rest is "free + # text"; no support for quoting of spaces. + slices_to_remove = [] + clauses = [] + for match in re.finditer(r"(\S+):(\S+)", q): + slices_to_remove.append(match.span()) + key, value = match.groups() + try: + tag_value_obj = TagValue.objects.get(project=project, key__key=key, value=value) + except TagValue.DoesNotExist: + # if the tag doesn't exist, we can't have any issues with it; the below short-circuit is fine, I think (I + # mean: we _could_ say "tag x is to blame" but that's not what one does generally in search, is it? + return issue_list.none() + + # TODO: Extensive performance testing of various choices here is necessary; in particular the choice of Subquery + # vs. joins; and the choice of a separate query to get TagValue v.s. doing everything in a single big query will + # have different trade-offs _in practice_. + clauses.append( + Q(id__in=Subquery(IssueTag.objects.filter(value=tag_value_obj).values_list("issue_id", flat=True)))) + + # this is really TSTTCPW (or more like a "fake it till you make it" thing); but I'd rather "have something" and then + # have really-good-search than to have either nothing at all, or half-baked search. Note that we didn't even bother + # to set indexes on the fields we search on (nor create a single searchable field for the whole of 'title'). + plain_text_q = remove_slices(q, slices_to_remove).strip() + if plain_text_q: + clauses.append(Q(Q(calculated_type__icontains=plain_text_q) | Q(calculated_value__icontains=plain_text_q))) + + # if we reach this point, there's always either a plain_text_q or some key/value pair (this is a condition for + # _and_join) + issue_list = issue_list.filter(_and_join(clauses)) + + return issue_list + + def _issue_list_pt_2(request, project, state_filter, unapplied_issue_ids): d_state_filter = { "open": lambda qs: qs.filter(is_resolved=False, is_muted=False), @@ -229,12 +287,8 @@ def _issue_list_pt_2(request, project, state_filter, unapplied_issue_ids): Issue.objects.filter(project=project) ).order_by("-last_seen") - # this is really TSTTCPW (or more like a "fake it till you make it" thing); but I'd rather "have something" and then - # have really-good-search than to have either nothing at all, or half-baked search. Note that we didn't even bother - # to set indexes on the fields we search on (nor create a single searchable field for the whole of 'title'). if request.GET.get("q"): - issue_list = issue_list.filter( - Q(calculated_type__icontains=request.GET["q"]) | Q(calculated_value__icontains=request.GET["q"])) + issue_list = _search(project, issue_list, request.GET["q"]) paginator = Paginator(issue_list, 250) page_number = request.GET.get("page") diff --git a/pyproject.toml b/pyproject.toml index d6a699d..76b2662 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ include = [ "sentry_sdk_extensions*", "snappea*", "static*", + "tags*", "teams*", "templates*", "theme*", diff --git a/tags/__init__.py b/tags/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tags/admin.py b/tags/admin.py new file mode 100644 index 0000000..e69de29 diff --git a/tags/apps.py b/tags/apps.py new file mode 100644 index 0000000..1169a97 --- /dev/null +++ b/tags/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class TagsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "tags" diff --git a/tags/migrations/0001_initial.py b/tags/migrations/0001_initial.py new file mode 100644 index 0000000..3cdf744 --- /dev/null +++ b/tags/migrations/0001_initial.py @@ -0,0 +1,169 @@ +# Generated by Django 4.2.19 on 2025-02-27 12:04 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ("events", "0019_event_storage_backend"), + ("issues", "0010_issue_list_indexes"), + ("projects", "0011_fill_stored_event_count"), + ] + + operations = [ + migrations.CreateModel( + name="TagKey", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("key", models.CharField(max_length=32)), + ( + "project", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + to="projects.project", + ), + ), + ], + ), + migrations.CreateModel( + name="TagValue", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("value", models.CharField(db_index=True, max_length=200)), + ( + "key", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="tags.tagkey" + ), + ), + ( + "project", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + to="projects.project", + ), + ), + ], + options={ + "unique_together": {("project", "key", "value")}, + }, + ), + migrations.CreateModel( + name="IssueTag", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "issue", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="tags", + to="issues.issue", + ), + ), + ( + "project", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + to="projects.project", + ), + ), + ( + "value", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="tags.tagvalue" + ), + ), + ], + ), + migrations.CreateModel( + name="EventTag", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "event", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="tags", + to="events.event", + ), + ), + ( + "project", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + to="projects.project", + ), + ), + ( + "value", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="tags.tagvalue" + ), + ), + ], + ), + migrations.AddIndex( + model_name="tagkey", + index=models.Index(fields=["key"], name="tags_tagkey_key_5e8a5a_idx"), + ), + migrations.AlterUniqueTogether( + name="tagkey", + unique_together={("project", "key")}, + ), + migrations.AddIndex( + model_name="issuetag", + index=models.Index( + fields=["issue", "value"], name="tags_issuet_issue_i_f06f20_idx" + ), + ), + migrations.AlterUniqueTogether( + name="issuetag", + unique_together={("value", "issue")}, + ), + migrations.AlterUniqueTogether( + name="eventtag", + unique_together={("value", "event")}, + ), + ] diff --git a/tags/migrations/__init__.py b/tags/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tags/models.py b/tags/models.py new file mode 100644 index 0000000..2d01510 --- /dev/null +++ b/tags/models.py @@ -0,0 +1,103 @@ +""" +Tags provide support for arbitrary key/value pairs (both strings) on Events and Issues, allowing for searching & +counting. Some notes: + +* Arbitrary Tags can be set programatically in the SDKs, which we need to support (Sentry API Compatability). +* Some "synthetic" Tags are introduced by Bugsink itself: attributes of an Event are deduced and stored explicitly as a + Tag. The main reason to do this: stay flexible in terms of DB design and allow for generic code for searching and + counting. _However_, we don't make a commitment to any particular implementation, and if the deduce-and-store approach + turns out to be a performance bottleneck, it may be replaced. Particular notes on what we deduce are in `deduce_tags`. + +https://docs.sentry.io/platforms/python/enriching-events/tags/ + +> Tag keys have a maximum length of 32 characters and can contain only letters (a-zA-Z), numbers (0-9), underscores (_), +> periods (.), colons (:), and dashes (-). +> +> Tag values have a maximum length of 200 characters and they cannot contain the newline (\n) character. +""" + + +from django.db import models + +from projects.models import Project +from tags.utils import deduce_tags + + +class TagKey(models.Model): + project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' + key = models.CharField(max_length=32, blank=False, null=False) + + # I briefly considered being explicit about is_deduced; but it's annoying to store this info on the TagKey, and it's + # probably redundant if we just come up with a list of "reserved" tags or similar. + # is_deduced = models.BooleanField(default=False) + + class Meta: + unique_together = ('project', 'key') + indexes = [ + # Untested assumption: when searching, we search by TagValue[..]key__key=key, so we need an + # index-not-prefixed-by-project too + models.Index(fields=['key']), + ] + + +class TagValue(models.Model): + project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' + # CASCADE: TBD; one argument might be: decouple deletions from events ingestion, but at least have them internally + # consistent + key = models.ForeignKey(TagKey, blank=False, null=False, on_delete=models.CASCADE) + value = models.CharField(max_length=200, blank=False, null=False, db_index=True) + + class Meta: + # matches what we do in search + unique_together = ('project', 'key', 'value') + + def __str__(self): + return f"{self.key.key}:{self.value}" + + +class EventTag(models.Model): + project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' + + # value already implies key in our current setup. + value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE) + + event = models.ForeignKey('events.Event', blank=False, null=True, on_delete=models.SET_NULL, related_name='tags') + + class Meta: + unique_together = ('value', 'event') + + +# class GroupingTag is not needed (not even for future-proofing); it would only be needed if you'd want to "unmerge" +# manually merged issues while preserving the tags/counts. I think that's in "not worth it" territory. + + +class IssueTag(models.Model): + project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later' + + # value already implies key in our current setup. + value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE) + + issue = models.ForeignKey('issues.Issue', blank=False, null=True, on_delete=models.SET_NULL, related_name='tags') + + class Meta: + # searching: by value, then Issue (is this so though? .qs is already on the other!) + unique_together = ('value', 'issue') + indexes = [ + models.Index(fields=['issue', 'value']), # make lookups by issue (for detail pages) faster + ] + + +def digest_tags(event_data, event, issue): + tags = deduce_tags(event_data) + + for key, value in tags.items(): + # The max length of 200 is from TFM for user-provided tags. Still, we just apply it on synthetic tags as well; + # It's a reasonably safe guess that this will not terribly confuse people, and avoids triggering errors on-save. + value = value[:200] + + # TODO just use bulk_create for each of the types of objects + # TODO check: event.project won't trigger a query, right? it's already loaded, right? + tag_key, _ = TagKey.objects.get_or_create(project=event.project, key=key) + tag_value, _ = TagValue.objects.get_or_create(project=event.project, key=tag_key, value=value) + EventTag.objects.get_or_create(project=event.project, value=tag_value, event=event) + IssueTag.objects.get_or_create(project=event.project, value=tag_value, issue=issue) diff --git a/tags/tests.py b/tags/tests.py new file mode 100644 index 0000000..e4defab --- /dev/null +++ b/tags/tests.py @@ -0,0 +1 @@ +# from django.test import TestCase diff --git a/tags/utils.py b/tags/utils.py new file mode 100644 index 0000000..f0a825a --- /dev/null +++ b/tags/utils.py @@ -0,0 +1,81 @@ +from events.ua_stuff import get_contexts_enriched_with_ua +from sentry.utils.safe import get_path + + +EVENT_DATA_CONVERSION_TABLE = { + # NOTE that "level" is not included here; Sentry puts this front and center, and although we may give it _some_ + # place in the UI, Bugsink's take is that "level: Error" in an Error Tracker is an open door/waste of space. + "server_name": "server_name", + "release": "release", + "environment": "environment", + "transaction": "transaction", +} + + +CONTEXT_CONVERSION_TABLE = { + "trace": ("trace", "trace_id"), + "trace.span": ("trace", "span_id"), + "browser.name": ("browser", "name"), + "os.name": ("os", "name"), + + # TODO probably useful, simply not done yet: + # runtime + # runtime.name + # device.something +} + + +def deduce_user_tag(contexts): + # quick & dirty / barebones implementation; we don't try to mimick Sentry's full behavior, instead just pick the + # most relevant piece of the user context that we can find. For reference, Sentry has the concept of an "EventUser" + # (`src/sentry/models/eventuser.py`) + + if "user" not in contexts: + return None + + for key in ["id", "username", "email", "ip_address"]: + if contexts["user"].get(key): + return contexts["user"][key] + + return None + + +def deduce_tags(event_data): + """ + Deduce tags for `event_data`. Used as an "opportunistic" (generic) way to implement counting and searching. Although + Sentry does something similar, we're not striving to replicate Sentry's behavior (tag names etc). In particular, we + feel that Sentry's choices are poorly documented, the separation of concerns between events/contexts/tags is + unclear, and some choices are straight up not so great. We'd rather think about what information matters ourselves. + """ + + # we start with the explicitly provided tags + tags = event_data.get('tags', {}) + + for tag_key, lookup_path in EVENT_DATA_CONVERSION_TABLE.items(): + value = get_path(event_data, lookup_path) + if value not in [None, ""]: + tags[tag_key] = value + + # deduce from contexts + contexts = get_contexts_enriched_with_ua(event_data) + + for tag_key, path in CONTEXT_CONVERSION_TABLE.items(): + value = get_path(contexts, *path) + if value not in [None, ""]: + tags[tag_key] = value + + if "trace" in tags and "trace.span" in tags: + tags["trace.ctx"] = f"{tags['trace']}.{tags['trace.span']}" + + if "browser.name" in tags and (browser_version := tags.get("browser.version")): + tags["browser"] = f"{tags['browser.name']} {browser_version}" + + if user_tag := deduce_user_tag(contexts): + tags["user"] = user_tag + + # TODO further tags that are probably useful: + # url + # logger + # mechanism + + return tags