From 7a19e2d277cc21d5338682c00ca5a200736c7634 Mon Sep 17 00:00:00 2001
From: Klaas van Schelven <klaas@vanschelven.com>
Date: Thu, 27 Feb 2025 13:12:49 +0100
Subject: [PATCH] Tags; deducing tags; search on tags; WIP

---
 bugsink/moreiterutils.py        |  13 +++
 bugsink/settings/default.py     |   1 +
 bugsink/utils.py                |   1 +
 events/models.py                |   6 +-
 ingest/views.py                 |   5 +
 issues/views.py                 |  68 +++++++++++--
 pyproject.toml                  |   1 +
 tags/__init__.py                |   0
 tags/admin.py                   |   0
 tags/apps.py                    |   6 ++
 tags/migrations/0001_initial.py | 169 ++++++++++++++++++++++++++++++++
 tags/migrations/__init__.py     |   0
 tags/models.py                  | 103 +++++++++++++++++++
 tags/tests.py                   |   1 +
 tags/utils.py                   |  81 +++++++++++++++
 15 files changed, 447 insertions(+), 8 deletions(-)
 create mode 100644 tags/__init__.py
 create mode 100644 tags/admin.py
 create mode 100644 tags/apps.py
 create mode 100644 tags/migrations/0001_initial.py
 create mode 100644 tags/migrations/__init__.py
 create mode 100644 tags/models.py
 create mode 100644 tags/tests.py
 create mode 100644 tags/utils.py

diff --git a/bugsink/moreiterutils.py b/bugsink/moreiterutils.py
index 30202dc..1d5ac92 100644
--- a/bugsink/moreiterutils.py
+++ b/bugsink/moreiterutils.py
@@ -39,6 +39,19 @@ def pairwise(it):
         prev = current
 
 
+def tuplewise(iterable):
+    """
+    >>> list(tuplewise(range(4)))
+    [(0, 1), (2, 3)]
+    """
+    i = iter(iterable)
+    while True:
+        try:
+            yield next(i), next(i)
+        except StopIteration:
+            return
+
+
 def batched(iterable, n):
     # itertools.batched was introduced in Python 3.12, but it's "roughly equivalent" to this:
 
diff --git a/bugsink/settings/default.py b/bugsink/settings/default.py
index e4d693e..0e4f04d 100644
--- a/bugsink/settings/default.py
+++ b/bugsink/settings/default.py
@@ -78,6 +78,7 @@ BUGSINK_APPS = [
     'ingest',
     'issues',
     'events',
+    'tags',
     'alerts',
 
     'performance',
diff --git a/bugsink/utils.py b/bugsink/utils.py
index a41c080..4fa97e5 100644
--- a/bugsink/utils.py
+++ b/bugsink/utils.py
@@ -132,6 +132,7 @@ def eat_your_own_dogfood(sentry_dsn, **kwargs):
             "sentry",
             "sentry_sdk_extensions",
             "snappea",
+            "tags",
             "teams",
             "theme",
             "users",
diff --git a/events/models.py b/events/models.py
index f070847..24ca152 100644
--- a/events/models.py
+++ b/events/models.py
@@ -59,6 +59,10 @@ def write_to_storage(event_id, parsed_data):
 
 
 class Event(models.Model):
+    # TODO now that the Tag models are introduced, an number of the below fields are actually already stored as tags.
+    # At some point we should decide whether to proceed with "just as tags" or "just in the event table". Will depend on
+    # findings about performance (and how generic our solution with tags really is).
+
     # Lines quotes with ">" are from the following to resources:
     # https://develop.sentry.dev/sdk/event-payloads/ (supposedly more human-readable)
     # https://develop.sentry.dev/sdk/event-payloads/types/ (more up-to-date and complete)
@@ -155,7 +159,7 @@ class Event(models.Model):
     # them to be [yet]):
     #
     # > Optional. A map or list of tags for this event. Each tag must be less than 200 characters.
-    # tags
+    # tags =   # implemented in tags/models.py
     #
     # > A list of relevant modules and their versions.
     # modules =
diff --git a/ingest/views.py b/ingest/views.py
index e4e6938..a509322 100644
--- a/ingest/views.py
+++ b/ingest/views.py
@@ -35,6 +35,7 @@ from events.retention import evict_for_max_events, should_evict
 from releases.models import create_release_if_needed
 from alerts.tasks import send_new_issue_alert, send_regression_alert
 from compat.timestamp import format_timestamp, parse_timestamp
+from tags.models import digest_tags
 
 from .parsers import StreamingEnvelopeParser, ParseError
 from .filestore import get_filename_for_event_id
@@ -389,6 +390,10 @@ class BaseIngestAPIView(View):
 
         issue.save()
 
+        # intentionally at the end: possible future optimization is to push this out of the transaction (or even use
+        # a separate DB for this)
+        digest_tags(event_data, event, issue)
+
     @classmethod
     def count_project_periods_and_act_on_it(cls, project, now):
         # returns: True if any further processing should be done.
diff --git a/issues/views.py b/issues/views.py
index 2e8eb78..8b2da71 100644
--- a/issues/views.py
+++ b/issues/views.py
@@ -1,10 +1,11 @@
 from collections import namedtuple
 import json
 import sentry_sdk
+import re
 
 from django.utils import timezone
 from django.shortcuts import render, get_object_or_404, redirect
-from django.db.models import Q
+from django.db.models import Q, Subquery
 from django.http import HttpResponseRedirect, HttpResponseNotAllowed
 from django.utils.safestring import mark_safe
 from django.template.defaultfilters import date
@@ -13,6 +14,9 @@ from django.core.exceptions import PermissionDenied
 from django.http import Http404
 from django.core.paginator import Paginator
 
+from sentry.utils.safe import get_path
+
+from bugsink.moreiterutils import tuplewise
 from bugsink.decorators import project_membership_required, issue_membership_required, atomic_for_request_method
 from bugsink.transaction import durable_atomic
 from bugsink.period_utils import add_periods_to_datetime
@@ -22,7 +26,7 @@ from events.ua_stuff import get_contexts_enriched_with_ua
 
 from compat.timestamp import format_timestamp
 from projects.models import ProjectMembership
-from sentry.utils.safe import get_path
+from tags.models import TagValue, IssueTag
 
 from .models import Issue, IssueQuerysetStateManager, IssueStateManager, TurningPoint, TurningPointKind
 from .forms import CommentForm
@@ -216,6 +220,60 @@ def _issue_list_pt_1(request, project, state_filter="open"):
     return project, unapplied_issue_ids
 
 
+def remove_slices(s, slices_to_remove):
+    """Returns s with the slices removed."""
+    items = [item for tup in slices_to_remove for item in tup]
+    slices_to_preserve = tuplewise([0] + items + [None])
+    return "".join(s[start:stop] for start, stop in slices_to_preserve)
+
+
+def _and_join(q_objects):
+    if len(q_objects) == 0:
+        # we _could_ just return Q(), but I'd force the calling location to not do this
+        raise ValueError("empty list of Q objects")
+
+    result = q_objects[0]
+    for q in q_objects[1:]:
+        result &= q
+
+    return result
+
+
+def _search(project, issue_list, q):
+    # The simplest possible query-language that could have any value: key:value is recognized as such; the rest is "free
+    # text"; no support for quoting of spaces.
+    slices_to_remove = []
+    clauses = []
+    for match in re.finditer(r"(\S+):(\S+)", q):
+        slices_to_remove.append(match.span())
+        key, value = match.groups()
+        try:
+            tag_value_obj = TagValue.objects.get(project=project, key__key=key, value=value)
+        except TagValue.DoesNotExist:
+            # if the tag doesn't exist, we can't have any issues with it; the below short-circuit is fine, I think (I
+            # mean: we _could_ say "tag x is to blame" but that's not what one does generally in search, is it?
+            return issue_list.none()
+
+        # TODO: Extensive performance testing of various choices here is necessary; in particular the choice of Subquery
+        # vs. joins; and the choice of a separate query to get TagValue v.s. doing everything in a single big query will
+        # have different trade-offs _in practice_.
+        clauses.append(
+            Q(id__in=Subquery(IssueTag.objects.filter(value=tag_value_obj).values_list("issue_id", flat=True))))
+
+    # this is really TSTTCPW (or more like a "fake it till you make it" thing); but I'd rather "have something" and then
+    # have really-good-search than to have either nothing at all, or half-baked search. Note that we didn't even bother
+    # to set indexes on the fields we search on (nor create a single searchable field for the whole of 'title').
+    plain_text_q = remove_slices(q, slices_to_remove).strip()
+    if plain_text_q:
+        clauses.append(Q(Q(calculated_type__icontains=plain_text_q) | Q(calculated_value__icontains=plain_text_q)))
+
+    # if we reach this point, there's always either a plain_text_q or some key/value pair (this is a condition for
+    # _and_join)
+    issue_list = issue_list.filter(_and_join(clauses))
+
+    return issue_list
+
+
 def _issue_list_pt_2(request, project, state_filter, unapplied_issue_ids):
     d_state_filter = {
         "open": lambda qs: qs.filter(is_resolved=False, is_muted=False),
@@ -229,12 +287,8 @@ def _issue_list_pt_2(request, project, state_filter, unapplied_issue_ids):
         Issue.objects.filter(project=project)
     ).order_by("-last_seen")
 
-    # this is really TSTTCPW (or more like a "fake it till you make it" thing); but I'd rather "have something" and then
-    # have really-good-search than to have either nothing at all, or half-baked search. Note that we didn't even bother
-    # to set indexes on the fields we search on (nor create a single searchable field for the whole of 'title').
     if request.GET.get("q"):
-        issue_list = issue_list.filter(
-            Q(calculated_type__icontains=request.GET["q"]) | Q(calculated_value__icontains=request.GET["q"]))
+        issue_list = _search(project, issue_list, request.GET["q"])
 
     paginator = Paginator(issue_list, 250)
     page_number = request.GET.get("page")
diff --git a/pyproject.toml b/pyproject.toml
index d6a699d..76b2662 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,6 +54,7 @@ include = [
     "sentry_sdk_extensions*",
     "snappea*",
     "static*",
+    "tags*",
     "teams*",
     "templates*",
     "theme*",
diff --git a/tags/__init__.py b/tags/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tags/admin.py b/tags/admin.py
new file mode 100644
index 0000000..e69de29
diff --git a/tags/apps.py b/tags/apps.py
new file mode 100644
index 0000000..1169a97
--- /dev/null
+++ b/tags/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class TagsConfig(AppConfig):
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "tags"
diff --git a/tags/migrations/0001_initial.py b/tags/migrations/0001_initial.py
new file mode 100644
index 0000000..3cdf744
--- /dev/null
+++ b/tags/migrations/0001_initial.py
@@ -0,0 +1,169 @@
+# Generated by Django 4.2.19 on 2025-02-27 12:04
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        ("events", "0019_event_storage_backend"),
+        ("issues", "0010_issue_list_indexes"),
+        ("projects", "0011_fill_stored_event_count"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="TagKey",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("key", models.CharField(max_length=32)),
+                (
+                    "project",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        to="projects.project",
+                    ),
+                ),
+            ],
+        ),
+        migrations.CreateModel(
+            name="TagValue",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("value", models.CharField(db_index=True, max_length=200)),
+                (
+                    "key",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE, to="tags.tagkey"
+                    ),
+                ),
+                (
+                    "project",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        to="projects.project",
+                    ),
+                ),
+            ],
+            options={
+                "unique_together": {("project", "key", "value")},
+            },
+        ),
+        migrations.CreateModel(
+            name="IssueTag",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "issue",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        related_name="tags",
+                        to="issues.issue",
+                    ),
+                ),
+                (
+                    "project",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        to="projects.project",
+                    ),
+                ),
+                (
+                    "value",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE, to="tags.tagvalue"
+                    ),
+                ),
+            ],
+        ),
+        migrations.CreateModel(
+            name="EventTag",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "event",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        related_name="tags",
+                        to="events.event",
+                    ),
+                ),
+                (
+                    "project",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        to="projects.project",
+                    ),
+                ),
+                (
+                    "value",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE, to="tags.tagvalue"
+                    ),
+                ),
+            ],
+        ),
+        migrations.AddIndex(
+            model_name="tagkey",
+            index=models.Index(fields=["key"], name="tags_tagkey_key_5e8a5a_idx"),
+        ),
+        migrations.AlterUniqueTogether(
+            name="tagkey",
+            unique_together={("project", "key")},
+        ),
+        migrations.AddIndex(
+            model_name="issuetag",
+            index=models.Index(
+                fields=["issue", "value"], name="tags_issuet_issue_i_f06f20_idx"
+            ),
+        ),
+        migrations.AlterUniqueTogether(
+            name="issuetag",
+            unique_together={("value", "issue")},
+        ),
+        migrations.AlterUniqueTogether(
+            name="eventtag",
+            unique_together={("value", "event")},
+        ),
+    ]
diff --git a/tags/migrations/__init__.py b/tags/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tags/models.py b/tags/models.py
new file mode 100644
index 0000000..2d01510
--- /dev/null
+++ b/tags/models.py
@@ -0,0 +1,103 @@
+"""
+Tags provide support for arbitrary key/value pairs (both strings) on Events and Issues, allowing for searching &
+counting. Some notes:
+
+* Arbitrary Tags can be set programatically in the SDKs, which we need to support (Sentry API Compatability).
+* Some "synthetic" Tags are introduced by Bugsink itself: attributes of an Event are deduced and stored explicitly as a
+  Tag. The main reason to do this: stay flexible in terms of DB design and allow for generic code for searching and
+  counting. _However_, we don't make a commitment to any particular implementation, and if the deduce-and-store approach
+  turns out to be a performance bottleneck, it may be replaced. Particular notes on what we deduce are in `deduce_tags`.
+
+https://docs.sentry.io/platforms/python/enriching-events/tags/
+
+> Tag keys have a maximum length of 32 characters and can contain only letters (a-zA-Z), numbers (0-9), underscores (_),
+> periods (.), colons (:), and dashes (-).
+>
+> Tag values have a maximum length of 200 characters and they cannot contain the newline (\n) character.
+"""
+
+
+from django.db import models
+
+from projects.models import Project
+from tags.utils import deduce_tags
+
+
+class TagKey(models.Model):
+    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'
+    key = models.CharField(max_length=32, blank=False, null=False)
+
+    # I briefly considered being explicit about is_deduced; but it's annoying to store this info on the TagKey, and it's
+    # probably redundant if we just come up with a list of "reserved" tags or similar.
+    # is_deduced = models.BooleanField(default=False)
+
+    class Meta:
+        unique_together = ('project', 'key')
+        indexes = [
+            # Untested assumption: when searching, we search by TagValue[..]key__key=key, so we need an
+            # index-not-prefixed-by-project too
+            models.Index(fields=['key']),
+        ]
+
+
+class TagValue(models.Model):
+    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'
+    # CASCADE: TBD; one argument might be: decouple deletions from events ingestion, but at least have them internally
+    # consistent
+    key = models.ForeignKey(TagKey, blank=False, null=False, on_delete=models.CASCADE)
+    value = models.CharField(max_length=200, blank=False, null=False, db_index=True)
+
+    class Meta:
+        # matches what we do in search
+        unique_together = ('project', 'key', 'value')
+
+    def __str__(self):
+        return f"{self.key.key}:{self.value}"
+
+
+class EventTag(models.Model):
+    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'
+
+    # value already implies key in our current setup.
+    value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE)
+
+    event = models.ForeignKey('events.Event', blank=False, null=True, on_delete=models.SET_NULL, related_name='tags')
+
+    class Meta:
+        unique_together = ('value', 'event')
+
+
+# class GroupingTag is not needed (not even for future-proofing); it would only be needed if you'd want to "unmerge"
+# manually merged issues while preserving the tags/counts. I think that's in "not worth it" territory.
+
+
+class IssueTag(models.Model):
+    project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL)  # SET_NULL: cleanup 'later'
+
+    # value already implies key in our current setup.
+    value = models.ForeignKey(TagValue, blank=False, null=False, on_delete=models.CASCADE)
+
+    issue = models.ForeignKey('issues.Issue', blank=False, null=True, on_delete=models.SET_NULL, related_name='tags')
+
+    class Meta:
+        # searching: by value, then Issue (is this so though? .qs is already on the other!)
+        unique_together = ('value', 'issue')
+        indexes = [
+            models.Index(fields=['issue', 'value']),  # make lookups by issue (for detail pages) faster
+        ]
+
+
+def digest_tags(event_data, event, issue):
+    tags = deduce_tags(event_data)
+
+    for key, value in tags.items():
+        # The max length of 200 is from TFM for user-provided tags. Still, we just apply it on synthetic tags as well;
+        # It's a reasonably safe guess that this will not terribly confuse people, and avoids triggering errors on-save.
+        value = value[:200]
+
+        # TODO just use bulk_create for each of the types of objects
+        # TODO check: event.project won't trigger a query, right? it's already loaded, right?
+        tag_key, _ = TagKey.objects.get_or_create(project=event.project, key=key)
+        tag_value, _ = TagValue.objects.get_or_create(project=event.project, key=tag_key, value=value)
+        EventTag.objects.get_or_create(project=event.project, value=tag_value, event=event)
+        IssueTag.objects.get_or_create(project=event.project, value=tag_value, issue=issue)
diff --git a/tags/tests.py b/tags/tests.py
new file mode 100644
index 0000000..e4defab
--- /dev/null
+++ b/tags/tests.py
@@ -0,0 +1 @@
+# from django.test import TestCase
diff --git a/tags/utils.py b/tags/utils.py
new file mode 100644
index 0000000..f0a825a
--- /dev/null
+++ b/tags/utils.py
@@ -0,0 +1,81 @@
+from events.ua_stuff import get_contexts_enriched_with_ua
+from sentry.utils.safe import get_path
+
+
+EVENT_DATA_CONVERSION_TABLE = {
+    # NOTE that "level" is not included here; Sentry puts this front and center, and although we may give it _some_
+    # place in the UI, Bugsink's take is that "level: Error" in an Error Tracker is an open door/waste of space.
+    "server_name": "server_name",
+    "release": "release",
+    "environment": "environment",
+    "transaction": "transaction",
+}
+
+
+CONTEXT_CONVERSION_TABLE = {
+    "trace": ("trace", "trace_id"),
+    "trace.span": ("trace", "span_id"),
+    "browser.name": ("browser", "name"),
+    "os.name": ("os", "name"),
+
+    # TODO probably useful, simply not done yet:
+    # runtime
+    # runtime.name
+    # device.something
+}
+
+
+def deduce_user_tag(contexts):
+    # quick & dirty / barebones implementation; we don't try to mimick Sentry's full behavior, instead just pick the
+    # most relevant piece of the user context that we can find. For reference, Sentry has the concept of an "EventUser"
+    # (`src/sentry/models/eventuser.py`)
+
+    if "user" not in contexts:
+        return None
+
+    for key in ["id", "username", "email", "ip_address"]:
+        if contexts["user"].get(key):
+            return contexts["user"][key]
+
+    return None
+
+
+def deduce_tags(event_data):
+    """
+    Deduce tags for `event_data`. Used as an "opportunistic" (generic) way to implement counting and searching. Although
+    Sentry does something similar, we're not striving to replicate Sentry's behavior (tag names etc). In particular, we
+    feel that Sentry's choices are poorly documented, the separation of concerns between events/contexts/tags is
+    unclear, and some choices are straight up not so great. We'd rather think about what information matters ourselves.
+    """
+
+    # we start with the explicitly provided tags
+    tags = event_data.get('tags', {})
+
+    for tag_key, lookup_path in EVENT_DATA_CONVERSION_TABLE.items():
+        value = get_path(event_data, lookup_path)
+        if value not in [None, ""]:
+            tags[tag_key] = value
+
+    # deduce from contexts
+    contexts = get_contexts_enriched_with_ua(event_data)
+
+    for tag_key, path in CONTEXT_CONVERSION_TABLE.items():
+        value = get_path(contexts, *path)
+        if value not in [None, ""]:
+            tags[tag_key] = value
+
+    if "trace" in tags and "trace.span" in tags:
+        tags["trace.ctx"] = f"{tags['trace']}.{tags['trace.span']}"
+
+    if "browser.name" in tags and (browser_version := tags.get("browser.version")):
+        tags["browser"] = f"{tags['browser.name']} {browser_version}"
+
+    if user_tag := deduce_user_tag(contexts):
+        tags["user"] = user_tag
+
+    # TODO further tags that are probably useful:
+    # url
+    # logger
+    # mechanism
+
+    return tags