Add 'mostly_unique' property to tags

This commit is contained in:
Klaas van Schelven
2025-03-03 10:52:28 +01:00
parent 1b9a76bc17
commit 00c49443eb
4 changed files with 32 additions and 8 deletions

View File

@@ -116,7 +116,7 @@ class Issue(models.Model):
# the 2-step process allows for the filter on count;
# one could argue that this is also possible in a single query though...
ds = self.tags.order_by("value__key__key").values("value__key")\
ds = self.tags.filter(value__key__mostly_unique=False).order_by("value__key__key").values("value__key")\
.annotate(cnt=models.Count("value")).distinct()
for d in ds:

View File

@@ -1,4 +1,4 @@
# Generated by Django 4.2.19 on 2025-02-27 19:46
# Generated by Django 4.2.19 on 2025-03-03 09:42
from django.db import migrations, models
import django.db.models.deletion
@@ -10,8 +10,8 @@ class Migration(migrations.Migration):
dependencies = [
("issues", "0010_issue_list_indexes"),
("projects", "0011_fill_stored_event_count"),
("events", "0019_event_storage_backend"),
("projects", "0011_fill_stored_event_count"),
]
operations = [
@@ -28,6 +28,7 @@ class Migration(migrations.Migration):
),
),
("key", models.CharField(max_length=32)),
("mostly_unique", models.BooleanField(default=False)),
(
"project",
models.ForeignKey(

View File

@@ -5,8 +5,9 @@ counting. Some notes:
* Arbitrary Tags can be set programmatically in the SDKs, which we need to support (Sentry API Compatibility).
* Some "synthetic" Tags are introduced by Bugsink itself: attributes of an Event are deduced and stored explicitly as a
Tag. The main reason to do this: stay flexible in terms of DB design and allow for generic code for searching and
counting. _However_, we don't make a commitment to any particular implementation, and if the deduce-and-store approach
turns out to be a performance bottleneck, it may be replaced. Particular notes on what we deduce are in `deduce_tags`.
counting (especially in the light of Issues, where a single tag can have many values). _However_, we don't make a
commitment to any particular implementation, and if the deduce-and-store approach turns out to be a performance
bottleneck, it may be replaced. Particular notes on what we deduce are in `deduce_tags`.
https://docs.sentry.io/platforms/python/enriching-events/tags/
@@ -21,13 +22,18 @@ from django.db import models
from django.db.models import Q, F
from projects.models import Project
from tags.utils import deduce_tags
from tags.utils import deduce_tags, is_mostly_unique
class TagKey(models.Model):
project = models.ForeignKey(Project, blank=False, null=True, on_delete=models.SET_NULL) # SET_NULL: cleanup 'later'
key = models.CharField(max_length=32, blank=False, null=False)
# Tags that are "mostly unique" are not displayed in the issue tag counts, because the distribution of values is
# too flat to provide useful information. Another way of thinking about this is "this is a tag for searching, but
# not for counting".
mostly_unique = models.BooleanField(default=False)
# I briefly considered being explicit about is_deduced; but it's annoying to store this info on the TagKey, and it's
# probably redundant if we just come up with a list of "reserved" tags or similar.
# is_deduced = models.BooleanField(default=False)
@@ -136,7 +142,8 @@ def store_tags(event, issue, tags):
# # why this is only worth it for very small numbers of tags (1 in the current setup).
#
# for key, value in tags.items():
# tag_key, _ = TagKey.objects.get_or_create(project_id=event.project_id, key=key)
# tag_key, _ = TagKey.objects.get_or_create(
# project_id=event.project_id, key=key, mostly_unique=is_mostly_unique(key))
# tag_value, _ = TagValue.objects.get_or_create(project_id=event.project_id, key=tag_key, value=value)
# EventTag.objects.get_or_create(project_id=event.project_id, value=tag_value, event=event)
# IssueTag.objects.get_or_create(project_id=event.project_id, value=tag_value, issue=issue)
@@ -144,8 +151,11 @@ def store_tags(event, issue, tags):
# # the 0-case is implied here too, which avoids some further guards in the code below
# return
# there is some principled point here that there is always a single value of mostly_unique per key, but this point
# is not formalized in our database schema; it "just happens to work correctly" (at least as long as we don't change
# the list of mostly unique keys, at which point we'll have to do a data migration).
TagKey.objects.bulk_create([
TagKey(project_id=event.project_id, key=key) for key in tags.keys()
TagKey(project_id=event.project_id, key=key, mostly_unique=is_mostly_unique(key)) for key in tags.keys()
], ignore_conflicts=True)
# Select-back what we just created (or was already there); this is needed because "Enabling the ignore_conflicts or

View File

@@ -79,3 +79,16 @@ def deduce_tags(event_data):
# mechanism
return tags
def is_mostly_unique(key):
    """Return whether the tag `key` is classified as "mostly unique".

    A key is mostly unique when it starts with "user" or "trace", or when it is exactly "browser" or
    "browser.version". Every other key returns False.
    """
    # str.startswith accepts a tuple of prefixes, so a single call covers both prefix families.
    return key.startswith(("user", "trace")) or key in ("browser.version", "browser")