From 2c9d5c80edda1688b10cba1db926841a81ec2135 Mon Sep 17 00:00:00 2001
From: Klaas van Schelven <klaas@vanschelven.com>
Date: Thu, 6 Mar 2025 11:23:18 +0100
Subject: [PATCH] Search: support for quoted values

also adds tests and factors out the query parsing
---
 tags/search.py | 46 +++++++++++++++++++++++++++++++++++++++-------
 tags/tests.py  | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 8 deletions(-)

diff --git a/tags/search.py b/tags/search.py
index 38876f5..143c33b 100644
--- a/tags/search.py
+++ b/tags/search.py
@@ -6,12 +6,16 @@ least it means we have all of this together in a separate file this way.
 
 import re
 from django.db.models import Q, Subquery
+from collections import namedtuple
 
 from bugsink.moreiterutils import tuplewise
 
 from .models import TagValue, IssueTag, EventTag
 
 
+ParsedQuery = namedtuple("ParsedQuery", ["tags", "plain_text"])
+
+
 def _remove_slices(s, slices_to_remove):
     """Returns s with the slices removed."""
     items = [item for tup in slices_to_remove for item in tup]
@@ -31,17 +35,45 @@ def _and_join(q_objects):
     return result
 
 
+def parse_query(q):
+    """Split the search string `q` into tags and free text.
+
+    Both key:value and key:"quoted value" are recognized as tags (quoting allows for whitespace in the
+    value); when a key occurs more than once, its last occurrence wins. What remains is the free text.
+
+    Returns ParsedQuery(tags, plain_text): a dict mapping key to value, and the stripped remainder of `q`.
+    """
+    tags = {}
+    slices_to_remove = []
+
+    # Both forms are matched in a single left-to-right pass, which guarantees that the matches do not overlap.
+    # Matching them in 2 separate passes goes wrong on e.g. msg:"a b:c": the unquoted pass then finds a
+    # spurious tag b:c inside the quotes, and its span overlaps with the span of the quoted match, which
+    # garbles the removal of the slices below.
+    for match in re.finditer(r'(\S+):(?:"([^"]+)"|([^\s"]+))', q):
+        slices_to_remove.append(match.span())
+        key, quoted_value, unquoted_value = match.groups()
+
+        # only one of the alternatives takes part in any given match; the group of the other one is None
+        tags[key] = quoted_value if quoted_value is not None else unquoted_value
+
+    # finditer yields its matches in order of appearance, i.e. sorted, as _remove_slices expects. The tags are
+    # simply cut out: free text on both sides of a tag keeps the whitespace of both sides ("on  both").
+    plain_text_q = _remove_slices(q, slices_to_remove).strip()
+
+    return ParsedQuery(tags, plain_text_q)
+
+
 def _search(TagClz, fk_fieldname, project, obj_list, q):
     if not q:
         return obj_list
 
+    parsed = parse_query(q)
+
     # The simplest possible query-language that could have any value: key:value is recognized as such; the rest is "free
     # text"; no support for quoting of spaces.
-    slices_to_remove = []
     clauses = []
-    for match in re.finditer(r"(\S+):(\S+)", q):
-        slices_to_remove.append(match.span())
-        key, value = match.groups()
+    for key, value in parsed.tags.items():
         try:
             tag_value_obj = TagValue.objects.get(project=project, key__key=key, value=value)
         except TagValue.DoesNotExist:
@@ -58,9 +90,9 @@ def _search(TagClz, fk_fieldname, project, obj_list, q):
     # this is really TSTTCPW (or more like a "fake it till you make it" thing); but I'd rather "have something" and then
     # have really-good-search than to have either nothing at all, or half-baked search. Note that we didn't even bother
     # to set indexes on the fields we search on (nor create a single searchable field for the whole of 'title').
-    plain_text_q = _remove_slices(q, slices_to_remove).strip()
-    if plain_text_q:
-        clauses.append(Q(Q(calculated_type__icontains=plain_text_q) | Q(calculated_value__icontains=plain_text_q)))
+    if parsed.plain_text:
+        clauses.append(
+            Q(Q(calculated_type__icontains=parsed.plain_text) | Q(calculated_value__icontains=parsed.plain_text)))
 
     # if we reach this point, there's always either a plain_text_q or some key/value pair (this is a condition for
     # _and_join)
diff --git a/tags/tests.py b/tags/tests.py
index 9c4c2ae..0714406 100644
--- a/tags/tests.py
+++ b/tags/tests.py
@@ -9,7 +9,7 @@ from events.models import Event
 
 from .models import store_tags
 from .utils import deduce_tags
-from .search import search_events, search_issues
+from .search import search_events, search_issues, parse_query
 
 
 class DeduceTagsTestCase(RegularTestCase):
@@ -101,6 +101,54 @@ class StoreTagsTestCase(DjangoTestCase):
         self.assertEqual(self.issue.tags.first().value.key.key, "foo")
 
 
+class SearchParserTestCase(RegularTestCase):
+    """Tests parse_query in isolation: which parts of a search string end up as tags, and which as free text."""
+    def test_parser(self):
+        # we don't actually do the below, empty queries are never parsed
+        # self.assertEqual(({}, ""), parse_query(""))
+
+        self.assertEqual(({}, "FindableException"), parse_query("FindableException"))
+        self.assertEqual(({}, "findable value"), parse_query("findable value"))
+
+        self.assertEqual(({"key": "value"}, ""), parse_query("key:value"))
+        self.assertEqual(
+            ({"key": "value", "anotherkey": "anothervalue"}, ""),
+            parse_query("key:value anotherkey:anothervalue"))
+
+        self.assertEqual(
+            ({"keys.may.have.dots": "values.may.have.dots.too"}, ""),
+            parse_query("keys.may.have.dots:values.may.have.dots.too"))
+
+        self.assertEqual(
+            ({"key": "value"}, "some text goes here"),
+            parse_query("key:value some text goes here"))
+
+        self.assertEqual(
+            ({}, "text  with  spaces  everywhere"),
+            parse_query("text  with  spaces  everywhere"))
+
+        self.assertEqual(
+            ({}, "key: preceded by space"),
+            parse_query("key: preceded by space"))
+
+        self.assertEqual(
+            ({"key": "quoted value"}, ""),
+            parse_query('key:"quoted value"'))
+
+        self.assertEqual(
+            ({"key": "quoted value"}, "and further text"),
+            parse_query('key:"quoted value" and further text'))
+
+        # This is the kind of test that just documents "what is" rather than "what I believe is right". The weirdness
+        # here is mostly the double space "on  both" which is the result of just cutting out the key:value bits. But...
+        # I'm not invested in getting this more precise (yet), because this whole case is a bit weird. I'd much rather
+        # point people in the direction of "put k:v at the beginning, and any free text at the end" (which is something
+        # we could even validate on at some later point).
+        self.assertEqual(
+            ({"key": "value"}, "text on  both sides"),
+            parse_query("text on key:value both sides"))
+
+
 class SearchTestCase(DjangoTestCase):
     """'Integration'-test; assuming Tags are stored correctly in the DB, can we search for them?"""