From 5810c1c5800668f1fc0022014c4199a036ad286f Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Mon, 16 Dec 2024 21:32:09 +0100 Subject: [PATCH] choose_lexer_for_pattern: choose DjangoLexer explicitly when platform is Python Other than fixing syntax highlighting for my favorite language, this has the advantage of actually being a _deterministic_ solution (the previous solution would do first-match) which makes the extra complexity worth-while. --- bugsink/pygments_extensions.py | 127 +++++++++++++++++++++++++++------ theme/templatetags/issues.py | 2 +- theme/tests.py | 9 +++ 3 files changed, 115 insertions(+), 23 deletions(-) diff --git a/bugsink/pygments_extensions.py b/bugsink/pygments_extensions.py index 0a01007..bc94bd0 100644 --- a/bugsink/pygments_extensions.py +++ b/bugsink/pygments_extensions.py @@ -1,10 +1,22 @@ +""" +Pygments has the following limitations: + +1. Bad guessing mechanisms (especially for snippets) +2. Generally bad support for snippets. + +Still, I think it's what we have in the Python world, so I'm just extending around the limitations; (mostly [1]) + +""" from collections import defaultdict -from pygments.lexers import _iter_lexerclasses, _fn_matches +from pygments.lexers import _iter_lexerclasses, _fn_matches, HtmlLexer, HtmlDjangoLexer +from pygments.lexer import DelegatingLexer + from os.path import basename from pygments.lexers import ( ActionScript3Lexer, CLexer, ColdfusionHtmlLexer, CSharpLexer, HaskellLexer, GoLexer, GroovyLexer, JavaLexer, - JavascriptLexer, ObjectiveCLexer, PerlLexer, PhpLexer, PythonLexer, RubyLexer, TextLexer, XmlPhpLexer) + JavascriptLexer, ObjectiveCLexer, PerlLexer, PhpLexer, PythonLexer, RubyLexer, TextLexer, XmlPhpLexer, +) _all_lexers = None @@ -20,10 +32,10 @@ def get_all_lexers(): if _all_lexers is None: d = defaultdict(list) for lexer in _iter_lexerclasses(): - for filename in lexer.filenames: - d[filename].append(lexer) - for filename in lexer.alias_filenames: - d[filename].append(lexer) + for 
pattern in lexer.filenames: + d[pattern].append(lexer) + for pattern in lexer.alias_filenames: + d[pattern].append(lexer) _all_lexers = MRUList(d.items()) return _all_lexers @@ -53,7 +65,7 @@ class MRUList(object): raise ValueError("No item in the list matched the test") -def guess_lexer_for_filename(_fn, code=None, **options): +def guess_lexer_for_filename(_fn, platform, code=None, **options): """ Similar to pygments' guess_lexer_for_filename, but: @@ -72,30 +84,20 @@ def guess_lexer_for_filename(_fn, code=None, **options): (initialization, i.e. setting the caches, takes ~.2s in both approaches) """ - fn = basename(_fn) + filename = basename(_fn) def test(tup): pattern, classes = tup - return _fn_matches(fn, pattern) + return _fn_matches(filename, pattern) try: pattern, classes = get_all_lexers().get(test) except ValueError: return None - def get_rating(cls): - # from pygmets/lexers/__init__'s guess_lexer_for_filename; but adapted for "cls only"; and `code` is required - return cls.analyse_text(code) - - if len(classes) == 1: - # optimization: if there's only one, we don't need to sort. note: guarding for 0-len is not necessary, because - # of how we construct the lookup table (we always have at least one lexer for each pattern) - clz = classes[0] - else: - classes = sorted(classes, key=get_rating) - - clz = classes[-1] - print([(get_rating(a), a) for a in classes]) + clz = choose_lexer_for_pattern(pattern, classes, filename, code, platform) + if clz is None: + return None options = _custom_options(clz, options) return clz(**options) @@ -134,3 +136,84 @@ def lexer_for_platform(platform, **options): }[platform] options = _custom_options(clz, options) return clz(**options) + + +def choose_lexer_for_pattern(pattern, classes, filename, code, platform): + """ + This function chooses a lexer from the list of Lexers that Pygments associates with a given filename pattern. 
+ + Pygments' Lexer.analyse_text is basically useless for us: generally poor quality, and depending on full-file code + samples (we have 11-line snippets only). We choose to ignore that function and any pygments function that makes use + of it, replacing it with the current function. The upside is: this gives us a lot of control. Also upside: once + we're at this point we typically have to pick from a handful (max observed: 9) of lexers. + + How we pick: + + * If Pygments associates exactly one Lexer, we simply take Pygments' suggestion. (720 patterns) + * For ~70 patterns, there is not a single Lexer associated. This is where this function's heuristics come into play. + + We take a quite conservative approach in disambiguating: the fallback if we don't manage to pick is to just use + "platform", which will usually be quite good. + + The idea here is that a Stacktrace (which is what we're dealing with here) is typically a single-language + artifact, and the expectation is that SDKs encode which language that is in the "platform" attr. The cases of + mixed-language stacktraces are actually the rarity, presumably related to transpiling/templating and such. + Empirically: the only _real_ case I've observed in the wild where the current function breaks a tie is: picking + between Html template lexers (in particular: picking Django). (the fact that Bugsink is also Django skews this) + + In any case, I'd rather add cases to this function as needed ("complaint-driven development") than have some big + ball of potentially very fragile tie-in with pygments' logic (note: Pygments has some 500 lexers, the vast majority + of which will not ever show up, simply because no SDK exists for it). In short: the breakage of missing cases is + expected to be much easier to reason about than that of "too much magic". + + (counts of cases are from Pygments==2.16.1, end-of-2024) + """ + if len(classes) == 1: + # the non-heuristic case (nothing to choose) doubles as an optimization. 
(0-len check not needed because the + # lookup table is constructed by looping over the existing Lexers) + return classes[0] + + # heuristics below + + if pattern in ["*.html", "*.htm", "*.xhtml", "*.xslt", "*.html"]: # (deduced from 'if HtmlLexer in classes') + if platform == "python": + return HtmlDjangoLexer # which is actually the Django/Jinja lexer (which makes it an even better guess) + + # alternative solution: look at the code, deduce from there. but I reasoned: the only reason html code appears + # on the stacktrace is if it's a template. Your SDK must do some explicit work to get it there. And the only + # such case I know is the one of the DjangoIntegration. + # if re.search(r'\{%.*%\}', code) is not None or re.search(r'\{\{.*\}\}', code) is not None: + + # The other HtmlLexer-like ones are: EvoqueHtmlLexer, HtmlGenshiLexer, HtmlPhpLexer, HtmlSmartyLexer, + # LassoHtmlLexer, RhtmlLexer, VelocityHtmlLexer + return HtmlLexer + + return None + + +def get_most_basic_if_exists(classes): + # UNUSED; kept for reference. + # + # The reasoning here was: maybe we can just pick the "simplest" of a list of Lexers, which is somehow the base-case. 
+ # But the following are deductions I'm unwilling to make (among probably others): + # + # *.cp => ComponentPascalLexer' from [(ComponentPascalLexer', 4), (CppLexer', 5)] + # *.incl => LassoLexer' from [(LassoLexer', 4), (LassoHtmlLexer', 14), (LassoXmlLexer', 14)] + # *.xsl => XmlLexer' from [(XmlLexer', 4), (XsltLexer', 5)] + # + # In the end, unused b/c the general "don't build complex logic on top of Pygments, instead fix based on observed + # need" + + def basicness(clz): + return len(clz.mro()) + (10 if DelegatingLexer in clz.mro() else 0) # 10 is 'arbitrary high' for non-basic + + classes = sorted(classes, key=basicness) + x_classes = sorted([(basicness(clz), clz) for clz in classes], key=lambda tup: tup[0]) + + best_basicness = x_classes[0][0] + best_ones = [clz for basicness, clz in x_classes if basicness == best_basicness] + + if len(best_ones) > 1: + return None + + return classes[0] diff --git a/theme/templatetags/issues.py b/theme/templatetags/issues.py index 9b3ca31..1f7d3b9 100644 --- a/theme/templatetags/issues.py +++ b/theme/templatetags/issues.py @@ -35,7 +35,7 @@ def _core_pygments(code, filename=None, platform=None): # code is. 
if filename: - lexer = guess_lexer_for_filename(filename, code=code) + lexer = guess_lexer_for_filename(filename, platform, code=code) if lexer is None: lexer = lexer_for_platform(platform) else: diff --git a/theme/tests.py b/theme/tests.py index b0dd554..2d444eb 100644 --- a/theme/tests.py +++ b/theme/tests.py @@ -1,5 +1,6 @@ from unittest import TestCase as RegularTestCase +from bugsink.pygments_extensions import choose_lexer_for_pattern, get_all_lexers from .templatetags.issues import _pygmentize_lines as actual_pygmentize_lines @@ -59,3 +60,11 @@ class TestPygmentizeLineLineCountHandling(RegularTestCase): def test_pygmentize_lines_newline_on_otherwise_empty_line(self): _pygmentize_lines(["\n", "\n", "\n"]) + + +class TestChooseLexerForPatter(RegularTestCase): + def test_choose_lexer_for_pattern(self): + # simple 'does it not crash' test: + + for pattern, lexers in get_all_lexers()._list: + choose_lexer_for_pattern(pattern, lexers, "", "", "python")