From 5810c1c5800668f1fc0022014c4199a036ad286f Mon Sep 17 00:00:00 2001 From: Klaas van Schelven Date: Mon, 16 Dec 2024 21:32:09 +0100 Subject: [PATCH] choose_lexer_for_pattern: choose DjangoLexer explicitly when platform is Python Other than fixing syntax highlighting for my favorite language, this has the advantage of actually being a _deterministic_ solution (the previous solution would do first-match) which makes the extra complexity worth-while. --- bugsink/pygments_extensions.py | 127 +++++++++++++++++++++++++++------ theme/templatetags/issues.py | 2 +- theme/tests.py | 9 +++ 3 files changed, 115 insertions(+), 23 deletions(-) diff --git a/bugsink/pygments_extensions.py b/bugsink/pygments_extensions.py index 0a01007..bc94bd0 100644 --- a/bugsink/pygments_extensions.py +++ b/bugsink/pygments_extensions.py @@ -1,10 +1,22 @@ +""" +Pygments has the following limitations: + +1. Bad guessing mechanisms (especially for snippets) +2. Generally bad support for snippets. + +Still, I think it's what we have in the Python world, so I'm just extending around the limitations; (mostly [1]) + +""" from collections import defaultdict -from pygments.lexers import _iter_lexerclasses, _fn_matches +from pygments.lexers import _iter_lexerclasses, _fn_matches, HtmlLexer, HtmlDjangoLexer +from pygments.lexer import DelegatingLexer + from os.path import basename from pygments.lexers import ( ActionScript3Lexer, CLexer, ColdfusionHtmlLexer, CSharpLexer, HaskellLexer, GoLexer, GroovyLexer, JavaLexer, - JavascriptLexer, ObjectiveCLexer, PerlLexer, PhpLexer, PythonLexer, RubyLexer, TextLexer, XmlPhpLexer) + JavascriptLexer, ObjectiveCLexer, PerlLexer, PhpLexer, PythonLexer, RubyLexer, TextLexer, XmlPhpLexer, +) _all_lexers = None @@ -20,10 +32,10 @@ def get_all_lexers(): if _all_lexers is None: d = defaultdict(list) for lexer in _iter_lexerclasses(): - for filename in lexer.filenames: - d[filename].append(lexer) - for filename in lexer.alias_filenames: - d[filename].append(lexer) + for 
pattern in lexer.filenames: + d[pattern].append(lexer) + for pattern in lexer.alias_filenames: + d[pattern].append(lexer) _all_lexers = MRUList(d.items()) return _all_lexers @@ -53,7 +65,7 @@ class MRUList(object): raise ValueError("No item in the list matched the test") -def guess_lexer_for_filename(_fn, code=None, **options): +def guess_lexer_for_filename(_fn, platform, code=None, **options): """ Similar to pygments' guess_lexer_for_filename, but: @@ -72,30 +84,20 @@ def guess_lexer_for_filename(_fn, code=None, **options): (initialization, i.e. setting the caches, takes ~.2s in both approaches) """ - fn = basename(_fn) + filename = basename(_fn) def test(tup): pattern, classes = tup - return _fn_matches(fn, pattern) + return _fn_matches(filename, pattern) try: pattern, classes = get_all_lexers().get(test) except ValueError: return None - def get_rating(cls): - # from pygmets/lexers/__init__'s guess_lexer_for_filename; but adapted for "cls only"; and `code` is required - return cls.analyse_text(code) - - if len(classes) == 1: - # optimization: if there's only one, we don't need to sort. note: guarding for 0-len is not necessary, because - # of how we construct the lookup table (we always have at least one lexer for each pattern) - clz = classes[0] - else: - classes = sorted(classes, key=get_rating) - - clz = classes[-1] - print([(get_rating(a), a) for a in classes]) + clz = choose_lexer_for_pattern(pattern, classes, filename, code, platform) + if clz is None: + return None options = _custom_options(clz, options) return clz(**options) @@ -134,3 +136,84 @@ def lexer_for_platform(platform, **options): }[platform] options = _custom_options(clz, options) return clz(**options) + + +def choose_lexer_for_pattern(pattern, classes, filename, code, platform): + """ + This function chooses a lexer from the list of Lexers that Pygments associates with a given filename pattern. 
+ + Pygments' Lexer.analyse_text is basically useless for us: generally poor quality, and depending on full-file code + samples (we have 11-line snippets only). We choose to ignore that function and any pygments function that makes use + of it, replacing it with the current function. The upside is: this gives us a lot of control. Also upside: once + we're at this point we typically have to pick from a handful (max observed: 9) of lexers. + + How we pick: + + * If Pygments associates exactly one Lexer, we simply take Pygments' suggestion. (720 patterns) + * For ~70 patterns, there is not a single Lexer associated. This is where this function's heuristics come into play. + + We take a quite conservative approach in disambiguating: the fallback if we don't manage to pick is to just use + "platform", which will usually be quite good. + + The idea here is that a Stacktrace (which is what we're dealing with here) is typically a single-language + artifact, and the expectation is that SDKs encode which language that is in the "platform" attr. The cases of + mixed-language stacktraces are actually the rarity, presumably related to transpiling/templating and such. + Empirically: the only _real_ case I've observed in the wild where the current function breaks a tie is: picking + between Html template lexers (in particular: picking Django). (the fact that Bugsink is also Django skews this) + + In any case, I'd rather add cases to this function as needed ("complaint-driven development") than have some big + ball of potentially very fragile tie-in with pygments' logic (note: Pygments has some 500 lexers, the vast majority + of which will not ever show up, simply because no SDK exists for it). In short: the breakage of missing cases is + expected to be much easier to reason about than that of "too much magic". + + (counts of cases are from Pygments==2.16.1, end-of-2024) + """ + if len(classes) == 1: + # the non-heuristic case (nothing to choose) doubles as an optimization. 
(0-len check not needed because the + # lookup table is constructed by looping over the existing Lexers) + return classes[0] + + # heuristics below + + if pattern in ["*.html", "*.htm", "*.xhtml", "*.xslt", "*.html"]: # (deduced from 'if HtmlLexer in classes') + if platform == "python": + return HtmlDjangoLexer # which is actually the Django/Jinja lexer (which makes it an even better guess) + + # alternative solution: look at the code, deduce from there. but I reasoned: the only reason html code appears + # on the stacktrace is if it's a template. Your SDK must do some explicit work to get it there. And the only + # such case I know is the one of the DjangoIntegration. + # if re.search(r'\{%.*%\}', code) is not None or re.search(r'\{\{.*\}\}', code) is not None: + + # The other HtmlLexer-like ones are: EvoqueHtmlLexer, HtmlGenshiLexer, HtmlPhpLexer, HtmlSmartyLexer, + # LassoHtmlLexer, RhtmlLexer, VelocityHtmlLexer + return HtmlLexer + + return None + + +def get_most_basic_if_exists(classes): + # UNUSED; kept for reference. + # + # The reasoning here was: maybe we can just pick the "simplest" of a list of Lexers, which is somehow the base-case. 
+ # But the following are deductions I'm unwilling to make (among probably others): + # + # *.cp => ComponentPascalLexer' from [(ComponentPascalLexer', 4), (CppLexer', 5)] + # *.incl => LassoLexer' from [(LassoLexer', 4), (LassoHtmlLexer', 14), (LassoXmlLexer', 14)] + # *.xsl => XmlLexer' from [(XmlLexer', 4), (XsltLexer', 5)] + # + # In the end, unused b/c the general "don't build complex logic on top of Pygments, instead fix based on observed + # need" + + def basicness(clz): + return len(clz.mro()) + (10 if DelegatingLexer in clz.mro() else 0) # 10 is 'arbitrary high' for non-basic + + classes = sorted(classes, key=basicness) + x_classes = sorted([(basicness(clz), clz) for clz in classes], key=lambda tup: tup[0]) + + best_basicness = x_classes[0][0] + best_ones = [clz for basicness, clz in x_classes if basicness == best_basicness] + + if len(best_ones) > 1: + return None + + return classes[0] diff --git a/theme/templatetags/issues.py b/theme/templatetags/issues.py index 9b3ca31..1f7d3b9 100644 --- a/theme/templatetags/issues.py +++ b/theme/templatetags/issues.py @@ -35,7 +35,7 @@ def _core_pygments(code, filename=None, platform=None): # code is. 
if filename: - lexer = guess_lexer_for_filename(filename, code=code) + lexer = guess_lexer_for_filename(filename, platform, code=code) if lexer is None: lexer = lexer_for_platform(platform) else: diff --git a/theme/tests.py b/theme/tests.py index b0dd554..2d444eb 100644 --- a/theme/tests.py +++ b/theme/tests.py @@ -1,5 +1,6 @@ from unittest import TestCase as RegularTestCase +from bugsink.pygments_extensions import choose_lexer_for_pattern, get_all_lexers from .templatetags.issues import _pygmentize_lines as actual_pygmentize_lines @@ -59,3 +60,11 @@ class TestPygmentizeLineLineCountHandling(RegularTestCase): def test_pygmentize_lines_newline_on_otherwise_empty_line(self): _pygmentize_lines(["\n", "\n", "\n"]) + + +class TestChooseLexerForPatter(RegularTestCase): + def test_choose_lexer_for_pattern(self): + # simple 'does it not crash' test: + + for pattern, lexers in get_all_lexers()._list: + choose_lexer_for_pattern(pattern, lexers, "", "", "python")