filename-guessing: implement (in an efficient manner)

also deals with the case of not being able to find a good matcher (i.e. the result is None)
2026-03-10 08:01:17 +00:00 · 2024-09-17 13:04:00 +02:00
parent d9e611ee9e
commit fd278a8a34
2 changed files with 82 additions and 9 deletions
--- a/bugsink/pygments_extensions.py
+++ b/bugsink/pygments_extensions.py
@@ -0,0 +1,74 @@
+from pygments.lexers import _iter_lexerclasses, _fn_matches
+from os.path import basename
+
+
+_all_lexers = None
+
+
+def get_all_lexers():
+    global _all_lexers
+    if _all_lexers is None:
+        _all_lexers = MRUList(_iter_lexerclasses())
+    return _all_lexers
+
+
+class MRUList(object):
+    """
+    Is this called a MRUList in the literature? I don't know. I'm calling it that because it is a list, ordered by the
+    most recently "used", where "used" is defined to be "the caller said that this is the thing they were looking for".
+    """
+
+    def __init__(self, iterable):
+        self._list = list(iterable)
+
+    def get(self, test):
+        """test: a function that takes one argument and returns a boolean. it represents 'I was looking for this'."""
+
+        # we iterate in reversed order because .pop() and .append() are O(1) at the end of the list.
+        # reversed(range()) is "not expensive": empirically: next(reversed(range(10**99)))
+
+        for i in reversed(range(len(self._list))):
+            if test(self._list[i]):
+                result = self._list.pop(i)
+                self._list.append(result)
+                return result
+
+        raise ValueError("No item in the list matched the test")
+
+
+def guess_lexer_for_filename(_fn, **options):
+    """
+    Similar to pygments' guess_lexer_for_filename, but:
+
+    * we iterate over the lexers in order of "most recently matched".
+    * we return only a single result based on filename.
+    * we don't have the "code" argument.
+
+    We return None if no lexer matches the filename.
+
+    This significantly speeds up the guessing process: when using 'vanilla' pygments, the guessing takes approximately
+    5ms (note that on stacktraces there may easily be 20 frames, so this is multiplied by 20, i.e. in the 100ms range).
+    We can do it in ~.01ms. This is unsurprising, because pygments always does ~500 tests (regex calls), and we only do
+    a few for the most common programming languages (fractionally above 1 on average, because you'll have only a
+    handful in practice, and that handful will typically not alternate much in a given stacktrace).
+
+    (initialization, i.e. setting the caches, takes ~.2s in both approaches)
+    """
+
+    fn = basename(_fn)
+
+    def test(lexer):
+        for filename in lexer.filenames:
+            if _fn_matches(fn, filename):
+                return True
+
+        for filename in lexer.alias_filenames:
+            if _fn_matches(fn, filename):
+                return True
+
+        return False
+
+    try:
+        return get_all_lexers().get(test)(**options)
+    except ValueError:
+        return None
--- a/theme/templatetags/issues.py
+++ b/theme/templatetags/issues.py
@@ -3,7 +3,7 @@ from django import template
 from pygments import highlight
 from pygments.lexers import PythonLexer
 from pygments.formatters import HtmlFormatter
-# from pygments.lexers import guess_lexer_for_filename
+from bugsink.pygments_extensions import guess_lexer_for_filename

 from django.utils.safestring import mark_safe

@@ -26,14 +26,13 @@ def _core_pygments(code, filename=None):
    # PythonLexer(stripnl=False) does not actually work; we work around it by inserting a space in the empty lines
    # before calling this function.

-    # TODO guessing, as implemented here, takes the majority of time to render the page. For now I'm just turning it
-    # off, if (when) we want to turn this back on we could either [1] implement something more performant (preferred
-    # option), perhaps by giving a greater role to the filename or [2] cache the result of the guessing (or even of the
-    # whole of pygemtize() (but "caching is hard").
-    # then, once we actually implement guessing, the PythonLexer should probably not be the fallback (instead: guessing
-    # without filename)
-    # lexer = guess_lexer_for_filename(filename, code) if filename else PythonLexer()
-    lexer = PythonLexer()
+    # TODO the PythonLexer should probably not be the fallback (instead: guessing without filename)
+    if filename:
+        lexer = guess_lexer_for_filename(filename)
+        if lexer is None:
+            lexer = PythonLexer()
+    else:
+        lexer = PythonLexer()

    result = highlight(code, lexer, HtmlFormatter(nowrap=True))