From fd278a8a3462d329e5607ef1b72c2560da95ec38 Mon Sep 17 00:00:00 2001
From: Klaas van Schelven
Date: Tue, 17 Sep 2024 13:04:00 +0200
Subject: [PATCH] filename-guessing: implement (in an efficient manner)

also deals with the case of not being able to find a good matcher (i.e result is None)
---
 bugsink/pygments_extensions.py | 85 ++++++++++++++++++++++++++++++++++
 theme/templatetags/issues.py   | 17 ++++----
 2 files changed, 93 insertions(+), 9 deletions(-)
 create mode 100644 bugsink/pygments_extensions.py

diff --git a/bugsink/pygments_extensions.py b/bugsink/pygments_extensions.py
new file mode 100644
index 0000000..fd6e3c0
--- /dev/null
+++ b/bugsink/pygments_extensions.py
@@ -0,0 +1,85 @@
+from pygments.lexers import _iter_lexerclasses, _fn_matches
+from os.path import basename
+
+
+_all_lexers = None
+
+
+def get_all_lexers():
+    """Return the module-level MRUList of pygments lexer classes, building it on first use."""
+    global _all_lexers
+    if _all_lexers is None:
+        _all_lexers = MRUList(_iter_lexerclasses())
+    return _all_lexers
+
+
+class MRUList(object):
+    """
+    Is this called a MRUList in the literature? I don't know. I'm calling it that because it is a list, ordered by the
+    most recently "used", where "used" is defined to be "the caller said that this is the thing they were looking for".
+    """
+
+    def __init__(self, iterable):
+        """iterable: the items to manage; its order is the initial order (the last item is tried first)."""
+        self._list = list(iterable)
+
+    def get(self, test):
+        """
+        test: a function that takes one argument and returns a boolean. it represents 'I was looking for this'.
+
+        Returns the first item (in most-recently-matched-first order) for which test() is true, and marks it as the
+        most recently matched one. Raises ValueError if no item matches.
+        """
+
+        # we iterate in reversed order because .pop() and .append() are O(1) at the end of the list.
+        # reversed(range()) is "not expensive": empirically: next(reversed(range(10**99)))
+
+        for i in reversed(range(len(self._list))):
+            if test(self._list[i]):
+                result = self._list.pop(i)
+                self._list.append(result)
+                return result
+
+        raise ValueError("No item in the list matched the test")
+
+
+def guess_lexer_for_filename(_fn, **options):
+    """
+    Similar to pygments' guess_lexer_for_filename, but:
+
+    * we iterate over the lexers in order of "most recently matched".
+    * we return only a single result based on filename.
+    * we don't have the "code" argument.
+
+    We return None if no lexer matches the filename.
+
+    This significantly speeds up the guessing process: when using 'vanilla' pygments, the guessing takes approximately
+    5ms (note that on stacktraces there may easily be 20 frames, so this goes times 20 i.e. in the 100ms range). We can
+    do it in ~.01ms. this is unsurprising, because pygments always does ~500 tests (regex calls), and we only do a few
+    for the most common programming languages (fractionally above 1 on average, because you'll have only a handful in
+    practice, and that handful will typically not alternate much in a given stacktrace).
+
+    (initialization, i.e. setting the caches, takes ~.2s in both approaches)
+    """
+
+    fn = basename(_fn)
+
+    def test(lexer):
+        for filename in lexer.filenames:
+            if _fn_matches(fn, filename):
+                return True
+
+        for filename in lexer.alias_filenames:
+            if _fn_matches(fn, filename):
+                return True
+
+        return False
+
+    try:
+        lexer_class = get_all_lexers().get(test)
+    except ValueError:
+        return None
+
+    # instantiate outside the try, so that a ValueError raised by the lexer's own constructor is not mistaken for
+    # "no lexer matches the filename".
+    return lexer_class(**options)
diff --git a/theme/templatetags/issues.py b/theme/templatetags/issues.py
index 990c6e2..43e227c 100644
--- a/theme/templatetags/issues.py
+++ b/theme/templatetags/issues.py
@@ -3,7 +3,7 @@ from django import template
 from pygments import highlight
 from pygments.lexers import PythonLexer
 from pygments.formatters import HtmlFormatter
-# from pygments.lexers import guess_lexer_for_filename
+from bugsink.pygments_extensions import guess_lexer_for_filename
 
 from django.utils.safestring import mark_safe
 
@@ -26,14 +26,13 @@ def _core_pygments(code, filename=None):
     # PythonLexer(stripnl=False) does not actually work; we work around it by inserting a space in the empty lines
     # before calling this function.
 
-    # TODO guessing, as implemented here, takes the majority of time to render the page. For now I'm just turning it
-    # off, if (when) we want to turn this back on we could either [1] implement something more performant (preferred
-    # option), perhaps by giving a greater role to the filename or [2] cache the result of the guessing (or even of the
-    # whole of pygemtize() (but "caching is hard").
-    # then, once we actually implement guessing, the PythonLexer should probably not be the fallback (instead: guessing
-    # without filename)
-    # lexer = guess_lexer_for_filename(filename, code) if filename else PythonLexer()
-    lexer = PythonLexer()
+    # TODO the PythonLexer should not be the fallback (instead: guessing without filename)
+    if filename:
+        lexer = guess_lexer_for_filename(filename)
+        if lexer is None:
+            lexer = PythonLexer()
+    else:
+        lexer = PythonLexer()
 
     result = highlight(code, lexer, HtmlFormatter(nowrap=True))
 