filename-guessing: implement (in an efficient manner)

also deals with the case of not being able to find a good matcher (i.e. the result is None)
This commit is contained in:
Klaas van Schelven
2024-09-17 13:04:00 +02:00
parent d9e611ee9e
commit fd278a8a34
2 changed files with 82 additions and 9 deletions

View File

@@ -0,0 +1,74 @@
from pygments.lexers import _iter_lexerclasses, _fn_matches
from os.path import basename
# Module-level cache; stays None until get_all_lexers() is called for the first time.
_all_lexers = None


def get_all_lexers():
    """
    Return the module-level MRUList holding everything yielded by pygments' _iter_lexerclasses().

    The list is built on the first call and the very same object is returned on every later call, so that the
    "most recently matched" ordering it maintains is preserved across calls.
    """
    global _all_lexers

    if _all_lexers is not None:
        return _all_lexers

    _all_lexers = MRUList(_iter_lexerclasses())
    return _all_lexers
class MRUList:
    """
    A list kept in "most recently used" order (whether the literature calls this an MRUList is unknown to the author).

    "Used" here means: the caller told us, through the test passed to .get(), that this is the item they were looking
    for. Such an item is moved to the end of the underlying list, and .get() searches from the end backwards, so
    recently matched items are found first.
    """

    def __init__(self, iterable):
        # materialize the iterable; the last element is the first one to be tried by .get()
        self._list = list(iterable)

    def get(self, test):
        """
        Return the most recently used item for which test(item) is truthy, and mark it as most recently used.

        test: a function that takes one argument and returns a boolean; it represents 'I was looking for this'.

        Raises ValueError when no item in the list satisfies the test.
        """
        entries = self._list

        # Walk the indices from last to first: the "recent" end is the tail of the list, because .pop() and
        # .append() are O(1) there. A descending range() is lazy, so setting it up is cheap.
        for index in range(len(entries) - 1, -1, -1):
            if not test(entries[index]):
                continue

            # move the match to the tail, so that it is the first candidate next time.
            hit = entries.pop(index)
            entries.append(hit)
            return hit

        raise ValueError("No item in the list matched the test")
def guess_lexer_for_filename(_fn, **options):
    """
    Similar to pygments' guess_lexer_for_filename, but:

    * we iterate over the lexers in order of "most recently matched".
    * we return only a single result based on filename.
    * we don't have the "code" argument.

    _fn: the filename (possibly including a path; only its basename is matched against the lexers' patterns).
    options: passed as keyword arguments to the matching lexer class when instantiating it.

    Returns an instance of the first matching lexer class; we return None if no lexer matches the filename.
    Exceptions raised while instantiating the matched lexer are not swallowed.

    This significantly speeds up the guessing process: when using 'vanilla' pygments, the guessing takes approximately
    5ms (note that on stacktraces there may easily be 20 frames, so this goes times 20 i.e. in the 100ms range). We can
    do it in ~.01ms. This is unsurprising, because pygments always does ~500 tests (regex calls), and we only do a few
    for the most common programming languages (fractionally above 1 on average, because you'll have only a handful in
    practice, and that handful will typically not alternate much in a given stacktrace).

    (initialization, i.e. setting the caches, takes ~.2s in both approaches)
    """
    fn = basename(_fn)

    def test(lexer):
        # a lexer is "the one we were looking for" if any of its (alias) filename patterns matches the basename.
        return (
            any(_fn_matches(fn, pattern) for pattern in lexer.filenames) or
            any(_fn_matches(fn, pattern) for pattern in lexer.alias_filenames))

    lexers = get_all_lexers()

    # Only the lookup is guarded: the ValueError we translate into None is the "nothing matched" signal of .get().
    # Instantiating the lexer happens outside the try, so that a ValueError raised by a lexer's constructor is not
    # mistaken for "no lexer matches the filename".
    try:
        lexer_class = lexers.get(test)
    except ValueError:
        return None

    return lexer_class(**options)

View File

@@ -3,7 +3,7 @@ from django import template
from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.formatters import HtmlFormatter
# from pygments.lexers import guess_lexer_for_filename
from bugsink.pygments_extensions import guess_lexer_for_filename
from django.utils.safestring import mark_safe
@@ -26,14 +26,13 @@ def _core_pygments(code, filename=None):
# PythonLexer(stripnl=False) does not actually work; we work around it by inserting a space in the empty lines
# before calling this function.
# TODO guessing, as implemented here, takes the majority of time to render the page. For now I'm just turning it
# off, if (when) we want to turn this back on we could either [1] implement something more performant (preferred
# option), perhaps by giving a greater role to the filename or [2] cache the result of the guessing (or even of the
# whole of pygemtize() (but "caching is hard").
# then, once we actually implement guessing, the PythonLexer should probably not be the fallback (instead: guessing
# without filename)
# lexer = guess_lexer_for_filename(filename, code) if filename else PythonLexer()
lexer = PythonLexer()
# TODO the PythonLexer should not be the fallback (instead: guessing without filename)
if filename:
lexer = guess_lexer_for_filename(filename)
if lexer is None:
lexer = PythonLexer()
else:
lexer = PythonLexer()
result = highlight(code, lexer, HtmlFormatter(nowrap=True))