mirror of
https://github.com/jlengrand/bugsink.git
synced 2026-03-10 08:01:17 +00:00
filename-guessing: implement (in an efficient manner)
also deals with the case of not being able to find a good matcher (i.e. the result is None)
This commit is contained in:
74
bugsink/pygments_extensions.py
Normal file
74
bugsink/pygments_extensions.py
Normal file
@@ -0,0 +1,74 @@
|
||||
from pygments.lexers import _iter_lexerclasses, _fn_matches
|
||||
from os.path import basename
|
||||
|
||||
|
||||
# Module-level cache; stays None until get_all_lexers() is called for the first time.
_all_lexers = None


def get_all_lexers():
    """Return the shared MRUList of pygments lexer classes, building it on first use.

    The list is created once from pygments' `_iter_lexerclasses()` and then kept in the module-level `_all_lexers`, so
    every caller sees (and reorders) the same MRUList instance.
    """
    global _all_lexers

    if _all_lexers is not None:
        return _all_lexers

    _all_lexers = MRUList(_iter_lexerclasses())
    return _all_lexers
|
||||
|
||||
|
||||
class MRUList(object):
    """
    A list kept in "most recently used" order (whether the literature calls this an MRUList is unknown to the author).

    "Used" here means: the caller told us, via the test passed to .get(), that this item is what they were looking
    for. The most recently used item lives at the end of the underlying list.
    """

    def __init__(self, iterable):
        # materialize the iterable; items are stored least-recently-used first.
        self._list = list(iterable)

    def get(self, test):
        """Return the most recently used item for which test(item) is true, and mark it as most recently used.

        test: a function that takes one argument and returns a boolean. It represents 'I was looking for this'.

        Raises ValueError when no item in the list satisfies the test.
        """
        items = self._list

        # Walk from the back to the front: the back holds the most recently used items, and .pop() / .append() are
        # O(1) at the end of a list. The start index is computed once, before the first test() call.
        position = len(items) - 1
        while position >= 0:
            if test(items[position]):
                found = items.pop(position)
                items.append(found)  # move the hit to the "most recently used" end
                return found
            position -= 1

        raise ValueError("No item in the list matched the test")
|
||||
|
||||
|
||||
def guess_lexer_for_filename(_fn, **options):
    """
    Pick a lexer based on the filename alone; a faster variant of pygments' guess_lexer_for_filename.

    Differences with the pygments function of the same name:

    * lexers are tried in order of "most recently matched" (see MRUList).
    * only a single result is returned, and it is based on the filename only.
    * there is no "code" argument.

    Returns a lexer instance (constructed with **options), or None if no lexer matches the filename.

    Why bother: with 'vanilla' pygments the guessing takes approximately 5ms per call (a stacktrace may easily have 20
    frames, i.e. we'd be in the 100ms range). Here it takes ~.01ms. That is unsurprising: pygments always does ~500
    tests (regex calls), while we only do a few for the most common programming languages (fractionally above 1 on
    average, because in practice there is only a handful of languages, and that handful will typically not alternate
    much within a given stacktrace).

    (initialization, i.e. setting the caches, takes ~.2s in both approaches)
    """
    name = basename(_fn)

    def matches_name(lexer_cls):
        # primary filename patterns are checked first; alias patterns are only consulted when none of those match.
        return (
            any(_fn_matches(name, pattern) for pattern in lexer_cls.filenames)
            or any(_fn_matches(name, pattern) for pattern in lexer_cls.alias_filenames)
        )

    try:
        lexer_cls = get_all_lexers().get(matches_name)
        return lexer_cls(**options)
    except ValueError:
        # MRUList.get() raises ValueError when nothing matched; that is the "no lexer for this filename" case.
        return None
|
||||
@@ -3,7 +3,7 @@ from django import template
|
||||
from pygments import highlight
|
||||
from pygments.lexers import PythonLexer
|
||||
from pygments.formatters import HtmlFormatter
|
||||
# from pygments.lexers import guess_lexer_for_filename
|
||||
from bugsink.pygments_extensions import guess_lexer_for_filename
|
||||
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
@@ -26,14 +26,13 @@ def _core_pygments(code, filename=None):
|
||||
# PythonLexer(stripnl=False) does not actually work; we work around it by inserting a space in the empty lines
|
||||
# before calling this function.
|
||||
|
||||
# TODO guessing, as implemented here, takes the majority of time to render the page. For now I'm just turning it
|
||||
# off, if (when) we want to turn this back on we could either [1] implement something more performant (preferred
|
||||
# option), perhaps by giving a greater role to the filename or [2] cache the result of the guessing (or even of the
|
||||
# whole of pygmentize()) (but "caching is hard").
|
||||
# then, once we actually implement guessing, the PythonLexer should probably not be the fallback (instead: guessing
|
||||
# without filename)
|
||||
# lexer = guess_lexer_for_filename(filename, code) if filename else PythonLexer()
|
||||
lexer = PythonLexer()
|
||||
# TODO the PythonLexer should not be the fallback (instead: guessing without filename)
|
||||
if filename:
|
||||
lexer = guess_lexer_for_filename(filename)
|
||||
if lexer is None:
|
||||
lexer = PythonLexer()
|
||||
else:
|
||||
lexer = PythonLexer()
|
||||
|
||||
result = highlight(code, lexer, HtmlFormatter(nowrap=True))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user