""" |
|
pygments.lexers.markup |
|
~~~~~~~~~~~~~~~~~~~~~~ |
|
|
|
Lexers for non-HTML markup languages. |
|
|
|
:copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS. |
|
:license: BSD, see LICENSE for details. |
|
""" |
|
|
|
import re |
|
|
|
from pygments.lexers.html import XmlLexer |
|
from pygments.lexers.javascript import JavascriptLexer |
|
from pygments.lexers.css import CssLexer |
|
from pygments.lexers.lilypond import LilyPondLexer |
|
from pygments.lexers.data import JsonLexer |
|
|
|
from pygments.lexer import RegexLexer, DelegatingLexer, include, bygroups, \ |
|
using, this, do_insertions, default, words |
|
from pygments.token import Text, Comment, Operator, Keyword, Name, String, \ |
|
Number, Punctuation, Generic, Other, Whitespace |
|
from pygments.util import get_bool_opt, ClassNotFound |
|
|
|
__all__ = ['BBCodeLexer', 'MoinWikiLexer', 'RstLexer', 'TexLexer', 'GroffLexer', |
|
'MozPreprocHashLexer', 'MozPreprocPercentLexer', |
|
'MozPreprocXulLexer', 'MozPreprocJavascriptLexer', |
|
'MozPreprocCssLexer', 'MarkdownLexer', 'TiddlyWiki5Lexer', 'WikitextLexer'] |
|
|
|
|
|
class BBCodeLexer(RegexLexer): |
|
""" |
|
A lexer that highlights BBCode(-like) syntax. |
|
|
|
.. versionadded:: 0.6 |
|
""" |
|
|
|
name = 'BBCode' |
|
aliases = ['bbcode'] |
|
mimetypes = ['text/x-bbcode'] |
|
|
|
tokens = { |
|
'root': [ |
|
(r'[^[]+', Text), |
|
# tag/end tag begin |
|
(r'\[/?\w+', Keyword, 'tag'), |
|
# stray bracket |
|
(r'\[', Text), |
|
], |
|
'tag': [ |
|
(r'\s+', Text), |
|
# attribute with value |
|
(r'(\w+)(=)("?[^\s"\]]+"?)', |
|
bygroups(Name.Attribute, Operator, String)), |
|
# tag argument (a la [color=green]) |
|
(r'(=)("?[^\s"\]]+"?)', |
|
bygroups(Operator, String)), |
|
# tag end |
|
(r'\]', Keyword, '#pop'), |
|
], |
|
} |
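
# Usage sketch (hypothetical input; only the public pygments API is used):
#
#     from pygments import highlight
#     from pygments.formatters import HtmlFormatter
#     print(highlight('[color=green]go[/color]', BBCodeLexer(), HtmlFormatter()))
#
# '[color' pushes the 'tag' state, '=green' matches the tag-argument rule,
# and ']' pops back to 'root'.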


class MoinWikiLexer(RegexLexer):
    """
    For MoinMoin (and Trac) Wiki markup.

    .. versionadded:: 0.7
    """

    name = 'MoinMoin/Trac Wiki markup'
    aliases = ['trac-wiki', 'moin']
    filenames = []
    mimetypes = ['text/x-trac-wiki']
    flags = re.MULTILINE | re.IGNORECASE

    tokens = {
        'root': [
            (r'^#.*$', Comment),
            (r'(!)(\S+)', bygroups(Keyword, Text)),  # Ignore-next
            # Titles
            (r'^(=+)([^=]+)(=+)(\s*#.+)?$',
             bygroups(Generic.Heading, using(this), Generic.Heading, String)),
            # Literal code blocks, with optional shebang
            (r'(\{\{\{)(\n#!.+)?', bygroups(Name.Builtin, Name.Namespace), 'codeblock'),
            (r'(\'\'\'?|\|\||`|__|~~|\^|,,|::)', Comment),  # Formatting
            # Lists
            (r'^( +)([.*-])( )', bygroups(Text, Name.Builtin, Text)),
            (r'^( +)([a-z]{1,5}\.)( )', bygroups(Text, Name.Builtin, Text)),
            # Other Formatting
            (r'\[\[\w+.*?\]\]', Keyword),  # Macro
            (r'(\[[^\s\]]+)(\s+[^\]]+?)?(\])',
             bygroups(Keyword, String, Keyword)),  # Link
            (r'^----+$', Keyword),  # Horizontal rules
            (r'[^\n\'\[{!_~^,|]+', Text),
            (r'\n', Text),
            (r'.', Text),
        ],
        'codeblock': [
            (r'\}\}\}', Name.Builtin, '#pop'),
            # these blocks are allowed to be nested in Trac, but not MoinMoin
            (r'\{\{\{', Text, '#push'),
            (r'[^{}]+', Comment.Preproc),  # slurp boring text
            (r'.', Comment.Preproc),  # allow loose { or }
        ],
    }
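
# A minimal sketch of what the 'codeblock' state handles (hypothetical input):
#
#     {{{
#     #!python
#     print("lexed as Comment.Preproc, not delegated to another lexer")
#     }}}
#
# Nested {{{ ... }}} pairs are tracked with '#push'/'#pop' (Trac allows the
# nesting, MoinMoin does not); the optional '#!...' shebang is tokenized as
# Name.Namespace but the block body itself is never delegated.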


class RstLexer(RegexLexer):
    """
    For reStructuredText markup.

    .. versionadded:: 0.7

    Additional options accepted:

    `handlecodeblocks`
        Highlight the contents of ``.. sourcecode:: language``,
        ``.. code:: language`` and ``.. code-block:: language``
        directives with a lexer for the given language (default:
        ``True``).

        .. versionadded:: 0.8
    """
    name = 'reStructuredText'
    url = 'https://docutils.sourceforge.io/rst.html'
    aliases = ['restructuredtext', 'rst', 'rest']
    filenames = ['*.rst', '*.rest']
    mimetypes = ["text/x-rst", "text/prs.fallenstein.rst"]
    flags = re.MULTILINE

    def _handle_sourcecode(self, match):
        from pygments.lexers import get_lexer_by_name

        # section header
        yield match.start(1), Punctuation, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator.Word, match.group(3)
        yield match.start(4), Punctuation, match.group(4)
        yield match.start(5), Text, match.group(5)
        yield match.start(6), Keyword, match.group(6)
        yield match.start(7), Text, match.group(7)

        # lookup lexer if wanted and existing
        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name(match.group(6).strip())
            except ClassNotFound:
                pass
        indention = match.group(8)
        indention_size = len(indention)
        code = (indention + match.group(9) + match.group(10) + match.group(11))

        # no lexer for this language. handle it like it was a code block
        if lexer is None:
            yield match.start(8), String, code
            return

        # highlight the lines with the lexer.
        ins = []
        codelines = code.splitlines(True)
        code = ''
        for line in codelines:
            if len(line) > indention_size:
                ins.append((len(code), [(0, Text, line[:indention_size])]))
                code += line[indention_size:]
            else:
                code += line
        yield from do_insertions(ins, lexer.get_tokens_unprocessed(code))

    # from docutils.parsers.rst.states
    closers = '\'")]}>\u2019\u201d\xbb!?'
    unicode_delimiters = '\u2010\u2011\u2012\u2013\u2014\u00a0'
    end_string_suffix = (r'((?=$)|(?=[-/:.,; \n\x00%s%s]))'
                         % (re.escape(unicode_delimiters),
                            re.escape(closers)))

    tokens = {
        'root': [
            # Heading with overline
            (r'^(=+|-+|`+|:+|\.+|\'+|"+|~+|\^+|_+|\*+|\++|#+)([ \t]*\n)'
             r'(.+)(\n)(\1)(\n)',
             bygroups(Generic.Heading, Text, Generic.Heading,
                      Text, Generic.Heading, Text)),
            # Plain heading
            (r'^(\S.*)(\n)(={3,}|-{3,}|`{3,}|:{3,}|\.{3,}|\'{3,}|"{3,}|'
             r'~{3,}|\^{3,}|_{3,}|\*{3,}|\+{3,}|#{3,})(\n)',
             bygroups(Generic.Heading, Text, Generic.Heading, Text)),
            # Bulleted lists
            (r'^(\s*)([-*+])( .+\n(?:\1 .+\n)*)',
             bygroups(Text, Number, using(this, state='inline'))),
            # Numbered lists
            (r'^(\s*)([0-9#ivxlcmIVXLCM]+\.)( .+\n(?:\1 .+\n)*)',
             bygroups(Text, Number, using(this, state='inline'))),
            (r'^(\s*)(\(?[0-9#ivxlcmIVXLCM]+\))( .+\n(?:\1 .+\n)*)',
             bygroups(Text, Number, using(this, state='inline'))),
            # Numbered, but keep words at BOL from becoming lists
            (r'^(\s*)([A-Z]+\.)( .+\n(?:\1 .+\n)+)',
             bygroups(Text, Number, using(this, state='inline'))),
            (r'^(\s*)(\(?[A-Za-z]+\))( .+\n(?:\1 .+\n)+)',
             bygroups(Text, Number, using(this, state='inline'))),
            # Line blocks
            (r'^(\s*)(\|)( .+\n(?:\| .+\n)*)',
             bygroups(Text, Operator, using(this, state='inline'))),
            # Sourcecode directives
            (r'^( *\.\.)(\s*)((?:source)?code(?:-block)?)(::)([ \t]*)([^\n]+)'
             r'(\n[ \t]*\n)([ \t]+)(.*)(\n)((?:(?:\8.*)?\n)+)',
             _handle_sourcecode),
            # A directive
            (r'^( *\.\.)(\s*)([\w:-]+?)(::)(?:([ \t]*)(.*))',
             bygroups(Punctuation, Text, Operator.Word, Punctuation, Text,
                      using(this, state='inline'))),
            # A reference target
            (r'^( *\.\.)(\s*)(_(?:[^:\\]|\\.)+:)(.*?)$',
             bygroups(Punctuation, Text, Name.Tag, using(this, state='inline'))),
            # A footnote/citation target
            (r'^( *\.\.)(\s*)(\[.+\])(.*?)$',
             bygroups(Punctuation, Text, Name.Tag, using(this, state='inline'))),
            # A substitution def
            (r'^( *\.\.)(\s*)(\|.+\|)(\s*)([\w:-]+?)(::)(?:([ \t]*)(.*))',
             bygroups(Punctuation, Text, Name.Tag, Text, Operator.Word,
                      Punctuation, Text, using(this, state='inline'))),
            # Comments
            (r'^ *\.\..*(\n( +.*\n|\n)+)?', Comment.Preproc),
            # Field list marker
            (r'^( *)(:(?:\\\\|\\:|[^:\n])+:(?=\s))([ \t]*)',
             bygroups(Text, Name.Class, Text)),
            # Definition list
            (r'^(\S.*(?<!::)\n)((?:(?: +.*)\n)+)',
             bygroups(using(this, state='inline'), using(this, state='inline'))),
            # Code blocks
            (r'(::)(\n[ \t]*\n)([ \t]+)(.*)(\n)((?:(?:\3.*)?\n)+)',
             bygroups(String.Escape, Text, String, String, Text, String)),
            include('inline'),
        ],
        'inline': [
            (r'\\.', Text),  # escape
            (r'``', String, 'literal'),  # code
            (r'(`.+?)(<.+?>)(`__?)',  # reference with inline target
             bygroups(String, String.Interpol, String)),
            (r'`.+?`__?', String),  # reference
            (r'(`.+?`)(:[a-zA-Z0-9:-]+?:)?',
             bygroups(Name.Variable, Name.Attribute)),  # role
            (r'(:[a-zA-Z0-9:-]+?:)(`.+?`)',
             bygroups(Name.Attribute, Name.Variable)),  # role (content first)
            (r'\*\*.+?\*\*', Generic.Strong),  # Strong emphasis
            (r'\*.+?\*', Generic.Emph),  # Emphasis
            (r'\[.*?\]_', String),  # Footnote or citation
            (r'<.+?>', Name.Tag),  # Hyperlink
            (r'[^\\\n\[*`:]+', Text),
            (r'.', Text),
        ],
        'literal': [
            (r'[^`]+', String),
            (r'``' + end_string_suffix, String, '#pop'),
            (r'`', String),
        ]
    }

    def __init__(self, **options):
        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
        RegexLexer.__init__(self, **options)

    def analyse_text(text):
        if text[:2] == '..' and text[2:3] != '.':
            return 0.3
        p1 = text.find("\n")
        p2 = text.find("\n", p1 + 1)
        if (p2 > -1 and                     # has two lines
                p1 * 2 + 1 == p2 and        # they are the same length
                text[p1+1] in '-=' and      # the next line both starts and ends with
                text[p1+1] == text[p2-1]):  # ...a sufficiently high header
            return 0.5
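
# Option sketch (hypothetical document; only the documented option is used).
# Note the directive regex requires a blank line after the body:
#
#     rst = ".. code-block:: python\n\n    print('hi')\n\n"
#     list(RstLexer().get_tokens(rst))                        # body delegated to the Python lexer
#     list(RstLexer(handlecodeblocks=False).get_tokens(rst))  # body is a single String token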


class TexLexer(RegexLexer):
    """
    Lexer for the TeX and LaTeX typesetting languages.
    """

    name = 'TeX'
    aliases = ['tex', 'latex']
    filenames = ['*.tex', '*.aux', '*.toc']
    mimetypes = ['text/x-tex', 'text/x-latex']

    tokens = {
        'general': [
            (r'%.*?\n', Comment),
            (r'[{}]', Name.Builtin),
            (r'[&_^]', Name.Builtin),
        ],
        'root': [
            (r'\\\[', String.Backtick, 'displaymath'),
            (r'\\\(', String, 'inlinemath'),
            (r'\$\$', String.Backtick, 'displaymath'),
            (r'\$', String, 'inlinemath'),
            (r'\\([a-zA-Z]+|.)', Keyword, 'command'),
            (r'\\$', Keyword),
            include('general'),
            (r'[^\\$%&_^{}]+', Text),
        ],
        'math': [
            (r'\\([a-zA-Z]+|.)', Name.Variable),
            include('general'),
            (r'[0-9]+', Number),
            (r'[-=!+*/()\[\]]', Operator),
            (r'[^=!+*/()\[\]\\$%&_^{}0-9-]+', Name.Builtin),
        ],
        'inlinemath': [
            (r'\\\)', String, '#pop'),
            (r'\$', String, '#pop'),
            include('math'),
        ],
        'displaymath': [
            (r'\\\]', String, '#pop'),
            (r'\$\$', String, '#pop'),
            (r'\$', Name.Builtin),
            include('math'),
        ],
        'command': [
            (r'\[.*?\]', Name.Attribute),
            (r'\*', Keyword),
            default('#pop'),
        ],
    }

    def analyse_text(text):
        for start in ("\\documentclass", "\\input", "\\documentstyle",
                      "\\relax"):
            if text[:len(start)] == start:
                return True
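
# State-flow sketch (hypothetical input): in r'\foo[bar]{baz}', '\foo' matches
# the command rule and pushes 'command', '[bar]' is lexed as Name.Attribute,
# and default('#pop') returns to 'root' before '{baz}' is tokenized via the
# 'general' rules.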


class GroffLexer(RegexLexer):
    """
    Lexer for the (g)roff typesetting language, supporting groff
    extensions. Mainly useful for highlighting manpage sources.

    .. versionadded:: 0.6
    """

    name = 'Groff'
    aliases = ['groff', 'nroff', 'man']
    filenames = ['*.[1-9]', '*.man', '*.1p', '*.3pm']
    mimetypes = ['application/x-troff', 'text/troff']

    tokens = {
        'root': [
            (r'(\.)(\w+)', bygroups(Text, Keyword), 'request'),
            (r'\.', Punctuation, 'request'),
            # Regular characters, slurp till we find a backslash or newline
            (r'[^\\\n]+', Text, 'textline'),
            default('textline'),
        ],
        'textline': [
            include('escapes'),
            (r'[^\\\n]+', Text),
            (r'\n', Text, '#pop'),
        ],
        'escapes': [
            # groff has many ways to write escapes.
            (r'\\"[^\n]*', Comment),
            (r'\\[fn]\w', String.Escape),
            (r'\\\(.{2}', String.Escape),
            (r'\\.\[.*\]', String.Escape),
            (r'\\.', String.Escape),
            (r'\\\n', Text, 'request'),
        ],
        'request': [
            (r'\n', Text, '#pop'),
            include('escapes'),
            (r'"[^\n"]+"', String.Double),
            (r'\d+', Number),
            (r'\S+', String),
            (r'\s+', Text),
        ],
    }

    def analyse_text(text):
        if text[:1] != '.':
            return False
        if text[:3] == '.\\"':
            return True
        if text[:4] == '.TH ':
            return True
        if text[1:3].isalnum() and text[3].isspace():
            return 0.9
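
# analyse_text sketch (hypothetical inputs): a leading comment line such as
# '.\" ls manual' or a title request like '.TH LS 1' both identify a manpage
# source. Inputs like '.a' would raise IndexError at text[3], but pygments
# wraps analyse_text via pygments.util.make_analysator, which catches
# exceptions and scores them 0.0.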


class MozPreprocHashLexer(RegexLexer):
    """
    Lexer for Mozilla Preprocessor files (with '#' as the marker).

    Other data is left untouched.

    .. versionadded:: 2.0
    """
    name = 'mozhashpreproc'
    aliases = [name]
    filenames = []
    mimetypes = []

    tokens = {
        'root': [
            (r'^#', Comment.Preproc, ('expr', 'exprstart')),
            (r'.+', Other),
        ],
        'exprstart': [
            (r'(literal)(.*)', bygroups(Comment.Preproc, Text), '#pop:2'),
            (words((
                'define', 'undef', 'if', 'ifdef', 'ifndef', 'else', 'elif',
                'elifdef', 'elifndef', 'endif', 'expand', 'filter', 'unfilter',
                'include', 'includesubst', 'error')),
             Comment.Preproc, '#pop'),
        ],
        'expr': [
            (words(('!', '!=', '==', '&&', '||')), Operator),
            (r'(defined)(\()', bygroups(Keyword, Punctuation)),
            (r'\)', Punctuation),
            (r'[0-9]+', Number.Decimal),
            (r'__\w+?__', Name.Variable),
            (r'@\w+?@', Name.Class),
            (r'\w+', Name),
            (r'\n', Text, '#pop'),
            (r'\s+', Text),
            (r'\S', Punctuation),
        ],
    }
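
# Directive sketch (hypothetical input):
#
#     #ifdef MOZ_FOO
#     #expand var x = __VERSION__;
#     #endif
#
# '^#' pushes ('expr', 'exprstart'); the directive keyword is consumed in
# 'exprstart' and the rest of the line is tokenized in 'expr' (e.g.
# __VERSION__ as Name.Variable), while non-directive lines fall through as
# Other for a DelegatingLexer to highlight.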


class MozPreprocPercentLexer(MozPreprocHashLexer):
    """
    Lexer for Mozilla Preprocessor files (with '%' as the marker).

    Other data is left untouched.

    .. versionadded:: 2.0
    """
    name = 'mozpercentpreproc'
    aliases = [name]
    filenames = []
    mimetypes = []

    tokens = {
        'root': [
            (r'^%', Comment.Preproc, ('expr', 'exprstart')),
            (r'.+', Other),
        ],
    }


class MozPreprocXulLexer(DelegatingLexer):
    """
    Subclass of the `MozPreprocHashLexer` that highlights unlexed data with the
    `XmlLexer`.

    .. versionadded:: 2.0
    """
    name = "XUL+mozpreproc"
    aliases = ['xul+mozpreproc']
    filenames = ['*.xul.in']
    mimetypes = []

    def __init__(self, **options):
        super().__init__(XmlLexer, MozPreprocHashLexer, **options)


class MozPreprocJavascriptLexer(DelegatingLexer):
    """
    Subclass of the `MozPreprocHashLexer` that highlights unlexed data with the
    `JavascriptLexer`.

    .. versionadded:: 2.0
    """
    name = "Javascript+mozpreproc"
    aliases = ['javascript+mozpreproc']
    filenames = ['*.js.in']
    mimetypes = []

    def __init__(self, **options):
        super().__init__(JavascriptLexer, MozPreprocHashLexer, **options)


class MozPreprocCssLexer(DelegatingLexer):
    """
    Combines the `MozPreprocPercentLexer` with the `CssLexer`: preprocessor
    directives are lexed by the former, and unlexed data is highlighted with
    the `CssLexer`.

    .. versionadded:: 2.0
    """
    name = "CSS+mozpreproc"
    aliases = ['css+mozpreproc']
    filenames = ['*.css.in']
    mimetypes = []

    def __init__(self, **options):
        super().__init__(CssLexer, MozPreprocPercentLexer, **options)


class MarkdownLexer(RegexLexer):
    """
    For Markdown markup.

    .. versionadded:: 2.2
    """
    name = 'Markdown'
    url = 'https://daringfireball.net/projects/markdown/'
    aliases = ['markdown', 'md']
    filenames = ['*.md', '*.markdown']
    mimetypes = ["text/x-markdown"]
    flags = re.MULTILINE

    def _handle_codeblock(self, match):
        from pygments.lexers import get_lexer_by_name

        yield match.start('initial'), String.Backtick, match.group('initial')
        yield match.start('lang'), String.Backtick, match.group('lang')
        if match.group('afterlang') is not None:
            yield match.start('whitespace'), Whitespace, match.group('whitespace')
            yield match.start('extra'), Text, match.group('extra')
        yield match.start('newline'), Whitespace, match.group('newline')

        # lookup lexer if wanted and existing
        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name(match.group('lang').strip())
            except ClassNotFound:
                pass
        code = match.group('code')
        # no lexer for this language. handle it like it was a code block
        if lexer is None:
            yield match.start('code'), String, code
        else:
            # FIXME: aren't the offsets wrong?
            yield from do_insertions([], lexer.get_tokens_unprocessed(code))

        yield match.start('terminator'), String.Backtick, match.group('terminator')

    tokens = {
        'root': [
            # heading with '#' prefix (atx-style)
            (r'(^#[^#].+)(\n)', bygroups(Generic.Heading, Text)),
            # subheading with '#' prefix (atx-style)
            (r'(^#{2,6}[^#].+)(\n)', bygroups(Generic.Subheading, Text)),
            # heading with '=' underlines (Setext-style)
            (r'^(.+)(\n)(=+)(\n)', bygroups(Generic.Heading, Text, Generic.Heading, Text)),
            # subheading with '-' underlines (Setext-style)
            (r'^(.+)(\n)(-+)(\n)', bygroups(Generic.Subheading, Text, Generic.Subheading, Text)),
            # task list
            (r'^(\s*)([*-] )(\[[ xX]\])( .+\n)',
             bygroups(Whitespace, Keyword, Keyword, using(this, state='inline'))),
            # bulleted list
            (r'^(\s*)([*-])(\s)(.+\n)',
             bygroups(Whitespace, Keyword, Whitespace, using(this, state='inline'))),
            # numbered list
            (r'^(\s*)([0-9]+\.)( .+\n)',
             bygroups(Whitespace, Keyword, using(this, state='inline'))),
            # quote
            (r'^(\s*>\s)(.+\n)', bygroups(Keyword, Generic.Emph)),
            # code block fenced by 3 backticks
            (r'^(\s*```\n[\w\W]*?^\s*```$\n)', String.Backtick),
            # code block with language
            # Some tools include extra stuff after the language name, just
            # highlight that as text. For example: https://docs.enola.dev/use/execmd
            (r'''(?x)
              ^(?P<initial>\s*```)
              (?P<lang>[\w\-]+)
              (?P<afterlang>
                  (?P<whitespace>[^\S\n]+)
                  (?P<extra>.*))?
              (?P<newline>\n)
              (?P<code>(.|\n)*?)
              (?P<terminator>^\s*```$\n)
              ''',
             _handle_codeblock),

            include('inline'),
        ],
        'inline': [
            # escape
            (r'\\.', Text),
            # inline code
            (r'([^`]?)(`[^`\n]+`)', bygroups(Text, String.Backtick)),
            # warning: the following rules eat outer tags.
            # eg. **foo _bar_ baz** => foo and baz are not recognized as bold
            # bold fenced by '**'
            (r'([^\*]?)(\*\*[^* \n][^*\n]*\*\*)', bygroups(Text, Generic.Strong)),
            # bold fenced by '__'
            (r'([^_]?)(__[^_ \n][^_\n]*__)', bygroups(Text, Generic.Strong)),
            # italics fenced by '*'
            (r'([^\*]?)(\*[^* \n][^*\n]*\*)', bygroups(Text, Generic.Emph)),
            # italics fenced by '_'
            (r'([^_]?)(_[^_ \n][^_\n]*_)', bygroups(Text, Generic.Emph)),
            # strikethrough
            (r'([^~]?)(~~[^~ \n][^~\n]*~~)', bygroups(Text, Generic.Deleted)),
            # mentions and topics (twitter and github stuff)
            (r'[@#][\w/:]+', Name.Entity),
            # (image?) links eg: ![Image of Yaktocat](https://octodex.github.com/images/yaktocat.png)
            (r'(!?\[)([^]]+)(\])(\()([^)]+)(\))',
             bygroups(Text, Name.Tag, Text, Text, Name.Attribute, Text)),
            # reference-style links, e.g.:
            #   [an example][id]
            #   [id]: http://example.com/
            (r'(\[)([^]]+)(\])(\[)([^]]*)(\])',
             bygroups(Text, Name.Tag, Text, Text, Name.Label, Text)),
            (r'^(\s*\[)([^]]*)(\]:\s*)(.+)',
             bygroups(Text, Name.Label, Text, Name.Attribute)),

            # general text, must come last!
            (r'[^\\\s]+', Text),
            (r'.', Text),
        ],
    }

    def __init__(self, **options):
        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
        RegexLexer.__init__(self, **options)
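
# Fenced-block sketch (hypothetical input): for "```python\nprint(1)\n```\n",
# _handle_codeblock yields the fences and language name as String.Backtick
# and delegates the body to the Python lexer; with handlecodeblocks=False,
# or when no lexer matches the name, the body is emitted as a single String
# token instead.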


class TiddlyWiki5Lexer(RegexLexer):
    """
    For TiddlyWiki5 markup.

    .. versionadded:: 2.7
    """
    name = 'tiddler'
    url = 'https://tiddlywiki.com/#TiddlerFiles'
    aliases = ['tid']
    filenames = ['*.tid']
    mimetypes = ["text/vnd.tiddlywiki"]
    flags = re.MULTILINE

    def _handle_codeblock(self, match):
        """
        match args: 1:backticks, 2:lang_name, 3:newline, 4:code, 5:backticks
        """
        from pygments.lexers import get_lexer_by_name

        # section header
        yield match.start(1), String, match.group(1)
        yield match.start(2), String, match.group(2)
        yield match.start(3), Text, match.group(3)

        # lookup lexer if wanted and existing
        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name(match.group(2).strip())
            except ClassNotFound:
                pass
        code = match.group(4)

        # no lexer for this language. handle it like it was a code block
        if lexer is None:
            yield match.start(4), String, code
            return

        yield from do_insertions([], lexer.get_tokens_unprocessed(code))

        yield match.start(5), String, match.group(5)

    def _handle_cssblock(self, match):
        """
        match args: 1:style tag 2:newline, 3:code, 4:closing style tag
        """
        from pygments.lexers import get_lexer_by_name

        # section header
        yield match.start(1), String, match.group(1)
        yield match.start(2), String, match.group(2)

        lexer = None
        if self.handlecodeblocks:
            try:
                lexer = get_lexer_by_name('css')
            except ClassNotFound:
                pass
        code = match.group(3)

        # no lexer for this language. handle it like it was a code block
        if lexer is None:
            yield match.start(3), String, code
            return

        yield from do_insertions([], lexer.get_tokens_unprocessed(code))

        yield match.start(4), String, match.group(4)

    tokens = {
        'root': [
            # title in metadata section
            (r'^(title)(:\s)(.+\n)', bygroups(Keyword, Text, Generic.Heading)),
            # headings
            (r'^(!)([^!].+\n)', bygroups(Generic.Heading, Text)),
            (r'^(!{2,6})(.+\n)', bygroups(Generic.Subheading, Text)),
            # bulleted or numbered lists or single-line block quotes
            # (can be mixed)
            (r'^(\s*)([*#>]+)(\s*)(.+\n)',
             bygroups(Text, Keyword, Text, using(this, state='inline'))),
            # multi-line block quotes
            (r'^(<<<.*\n)([\w\W]*?)(^<<<.*$)', bygroups(String, Text, String)),
            # table header
            (r'^(\|.*?\|h)$', bygroups(Generic.Strong)),
            # table footer or caption
            (r'^(\|.*?\|[cf])$', bygroups(Generic.Emph)),
            # table class
            (r'^(\|.*?\|k)$', bygroups(Name.Tag)),
            # definitions
            (r'^(;.*)$', bygroups(Generic.Strong)),
            # text block
            (r'^(```\n)([\w\W]*?)(^```$)', bygroups(String, Text, String)),
            # code block with language
            (r'^(```)(\w+)(\n)([\w\W]*?)(^```$)', _handle_codeblock),
            # CSS style block
            (r'^(<style>)(\n)([\w\W]*?)(^</style>$)', _handle_cssblock),

            include('keywords'),
            include('inline'),
        ],
        'keywords': [
            (words((
                '\\define', '\\end', 'caption', 'created', 'modified', 'tags',
                'title', 'type'), prefix=r'^', suffix=r'\b'),
             Keyword),
        ],
        'inline': [
            # escape
            (r'\\.', Text),
            # created or modified date
            (r'\d{17}', Number.Integer),
            # italics
            (r'(\s)(//[^/]+//)((?=\W|\n))',
             bygroups(Text, Generic.Emph, Text)),
            # superscript
            (r'(\s)(\^\^[^\^]+\^\^)', bygroups(Text, Generic.Emph)),
            # subscript
            (r'(\s)(,,[^,]+,,)', bygroups(Text, Generic.Emph)),
            # underscore
            (r'(\s)(__[^_]+__)', bygroups(Text, Generic.Strong)),
            # bold
            (r"(\s)(''[^']+'')((?=\W|\n))",
             bygroups(Text, Generic.Strong, Text)),
            # strikethrough
            (r'(\s)(~~[^~]+~~)((?=\W|\n))',
             bygroups(Text, Generic.Deleted, Text)),
            # TiddlyWiki variables
            (r'<<[^>]+>>', Name.Tag),
            (r'\$\$[^$]+\$\$', Name.Tag),
            (r'\$\([^)]+\)\$', Name.Tag),
            # TiddlyWiki style or class
            (r'^@@.*$', Name.Tag),
            # HTML tags
            (r'</?[^>]+>', Name.Tag),
            # inline code
            (r'`[^`]+`', String.Backtick),
            # HTML escaped symbols
            (r'&\S*?;', String.Regex),
            # Wiki links
            (r'(\[{2})([^]\|]+)(\]{2})', bygroups(Text, Name.Tag, Text)),
            # External links
            (r'(\[{2})([^]\|]+)(\|)([^]\|]+)(\]{2})',
             bygroups(Text, Name.Tag, Text, Name.Attribute, Text)),
            # Transclusion
            (r'(\{{2})([^}]+)(\}{2})', bygroups(Text, Name.Tag, Text)),
            # URLs
            (r'(\b.?.?tps?://[^\s"]+)', bygroups(Name.Attribute)),

            # general text, must come last!
            (r'[\w]+', Text),
            (r'.', Text)
        ],
    }

    def __init__(self, **options):
        self.handlecodeblocks = get_bool_opt(options, 'handlecodeblocks', True)
        RegexLexer.__init__(self, **options)
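
# Usage sketch (hypothetical tiddler exercising metadata, a heading, and a
# delegated code block):
#
#     tid = "title: Demo\n\n! Heading\n\n```python\nprint(1)\n```\n"
#     for tok, val in TiddlyWiki5Lexer().get_tokens(tid):
#         print(tok, repr(val))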


class WikitextLexer(RegexLexer):
    """
    For MediaWiki Wikitext.

    Parsing Wikitext is tricky, and results vary between different MediaWiki
    installations, so we only highlight common syntaxes (built-in or from
    popular extensions), and also assume templates produce no unbalanced
    syntaxes.

    .. versionadded:: 2.15
    """
    name = 'Wikitext'
    url = 'https://www.mediawiki.org/wiki/Wikitext'
    aliases = ['wikitext', 'mediawiki']
    filenames = []
    mimetypes = ['text/x-wiki']
    flags = re.MULTILINE

    def nowiki_tag_rules(tag_name):
        return [
            (r'(?i)(</)({})(\s*)(>)'.format(tag_name), bygroups(Punctuation,
             Name.Tag, Whitespace, Punctuation), '#pop'),
            include('entity'),
            include('text'),
        ]

    def plaintext_tag_rules(tag_name):
        return [
            (r'(?si)(.*?)(</)({})(\s*)(>)'.format(tag_name), bygroups(Text,
             Punctuation, Name.Tag, Whitespace, Punctuation), '#pop'),
        ]

    def delegate_tag_rules(tag_name, lexer):
        return [
            (r'(?i)(</)({})(\s*)(>)'.format(tag_name), bygroups(Punctuation,
             Name.Tag, Whitespace, Punctuation), '#pop'),
            (r'(?si).+?(?=</{}\s*>)'.format(tag_name), using(lexer)),
        ]

    def text_rules(token):
        return [
            (r'\w+', token),
            (r'[^\S\n]+', token),
            (r'(?s).', token),
        ]
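
    # Helper sketch: these rule factories run at class-definition time. For
    # example, delegate_tag_rules('math', TexLexer) expands to two rules: one
    # that pops at '</math>' and one that hands everything before the closing
    # tag to the TexLexer via using().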

    def handle_syntaxhighlight(self, match, ctx):
        from pygments.lexers import get_lexer_by_name

        attr_content = match.group()
        start = 0
        index = 0
        while True:
            index = attr_content.find('>', start)
            # Exclude comment end (-->)
            if attr_content[index-2:index] != '--':
                break
            start = index + 1

        if index == -1:
            # No tag end
            yield from self.get_tokens_unprocessed(attr_content, stack=['root', 'attr'])
            return
        attr = attr_content[:index]
        yield from self.get_tokens_unprocessed(attr, stack=['root', 'attr'])
        yield match.start(3) + index, Punctuation, '>'

        lexer = None
        content = attr_content[index+1:]
        lang_match = re.findall(r'\blang=("|\'|)(\w+)(\1)', attr)

        if len(lang_match) >= 1:
            # Pick the last match in case of multiple matches
            lang = lang_match[-1][1]
            try:
                lexer = get_lexer_by_name(lang)
            except ClassNotFound:
                pass

        if lexer is None:
            yield match.start() + index + 1, Text, content
        else:
            yield from lexer.get_tokens_unprocessed(content)

    def handle_score(self, match, ctx):
        attr_content = match.group()
        start = 0
        index = 0
        while True:
            index = attr_content.find('>', start)
            # Exclude comment end (-->)
            if attr_content[index-2:index] != '--':
                break
            start = index + 1

        if index == -1:
            # No tag end
            yield from self.get_tokens_unprocessed(attr_content, stack=['root', 'attr'])
            return
        attr = attr_content[:index]
        content = attr_content[index+1:]
        yield from self.get_tokens_unprocessed(attr, stack=['root', 'attr'])
        yield match.start(3) + index, Punctuation, '>'

        lang_match = re.findall(r'\blang=("|\'|)(\w+)(\1)', attr)
        # Pick the last match in case of multiple matches
        lang = lang_match[-1][1] if len(lang_match) >= 1 else 'lilypond'

        if lang == 'lilypond':  # Case sensitive
            yield from LilyPondLexer().get_tokens_unprocessed(content)
        else:  # ABC
            # FIXME: Use ABC lexer in the future
            yield match.start() + index + 1, Text, content
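
    # Callback sketch: handle_syntaxhighlight/handle_score receive the raw
    # match covering everything between the tag name and the closing tag
    # (attributes, '>', and the body). They re-lex the attribute part with
    # the 'attr' state and pick a content lexer from the optional
    # lang="..." attribute, using the last occurrence when it is repeated.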

    # a-z removed to prevent linter from complaining, REMEMBER to use (?i)
    title_char = r' %!"$&\'()*,\-./0-9:;=?@A-Z\\\^_`~+\u0080-\uFFFF'
    nbsp_char = r'(?:\t|&nbsp;|&\#0*160;|&\#[Xx]0*[Aa]0;|[ \xA0\u1680\u2000-\u200A\u202F\u205F\u3000])'
    link_address = r'(?:[0-9.]+|\[[0-9a-f:.]+\]|[^\x00-\x20"<>\[\]\x7F\xA0\u1680\u2000-\u200A\u202F\u205F\u3000\uFFFD])'
    link_char_class = r'[^\x00-\x20"<>\[\]\x7F\xA0\u1680\u2000-\u200A\u202F\u205F\u3000\uFFFD]'
    double_slashes_i = {
        '__FORCETOC__', '__NOCONTENTCONVERT__', '__NOCC__', '__NOEDITSECTION__', '__NOGALLERY__',
        '__NOTITLECONVERT__', '__NOTC__', '__NOTOC__', '__TOC__',
    }
    double_slashes = {
        '__EXPECTUNUSEDCATEGORY__', '__HIDDENCAT__', '__INDEX__', '__NEWSECTIONLINK__',
        '__NOINDEX__', '__NONEWSECTIONLINK__', '__STATICREDIRECT__', '__NOGLOBAL__',
        '__DISAMBIG__', '__EXPECTED_UNCONNECTED_PAGE__',
    }
    protocols = {
        'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://', 'https://',
        'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:', 'nntp://', 'redis://',
        'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://', 'svn://', 'tel:', 'telnet://', 'urn:',
        'worldwind://', 'xmpp:', '//',
    }
    non_relative_protocols = protocols - {'//'}
    html_tags = {
        'abbr', 'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', 'caption', 'center', 'cite', 'code',
        'data', 'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'font', 'h1', 'h2', 'h3', 'h4', 'h5',
        'h6', 'hr', 'i', 'ins', 'kbd', 'li', 'link', 'mark', 'meta', 'ol', 'p', 'q', 'rb', 'rp',
        'rt', 'rtc', 'ruby', 's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
        'table', 'td', 'th', 'time', 'tr', 'tt', 'u', 'ul', 'var', 'wbr',
    }
    parser_tags = {
        'graph', 'charinsert', 'rss', 'chem', 'categorytree', 'nowiki', 'inputbox', 'math',
        'hiero', 'score', 'pre', 'ref', 'translate', 'imagemap', 'templatestyles', 'languages',
        'noinclude', 'mapframe', 'section', 'poem', 'syntaxhighlight', 'includeonly', 'tvar',
        'onlyinclude', 'templatedata', 'langconvert', 'timeline', 'dynamicpagelist', 'gallery',
        'maplink', 'ce', 'references',
    }
    variant_langs = {
        # ZhConverter.php
        'zh', 'zh-hans', 'zh-hant', 'zh-cn', 'zh-hk', 'zh-mo', 'zh-my', 'zh-sg', 'zh-tw',
        # UnConverter.php
        'uz', 'uz-latn', 'uz-cyrl',
        # TlyConverter.php
        'tly', 'tly-cyrl',
        # TgConverter.php
        'tg', 'tg-latn',
        # SrConverter.php
        'sr', 'sr-ec', 'sr-el',
        # ShiConverter.php
        'shi', 'shi-tfng', 'shi-latn',
        # ShConverter.php
        'sh-latn', 'sh-cyrl',
        # KuConverter.php
        'ku', 'ku-arab', 'ku-latn',
        # KkConverter.php
        'kk', 'kk-cyrl', 'kk-latn', 'kk-arab', 'kk-kz', 'kk-tr', 'kk-cn',
        # IuConverter.php
        'iu', 'ike-cans', 'ike-latn',
        # GanConverter.php
        'gan', 'gan-hans', 'gan-hant',
        # EnConverter.php
        'en', 'en-x-piglatin',
        # CrhConverter.php
        'crh', 'crh-cyrl', 'crh-latn',
        # BanConverter.php
        'ban', 'ban-bali', 'ban-x-dharma', 'ban-x-palmleaf', 'ban-x-pku',
    }
    magic_vars_i = {
        'ARTICLEPATH', 'INT', 'PAGEID', 'SCRIPTPATH', 'SERVER', 'SERVERNAME', 'STYLEPATH',
    }
    magic_vars = {
        '!', '=', 'BASEPAGENAME', 'BASEPAGENAMEE', 'CASCADINGSOURCES', 'CONTENTLANGUAGE',
        'CONTENTLANG', 'CURRENTDAY', 'CURRENTDAY2', 'CURRENTDAYNAME', 'CURRENTDOW', 'CURRENTHOUR',
        'CURRENTMONTH', 'CURRENTMONTH2', 'CURRENTMONTH1', 'CURRENTMONTHABBREV', 'CURRENTMONTHNAME',
        'CURRENTMONTHNAMEGEN', 'CURRENTTIME', 'CURRENTTIMESTAMP', 'CURRENTVERSION', 'CURRENTWEEK',
        'CURRENTYEAR', 'DIRECTIONMARK', 'DIRMARK', 'FULLPAGENAME', 'FULLPAGENAMEE', 'LOCALDAY',
        'LOCALDAY2', 'LOCALDAYNAME', 'LOCALDOW', 'LOCALHOUR', 'LOCALMONTH', 'LOCALMONTH2',
        'LOCALMONTH1', 'LOCALMONTHABBREV', 'LOCALMONTHNAME', 'LOCALMONTHNAMEGEN', 'LOCALTIME',
        'LOCALTIMESTAMP', 'LOCALWEEK', 'LOCALYEAR', 'NAMESPACE', 'NAMESPACEE', 'NAMESPACENUMBER',
        'NUMBEROFACTIVEUSERS', 'NUMBEROFADMINS', 'NUMBEROFARTICLES', 'NUMBEROFEDITS',
        'NUMBEROFFILES', 'NUMBEROFPAGES', 'NUMBEROFUSERS', 'PAGELANGUAGE', 'PAGENAME', 'PAGENAMEE',
        'REVISIONDAY', 'REVISIONDAY2', 'REVISIONID', 'REVISIONMONTH', 'REVISIONMONTH1',
        'REVISIONSIZE', 'REVISIONTIMESTAMP', 'REVISIONUSER', 'REVISIONYEAR', 'ROOTPAGENAME',
        'ROOTPAGENAMEE', 'SITENAME', 'SUBJECTPAGENAME', 'ARTICLEPAGENAME', 'SUBJECTPAGENAMEE',
        'ARTICLEPAGENAMEE', 'SUBJECTSPACE', 'ARTICLESPACE', 'SUBJECTSPACEE', 'ARTICLESPACEE',
        'SUBPAGENAME', 'SUBPAGENAMEE', 'TALKPAGENAME', 'TALKPAGENAMEE', 'TALKSPACE', 'TALKSPACEE',
    }
    parser_functions_i = {
        'ANCHORENCODE', 'BIDI', 'CANONICALURL', 'CANONICALURLE', 'FILEPATH', 'FORMATNUM',
        'FULLURL', 'FULLURLE', 'GENDER', 'GRAMMAR', 'INT', r'\#LANGUAGE', 'LC', 'LCFIRST',
        'LOCALURL', 'LOCALURLE', 'NS', 'NSE', 'PADLEFT', 'PADRIGHT', 'PAGEID', 'PLURAL', 'UC',
        'UCFIRST', 'URLENCODE',
    }
    parser_functions = {
        'BASEPAGENAME', 'BASEPAGENAMEE', 'CASCADINGSOURCES', 'DEFAULTSORT', 'DEFAULTSORTKEY',
        'DEFAULTCATEGORYSORT', 'FULLPAGENAME', 'FULLPAGENAMEE', 'NAMESPACE', 'NAMESPACEE',
        'NAMESPACENUMBER', 'NUMBERINGROUP', 'NUMINGROUP', 'NUMBEROFACTIVEUSERS', 'NUMBEROFADMINS',
        'NUMBEROFARTICLES', 'NUMBEROFEDITS', 'NUMBEROFFILES', 'NUMBEROFPAGES', 'NUMBEROFUSERS',
        'PAGENAME', 'PAGENAMEE', 'PAGESINCATEGORY', 'PAGESINCAT', 'PAGESIZE', 'PROTECTIONEXPIRY',
        'PROTECTIONLEVEL', 'REVISIONDAY', 'REVISIONDAY2', 'REVISIONID', 'REVISIONMONTH',
        'REVISIONMONTH1', 'REVISIONTIMESTAMP', 'REVISIONUSER', 'REVISIONYEAR', 'ROOTPAGENAME',
        'ROOTPAGENAMEE', 'SUBJECTPAGENAME', 'ARTICLEPAGENAME', 'SUBJECTPAGENAMEE',
        'ARTICLEPAGENAMEE', 'SUBJECTSPACE', 'ARTICLESPACE', 'SUBJECTSPACEE', 'ARTICLESPACEE',
        'SUBPAGENAME', 'SUBPAGENAMEE', 'TALKPAGENAME', 'TALKPAGENAMEE', 'TALKSPACE', 'TALKSPACEE',
        'INT', 'DISPLAYTITLE', 'PAGESINNAMESPACE', 'PAGESINNS',
    }
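
    # Note on the vocabularies above: most of them are spliced into patterns
    # with '|'.join(...) below, so entries containing regex metacharacters
    # must be pre-escaped (hence r'\#LANGUAGE' in parser_functions_i); only
    # the double-underscore magic words go through words(), which escapes
    # automatically.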

    tokens = {
        'root': [
            # Redirects
            (r"""(?xi)
                (\A\s*?)(\#REDIRECT:?) # may contain a colon
                (\s+)(\[\[) (?=[^\]\n]* \]\]$)
             """,
             bygroups(Whitespace, Keyword, Whitespace, Punctuation), 'redirect-inner'),
            # Subheadings
            (r'^(={2,6})(.+?)(\1)(\s*$\n)',
             bygroups(Generic.Subheading, Generic.Subheading, Generic.Subheading, Whitespace)),
            # Headings
            (r'^(=.+?=)(\s*$\n)',
             bygroups(Generic.Heading, Whitespace)),
            # Double-slashed magic words
            (words(double_slashes_i, prefix=r'(?i)'), Name.Function.Magic),
            (words(double_slashes), Name.Function.Magic),
            # Raw URLs
            (r'(?i)\b(?:{}){}{}*'.format('|'.join(protocols),
             link_address, link_char_class), Name.Label),
            # Magic links
            (r'\b(?:RFC|PMID){}+[0-9]+\b'.format(nbsp_char),
             Name.Function.Magic),
            (r"""(?x)
                \bISBN {nbsp_char}
                (?: 97[89] {nbsp_dash}? )?
                (?: [0-9] {nbsp_dash}? ){{9}} # escape format()
                [0-9Xx]\b
            """.format(nbsp_char=nbsp_char, nbsp_dash=f'(?:-|{nbsp_char})'), Name.Function.Magic),
            include('list'),
            include('inline'),
            include('text'),
        ],
        'redirect-inner': [
            (r'(\]\])(\s*?\n)', bygroups(Punctuation, Whitespace), '#pop'),
            (r'(\#)([^#]*?)', bygroups(Punctuation, Name.Label)),
            (r'(?i)[{}]+'.format(title_char), Name.Tag),
        ],
        'list': [
            # Description lists
            (r'^;', Keyword, 'dt'),
            # Ordered lists, unordered lists and indents
            (r'^[#:*]+', Keyword),
            # Horizontal rules
            (r'^-{4,}', Keyword),
        ],
        'inline': [
            # Signatures
            (r'~{3,5}', Keyword),
            # Entities
            include('entity'),
            # Bold & italic
            (r"('')(''')(?!')", bygroups(Generic.Emph,
             Generic.EmphStrong), 'inline-italic-bold'),
            (r"'''(?!')", Generic.Strong, 'inline-bold'),
            (r"''(?!')", Generic.Emph, 'inline-italic'),
            # Comments & parameters & templates
            include('replaceable'),
            # Media links
            (
                r"""(?xi)
                (\[\[)
                    (File|Image) (:)
                    ((?: [%s] | \{{2,3}[^{}]*?\}{2,3} | <!--[\s\S]*?--> )*)
                    (?: (\#) ([%s]*?) )?
                """ % (title_char, f'{title_char}#'),
                bygroups(Punctuation, Name.Namespace, Punctuation,
                         using(this, state=['wikilink-name']), Punctuation, Name.Label),
                'medialink-inner'
            ),
            # Wikilinks
            (
                r"""(?xi)
                (\[\[)(?!%s) # Should not contain URLs
                    (?: ([%s]*) (:))?
                    ((?: [%s] | \{{2,3}[^{}]*?\}{2,3} | <!--[\s\S]*?--> )*?)
                    (?: (\#) ([%s]*?) )?
                (\]\])
                """ % ('|'.join(protocols), title_char.replace('/', ''),
                       title_char, f'{title_char}#'),
                bygroups(Punctuation, Name.Namespace, Punctuation,
                         using(this, state=['wikilink-name']), Punctuation, Name.Label, Punctuation)
            ),
            (
                r"""(?xi)
                (\[\[)(?!%s)
                    (?: ([%s]*) (:))?
                    ((?: [%s] | \{{2,3}[^{}]*?\}{2,3} | <!--[\s\S]*?--> )*?)
                    (?: (\#) ([%s]*?) )?
                (\|)
                """ % ('|'.join(protocols), title_char.replace('/', ''),
                       title_char, f'{title_char}#'),
                bygroups(Punctuation, Name.Namespace, Punctuation,
                         using(this, state=['wikilink-name']), Punctuation, Name.Label, Punctuation),
                'wikilink-inner'
            ),
            # External links
            (
                r"""(?xi)
                (\[)
                    ((?:{}) {} {}*)
                    (\s*)
                """.format('|'.join(protocols), link_address, link_char_class),
                bygroups(Punctuation, Name.Label, Whitespace),
                'extlink-inner'
            ),
            # Tables
            (r'^(:*)(\s*?)(\{\|)([^\n]*)$', bygroups(Keyword,
             Whitespace, Punctuation, using(this, state=['root', 'attr'])), 'table'),
            # HTML tags
            (r'(?i)(<)({})\b'.format('|'.join(html_tags)),
             bygroups(Punctuation, Name.Tag), 'tag-inner-ordinary'),
            (r'(?i)(</)({})\b(\s*)(>)'.format('|'.join(html_tags)),
             bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
            # <nowiki>
            (r'(?i)(<)(nowiki)\b', bygroups(Punctuation,
             Name.Tag), ('tag-nowiki', 'tag-inner')),
            # <pre>
            (r'(?i)(<)(pre)\b', bygroups(Punctuation,
             Name.Tag), ('tag-pre', 'tag-inner')),
            # <categorytree>
            (r'(?i)(<)(categorytree)\b', bygroups(
                Punctuation, Name.Tag), ('tag-categorytree', 'tag-inner')),
            # <hiero>
            (r'(?i)(<)(hiero)\b', bygroups(Punctuation,
             Name.Tag), ('tag-hiero', 'tag-inner')),
            # <math>
            (r'(?i)(<)(math)\b', bygroups(Punctuation,
             Name.Tag), ('tag-math', 'tag-inner')),
            # <chem>
            (r'(?i)(<)(chem)\b', bygroups(Punctuation,
             Name.Tag), ('tag-chem', 'tag-inner')),
            # <ce>
            (r'(?i)(<)(ce)\b', bygroups(Punctuation,
             Name.Tag), ('tag-ce', 'tag-inner')),
            # <charinsert>
            (r'(?i)(<)(charinsert)\b', bygroups(
                Punctuation, Name.Tag), ('tag-charinsert', 'tag-inner')),
            # <templatedata>
            (r'(?i)(<)(templatedata)\b', bygroups(
                Punctuation, Name.Tag), ('tag-templatedata', 'tag-inner')),
            # <gallery>
            (r'(?i)(<)(gallery)\b', bygroups(
                Punctuation, Name.Tag), ('tag-gallery', 'tag-inner')),
            # <graph>
            (r'(?i)(<)(graph)\b', bygroups(
                Punctuation, Name.Tag), ('tag-graph', 'tag-inner')),
            # <dynamicpagelist>
            (r'(?i)(<)(dynamicpagelist)\b', bygroups(
                Punctuation, Name.Tag), ('tag-dynamicpagelist', 'tag-inner')),
            # <inputbox>
            (r'(?i)(<)(inputbox)\b', bygroups(
                Punctuation, Name.Tag), ('tag-inputbox', 'tag-inner')),
            # <rss>
            (r'(?i)(<)(rss)\b', bygroups(
                Punctuation, Name.Tag), ('tag-rss', 'tag-inner')),
            # <imagemap>
            (r'(?i)(<)(imagemap)\b', bygroups(
                Punctuation, Name.Tag), ('tag-imagemap', 'tag-inner')),
            # <syntaxhighlight>
            (r'(?i)(</)(syntaxhighlight)\b(\s*)(>)',
             bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
            (r'(?si)(<)(syntaxhighlight)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
             bygroups(Punctuation, Name.Tag, handle_syntaxhighlight)),
            # <syntaxhighlight>: Fallback case for self-closing tags
            (r'(?i)(<)(syntaxhighlight)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
                Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
            # <source>
            (r'(?i)(</)(source)\b(\s*)(>)',
             bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
            (r'(?si)(<)(source)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
             bygroups(Punctuation, Name.Tag, handle_syntaxhighlight)),
            # <source>: Fallback case for self-closing tags
            (r'(?i)(<)(source)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
                Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
            # <score>
            (r'(?i)(</)(score)\b(\s*)(>)',
             bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
            (r'(?si)(<)(score)\b([^>]*?(?<!/)>.*?)(?=</\2\s*>)',
             bygroups(Punctuation, Name.Tag, handle_score)),
            # <score>: Fallback case for self-closing tags
            (r'(?i)(<)(score)\b(\s*?)((?:[^>]|-->)*?)(/\s*?(?<!--)>)', bygroups(
                Punctuation, Name.Tag, Whitespace, using(this, state=['root', 'attr']), Punctuation)),
            # Other parser tags
            (r'(?i)(<)({})\b'.format('|'.join(parser_tags)),
             bygroups(Punctuation, Name.Tag), 'tag-inner-ordinary'),
            (r'(?i)(</)({})\b(\s*)(>)'.format('|'.join(parser_tags)),
             bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
            # LanguageConverter markups
            (
                r"""(?xi)
                (-\{{) # Escape format()
                (?: ([^|]) (\|))?
                (?: (\s* (?:{variants}) \s*) (=>))?
                (\s* (?:{variants}) \s*) (:)
                """.format(variants='|'.join(variant_langs)),
                bygroups(Punctuation, Keyword, Punctuation,
                         Name.Label, Operator, Name.Label, Punctuation),
                'lc-inner'
            ),
            (r'-\{(?!\{)', Punctuation, 'lc-raw'),
        ],
        'wikilink-name': [
            include('replaceable'),
            (r'[^{<]+', Name.Tag),
            (r'(?s).', Name.Tag),
        ],
        'wikilink-inner': [
            # Quit in case of another wikilink
            (r'(?=\[\[)', Punctuation, '#pop'),
            (r'\]\]', Punctuation, '#pop'),
            include('inline'),
            include('text'),
        ],
        'medialink-inner': [
            (r'\]\]', Punctuation, '#pop'),
            (r'(\|)([^\n=|]*)(=)',
             bygroups(Punctuation, Name.Attribute, Operator)),
            (r'\|', Punctuation),
            include('inline'),
            include('text'),
        ],
        'quote-common': [
            # Quit in case of link/template endings
            (r'(?=\]\]|\{\{|\}\})', Punctuation, '#pop'),
            (r'\n', Text, '#pop'),
        ],
        'inline-italic': [
            include('quote-common'),
            (r"('')(''')(?!')", bygroups(Generic.Emph,
             Generic.Strong), ('#pop', 'inline-bold')),
            (r"'''(?!')", Generic.EmphStrong, ('#pop', 'inline-italic-bold')),
            (r"''(?!')", Generic.Emph, '#pop'),
            include('inline'),
            include('text-italic'),
        ],
        'inline-bold': [
            include('quote-common'),
            (r"(''')('')(?!')", bygroups(
                Generic.Strong, Generic.Emph), ('#pop', 'inline-italic')),
            (r"'''(?!')", Generic.Strong, '#pop'),
            (r"''(?!')", Generic.EmphStrong, ('#pop', 'inline-bold-italic')),
            include('inline'),
            include('text-bold'),
        ],
        'inline-bold-italic': [
            include('quote-common'),
            (r"('')(''')(?!')", bygroups(Generic.EmphStrong,
             Generic.Strong), '#pop'),
            (r"'''(?!')", Generic.EmphStrong, ('#pop', 'inline-italic')),
            (r"''(?!')", Generic.EmphStrong, ('#pop', 'inline-bold')),
            include('inline'),
            include('text-bold-italic'),
        ],
        'inline-italic-bold': [
            include('quote-common'),
            (r"(''')('')(?!')", bygroups(
                Generic.EmphStrong, Generic.Emph), '#pop'),
            (r"'''(?!')", Generic.EmphStrong, ('#pop', 'inline-italic')),
            (r"''(?!')", Generic.EmphStrong, ('#pop', 'inline-bold')),
            include('inline'),
            include('text-bold-italic'),
        ],
        'lc-inner': [
            (
                r"""(?xi)
                (;)
                (?: (\s* (?:{variants}) \s*) (=>))?
                (\s* (?:{variants}) \s*) (:)
                """.format(variants='|'.join(variant_langs)),
                bygroups(Punctuation, Name.Label,
                         Operator, Name.Label, Punctuation)
            ),
            (r';?\s*?\}-', Punctuation, '#pop'),
            include('inline'),
            include('text'),
        ],
        'lc-raw': [
            (r'\}-', Punctuation, '#pop'),
            include('inline'),
            include('text'),
        ],
        'replaceable': [
            # Comments
            (r'<!--[\s\S]*?(?:-->|\Z)', Comment.Multiline),
            # Parameters
            (
                r"""(?x)
                (\{{3})
                    ([^|]*?)
                (?=\}{3}|\|)
                """,
                bygroups(Punctuation, Name.Variable),
                'parameter-inner',
            ),
            # Magic variables
            (r'(?i)(\{\{)(\s*)(%s)(\s*)(\}\})' % '|'.join(magic_vars_i),
             bygroups(Punctuation, Whitespace, Name.Function, Whitespace, Punctuation)),
            (r'(\{\{)(\s*)(%s)(\s*)(\}\})' % '|'.join(magic_vars),
             bygroups(Punctuation, Whitespace, Name.Function, Whitespace, Punctuation)),
            # Parser functions & templates
            (r'\{\{', Punctuation, 'template-begin-space'),
            # <tvar> legacy syntax
            (r'(?i)(<)(tvar)\b(\|)([^>]*?)(>)', bygroups(Punctuation,
             Name.Tag, Punctuation, String, Punctuation)),
            (r'</>', Punctuation, '#pop'),
            # <tvar>
            (r'(?i)(<)(tvar)\b', bygroups(Punctuation, Name.Tag), 'tag-inner-ordinary'),
            (r'(?i)(</)(tvar)\b(\s*)(>)',
             bygroups(Punctuation, Name.Tag, Whitespace, Punctuation)),
        ],
        'parameter-inner': [
            (r'\}{3}', Punctuation, '#pop'),
            (r'\|', Punctuation),
            include('inline'),
            include('text'),
        ],
        'template-begin-space': [
            # Templates allow line breaks at the beginning, and due to how MediaWiki handles
            # comments, an extra state is required to handle things like {{\n<!---->\n name}}
            (r'<!--[\s\S]*?(?:-->|\Z)', Comment.Multiline),
            (r'\s+', Whitespace),
            # Parser functions
            (
                r'(?i)(\#[%s]*?|%s)(:)' % (title_char,
                                           '|'.join(parser_functions_i)),
                bygroups(Name.Function, Punctuation), ('#pop', 'template-inner')
            ),
            (
                r'(%s)(:)' % ('|'.join(parser_functions)),
                bygroups(Name.Function, Punctuation), ('#pop', 'template-inner')
            ),
            # Templates
            (
                r'(?i)([%s]*?)(:)' % title_char,
                bygroups(Name.Namespace, Punctuation), ('#pop', 'template-name')
            ),
            default(('#pop', 'template-name'),),
        ],
        'template-name': [
            (r'(\s*?)(\|)', bygroups(Text, Punctuation), ('#pop', 'template-inner')),
            (r'\}\}', Punctuation, '#pop'),
            (r'\n', Text, '#pop'),
            include('replaceable'),
            *text_rules(Name.Tag),
        ],
        'template-inner': [
            (r'\}\}', Punctuation, '#pop'),
            (r'\|', Punctuation),
            (
                r"""(?x)
                (?<=\|)
                ( (?: (?! \{\{ | \}\} )[^=\|<])*? ) # Exclude templates and tags
                (=)
                """,
                bygroups(Name.Label, Operator)
            ),
            include('inline'),
            include('text'),
        ],
        'table': [
            # Use [ \t\n\r\0\x0B] instead of \s to follow PHP trim() behavior
            # Endings
            (r'^([ \t\n\r\0\x0B]*?)(\|\})',
             bygroups(Whitespace, Punctuation), '#pop'),
            # Table rows
            (r'^([ \t\n\r\0\x0B]*?)(\|-+)(.*)$', bygroups(Whitespace, Punctuation,
             using(this, state=['root', 'attr']))),
            # Captions
            (
                r"""(?x)
                ^([ \t\n\r\0\x0B]*?)(\|\+)
                # Exclude links, template and tags
                (?: ( (?: (?! \[\[ | \{\{ )[^|\n<] )*? )(\|) )?
                (.*?)$
                """,
                bygroups(Whitespace, Punctuation, using(this, state=[
                         'root', 'attr']), Punctuation, Generic.Heading),
            ),
            # Table data
            (
                r"""(?x)
                ( ^(?:[ \t\n\r\0\x0B]*?)\| | \|\| )
                (?: ( (?: (?! \[\[ | \{\{ )[^|\n<] )*? )(\|)(?!\|) )?
                """,
                bygroups(Punctuation, using(this, state=[
                         'root', 'attr']), Punctuation),
            ),
            # Table headers
            (
                r"""(?x)
                ( ^(?:[ \t\n\r\0\x0B]*?)! )
                (?: ( (?: (?! \[\[ | \{\{ )[^|\n<] )*? )(\|)(?!\|) )?
                """,
                bygroups(Punctuation, using(this, state=[
                         'root', 'attr']), Punctuation),
                'table-header',
            ),
            include('list'),
            include('inline'),
            include('text'),
        ],
        'table-header': [
            # Requires another state for || handling inside headers
            (r'\n', Text, '#pop'),
            (
                r"""(?x)
                (!!|\|\|)
                (?:
                    ( (?: (?! \[\[ | \{\{ )[^|\n<] )*? )
                    (\|)(?!\|)
                )?
                """,
                bygroups(Punctuation, using(this, state=[
                         'root', 'attr']), Punctuation)
            ),
            *text_rules(Generic.Subheading),
        ],
        'entity': [
            (r'&\S*?;', Name.Entity),
        ],
        'dt': [
            (r'\n', Text, '#pop'),
            include('inline'),
            (r':', Keyword, '#pop'),
            include('text'),
        ],
        'extlink-inner': [
            (r'\]', Punctuation, '#pop'),
            include('inline'),
            include('text'),
        ],
        'nowiki-ish': [
            include('entity'),
            include('text'),
        ],
        'attr': [
            include('replaceable'),
            (r'\s+', Whitespace),
            (r'(=)(\s*)(")', bygroups(Operator, Whitespace, String.Double), 'attr-val-2'),
            (r"(=)(\s*)(')", bygroups(Operator, Whitespace, String.Single), 'attr-val-1'),
            (r'(=)(\s*)', bygroups(Operator, Whitespace), 'attr-val-0'),
            (r'[\w:-]+', Name.Attribute),

        ],
        'attr-val-0': [
            (r'\s', Whitespace, '#pop'),
            include('replaceable'),
            *text_rules(String),
        ],
        'attr-val-1': [
            (r"'", String.Single, '#pop'),
            include('replaceable'),
            *text_rules(String.Single),
        ],
        'attr-val-2': [
            (r'"', String.Double, '#pop'),
            include('replaceable'),
            *text_rules(String.Double),
        ],
        'tag-inner-ordinary': [
            (r'/?\s*>', Punctuation, '#pop'),
            include('tag-attr'),
        ],
        'tag-inner': [
            # Return to root state for self-closing tags
            (r'/\s*>', Punctuation, '#pop:2'),
            (r'\s*>', Punctuation, '#pop'),
            include('tag-attr'),
        ],
        # The states below are just like their non-tag variants; the key
        # difference is that they forcibly quit when encountering tag-closing
        # markup
        'tag-attr': [
            include('replaceable'),
            (r'\s+', Whitespace),
            (r'(=)(\s*)(")', bygroups(Operator,
             Whitespace, String.Double), 'tag-attr-val-2'),
            (r"(=)(\s*)(')", bygroups(Operator,
             Whitespace, String.Single), 'tag-attr-val-1'),
            (r'(=)(\s*)', bygroups(Operator, Whitespace), 'tag-attr-val-0'),
            (r'[\w:-]+', Name.Attribute),

        ],
        'tag-attr-val-0': [
            (r'\s', Whitespace, '#pop'),
            (r'/?>', Punctuation, '#pop:2'),
            include('replaceable'),
            *text_rules(String),
        ],
        'tag-attr-val-1': [
            (r"'", String.Single, '#pop'),
            (r'/?>', Punctuation, '#pop:2'),
            include('replaceable'),
            *text_rules(String.Single),
        ],
        'tag-attr-val-2': [
            (r'"', String.Double, '#pop'),
            (r'/?>', Punctuation, '#pop:2'),
            include('replaceable'),
            *text_rules(String.Double),
        ],
        'tag-nowiki': nowiki_tag_rules('nowiki'),
        'tag-pre': nowiki_tag_rules('pre'),
        'tag-categorytree': plaintext_tag_rules('categorytree'),
        'tag-dynamicpagelist': plaintext_tag_rules('dynamicpagelist'),
        'tag-hiero': plaintext_tag_rules('hiero'),
        'tag-inputbox': plaintext_tag_rules('inputbox'),
        'tag-imagemap': plaintext_tag_rules('imagemap'),
        'tag-charinsert': plaintext_tag_rules('charinsert'),
        'tag-timeline': plaintext_tag_rules('timeline'),
        'tag-gallery': plaintext_tag_rules('gallery'),
        'tag-graph': plaintext_tag_rules('graph'),
        'tag-rss': plaintext_tag_rules('rss'),
        'tag-math': delegate_tag_rules('math', TexLexer),
        'tag-chem': delegate_tag_rules('chem', TexLexer),
        'tag-ce': delegate_tag_rules('ce', TexLexer),
        'tag-templatedata': delegate_tag_rules('templatedata', JsonLexer),
        'text-italic': text_rules(Generic.Emph),
        'text-bold': text_rules(Generic.Strong),
        'text-bold-italic': text_rules(Generic.EmphStrong),
        'text': text_rules(Text),
    }
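
# End-to-end sketch (hypothetical snippet using only the public pygments API):
#
#     from pygments import highlight
#     from pygments.formatters import TerminalFormatter
#     src = "== Heading ==\n{{Infobox|name=Demo}}\n[[Main Page|a link]]\n"
#     print(highlight(src, WikitextLexer(), TerminalFormatter()))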