Simple regex-based lexer/parser for inline markup
- Requires Python 3
Example:
import re
from pprint import pprint

from reparser import Parser, Token, MatchGroup

# Characters allowed to sit directly next to a markup tag. The doubled
# braces ``{{}}`` are deliberate: this string is embedded in templates that
# are later passed through str.format(), which collapses them to ``{}``.
boundary_chars = r'\s`!()\[\]{{}};:\'".,<>?«»“”‘’*_~='
b_left = r'(?:(?<=[' + boundary_chars + r'])|(?<=^))'  # Lookbehind
b_right = r'(?:(?=[' + boundary_chars + r'])|(?=$))'   # Lookahead

# Templates with a ``{tag}`` placeholder, filled in by markdown() below.
# Opening tag: preceded by a boundary character (or start of text), not
# preceded by a backslash, not followed by whitespace or by the tag again.
markdown_start = b_left + r'(?<!\\){tag}(?!\s)(?!{tag})'
# Closing tag: not preceded by the tag itself, whitespace or a backslash,
# and followed by a boundary character (or end of text).
markdown_end = r'(?<!{tag})(?<!\s)(?<!\\){tag}' + b_right
# ``[text](url)`` whose opening bracket is not preceded by a backslash;
# exposes the named groups ``link`` and ``url``.
markdown_link = r'(?<!\\)\[(?P<link>.+?)\]\((?P<url>.+?)\)'
newline = r'\n|\r\n'

# Matches a leading protocol such as ``http://`` (case-insensitive).
url_proto_regex = re.compile(r'(?i)^[a-z][\w-]+:/{1,3}')


def markdown(tag):
    """Return sequence of start and end regex patterns for simple Markdown tag

    ``tag`` is inserted verbatim into the regex templates, so it must
    already be regex-escaped (e.g. ``r'\\*\\*'`` for ``**``).
    """
    return (markdown_start.format(tag=tag), markdown_end.format(tag=tag))


def url_complete(url):
    """If URL doesn't start with protocol, prepend it with http://"""
    return url if url_proto_regex.search(url) else 'http://' + url


# Longer tags are listed before the shorter tags they contain (``***``
# before ``**`` before ``*``) -- presumably so the longer form is tried
# first; confirm against the reparser documentation.
tokens = [
    Token('bi1', *markdown(r'\*\*\*'), is_bold=True, is_italic=True),
    Token('bi2', *markdown(r'___'), is_bold=True, is_italic=True),
    Token('b1', *markdown(r'\*\*'), is_bold=True),
    Token('b2', *markdown(r'__'), is_bold=True),
    Token('i1', *markdown(r'\*'), is_italic=True),
    Token('i2', *markdown(r'_'), is_italic=True),
    Token('pre3', *markdown(r'```'), skip=True),
    Token('pre2', *markdown(r'``'), skip=True),
    Token('pre1', *markdown(r'`'), skip=True),
    Token('s', *markdown(r'~~'), is_strikethrough=True),
    Token('u', *markdown(r'=='), is_underline=True),
    Token('link', markdown_link, text=MatchGroup('link'),
          link_target=MatchGroup('url', func=url_complete)),
    Token('br', newline, text='\n', segment_type="LINE_BREAK")
]

parser = Parser(tokens)
text = ('Hello **bold** world!\n'
        'You can **try *this* awesome** [link](www.eff.org).')

segments = parser.parse(text)
pprint([(segment.text, segment.params) for segment in segments])
Output:
[('Hello ', {}),
 ('bold', {'is_bold': True}),
 (' world!', {}),
 ('\n', {'segment_type': 'LINE_BREAK'}),
 ('You can ', {}),
 ('try ', {'is_bold': True}),
 ('this', {'is_bold': True, 'is_italic': True}),
 (' awesome', {'is_bold': True}),
 (' ', {}),
 ('link', {'link_target': 'http://www.eff.org'}),
 ('.', {})]