/lib/cpython-doc/tools/docutils/parsers/rst/states.py
Python | 3008 lines | 2912 code | 21 blank | 75 comment | 48 complexity | 494a8d8114df0149de8ee8fef642e535 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0, GPL-2.0
Large files are truncated, but you can click here to view the full file
- # $Id: states.py 78909 2010-03-13 10:49:23Z georg.brandl $
- # Author: David Goodger <goodger@python.org>
- # Copyright: This module has been placed in the public domain.
- """
- This is the ``docutils.parsers.restructuredtext.states`` module, the core of
- the reStructuredText parser. It defines the following:
- :Classes:
- - `RSTStateMachine`: reStructuredText parser's entry point.
- - `NestedStateMachine`: recursive StateMachine.
- - `RSTState`: reStructuredText State superclass.
- - `Inliner`: For parsing inline markup.
- - `Body`: Generic classifier of the first line of a block.
- - `SpecializedBody`: Superclass for compound element members.
- - `BulletList`: Second and subsequent bullet_list list_items
- - `DefinitionList`: Second+ definition_list_items.
- - `EnumeratedList`: Second+ enumerated_list list_items.
- - `FieldList`: Second+ fields.
- - `OptionList`: Second+ option_list_items.
- - `RFC2822List`: Second+ RFC2822-style fields.
- - `ExtensionOptions`: Parses directive option fields.
- - `Explicit`: Second+ explicit markup constructs.
- - `SubstitutionDef`: For embedded directives in substitution definitions.
- - `Text`: Classifier of second line of a text block.
- - `SpecializedText`: Superclass for continuation lines of Text-variants.
- - `Definition`: Second line of potential definition_list_item.
- - `Line`: Second line of overlined section title or transition marker.
- - `Struct`: An auxiliary collection class.
- :Exception classes:
- - `MarkupError`
- - `ParserError`
- - `MarkupMismatch`
- :Functions:
- - `escape2null()`: Return a string, escape-backslashes converted to nulls.
- - `unescape()`: Return a string, nulls removed or restored to backslashes.
- :Attributes:
- - `state_classes`: set of State classes used with `RSTStateMachine`.
- Parser Overview
- ===============
- The reStructuredText parser is implemented as a recursive state machine,
- examining its input one line at a time. To understand how the parser works,
- please first become familiar with the `docutils.statemachine` module. In the
- description below, references are made to classes defined in this module;
- please see the individual classes for details.
- Parsing proceeds as follows:
- 1. The state machine examines each line of input, checking each of the
- transition patterns of the state `Body`, in order, looking for a match.
- The implicit transitions (blank lines and indentation) are checked before
- any others. The 'text' transition is a catch-all (matches anything).
- 2. The method associated with the matched transition pattern is called.
- A. Some transition methods are self-contained, appending elements to the
- document tree (`Body.doctest` parses a doctest block). The parser's
- current line index is advanced to the end of the element, and parsing
- continues with step 1.
- B. Other transition methods trigger the creation of a nested state machine,
- whose job is to parse a compound construct ('indent' does a block quote,
- 'bullet' does a bullet list, 'overline' does a section [first checking
- for a valid section header], etc.).
- - In the case of lists and explicit markup, a one-off state machine is
- created and run to parse contents of the first item.
- - A new state machine is created and its initial state is set to the
- appropriate specialized state (`BulletList` in the case of the
- 'bullet' transition; see `SpecializedBody` for more detail). This
- state machine is run to parse the compound element (or series of
- explicit markup elements), and returns as soon as a non-member element
- is encountered. For example, the `BulletList` state machine ends as
- soon as it encounters an element which is not a list item of that
- bullet list. The optional omission of inter-element blank lines is
- enabled by this nested state machine.
- - The current line index is advanced to the end of the elements parsed,
- and parsing continues with step 1.
- C. The result of the 'text' transition depends on the next line of text.
- The current state is changed to `Text`, under which the second line is
- examined. If the second line is:
- - Indented: The element is a definition list item, and parsing proceeds
- similarly to step 2.B, using the `DefinitionList` state.
- - A line of uniform punctuation characters: The element is a section
- header; again, parsing proceeds as in step 2.B, and `Body` is still
- used.
- - Anything else: The element is a paragraph, which is examined for
- inline markup and appended to the parent element. Processing
- continues with step 1.
- """
- __docformat__ = 'reStructuredText'
- import sys
- import re
- import roman
- from types import FunctionType, MethodType
- from docutils import nodes, statemachine, utils, urischemes
- from docutils import ApplicationError, DataError
- from docutils.statemachine import StateMachineWS, StateWS
- from docutils.nodes import fully_normalize_name as normalize_name
- from docutils.nodes import whitespace_normalize_name
- from docutils.utils import escape2null, unescape, column_width
- import docutils.parsers.rst
- from docutils.parsers.rst import directives, languages, tableparser, roles
- from docutils.parsers.rst.languages import en as _fallback_language_module
class MarkupError(DataError):
    """Error raised for invalid reStructuredText markup."""

class UnknownInterpretedRoleError(DataError):
    """Error raised for an unknown interpreted text role."""

class InterpretedRoleNotImplementedError(DataError):
    """Error raised for an interpreted text role with no implementation."""

class ParserError(ApplicationError):
    """Internal parser error."""

class MarkupMismatch(Exception):
    """Raised to reject a candidate implicit inline markup match
    (caught in `Inliner.implicit_inline`)."""
class Struct:

    """Stores data attributes for dotted-attribute access."""

    def __init__(self, **keywordargs):
        # Install every keyword argument directly as an instance attribute.
        vars(self).update(keywordargs)
class RSTStateMachine(StateMachineWS):

    """
    reStructuredText's master StateMachine.

    The entry point to reStructuredText parsing is the `run()` method.
    """

    def run(self, input_lines, document, input_offset=0, match_titles=1,
            inliner=None):
        """
        Parse `input_lines` and modify the `document` node in place.

        Extend `StateMachineWS.run()`: set up parse-global data and
        run the StateMachine.
        """
        self.language = languages.get_language(
            document.settings.language_code)
        self.match_titles = match_titles
        if inliner is None:
            inliner = Inliner()
        inliner.init_customizations(document.settings)
        # `memo` carries parse-global data; it is shared with nested state
        # machines via `NestedStateMachine.run()` and with states via
        # `RSTState.runtime_init()`.
        self.memo = Struct(document=document,
                           reporter=document.reporter,
                           language=self.language,
                           title_styles=[],
                           section_level=0,
                           section_bubble_up_kludge=0,
                           inliner=inliner)
        self.document = document
        self.attach_observer(document.note_source)
        self.reporter = self.memo.reporter
        self.node = document
        results = StateMachineWS.run(self, input_lines, input_offset,
                                     input_source=document['source'])
        assert results == [], 'RSTStateMachine.run() results should be empty!'
        self.node = self.memo = None        # remove unneeded references
class NestedStateMachine(StateMachineWS):

    """
    StateMachine run from within other StateMachine runs, to parse nested
    document structures.
    """

    def run(self, input_lines, input_offset, memo, node, match_titles=1):
        """
        Parse `input_lines` and populate a `docutils.nodes.document` instance.

        Extend `StateMachineWS.run()`: set up document-wide data.
        """
        self.match_titles = match_titles
        # Adopt the parse-global data of the enclosing state machine.
        self.memo = memo
        self.document = memo.document
        self.attach_observer(self.document.note_source)
        self.reporter = memo.reporter
        self.language = memo.language
        self.node = node
        results = StateMachineWS.run(self, input_lines, input_offset)
        assert results == [], ('NestedStateMachine.run() results should be '
                               'empty!')
        return results
class RSTState(StateWS):

    """
    reStructuredText State superclass.

    Contains methods used by all State subclasses.
    """

    nested_sm = NestedStateMachine
    # Cache of reusable nested state machines (default class & kwargs only):
    nested_sm_cache = []

    def __init__(self, state_machine, debug=0):
        self.nested_sm_kwargs = {'state_classes': state_classes,
                                 'initial_state': 'Body'}
        StateWS.__init__(self, state_machine, debug)

    def runtime_init(self):
        """Copy parse-global references out of the memo for direct access."""
        StateWS.runtime_init(self)
        memo = self.state_machine.memo
        self.memo = memo
        self.reporter = memo.reporter
        self.inliner = memo.inliner
        self.document = memo.document
        self.parent = self.state_machine.node

    def goto_line(self, abs_line_offset):
        """
        Jump to input line `abs_line_offset`, ignoring jumps past the end.
        """
        try:
            self.state_machine.goto_line(abs_line_offset)
        except EOFError:
            pass

    def no_match(self, context, transitions):
        """
        Override `StateWS.no_match` to generate a system message.

        This code should never be run.
        """
        self.reporter.severe(
            'Internal error: no transition pattern match. State: "%s"; '
            'transitions: %s; context: %s; current line: %r.'
            % (self.__class__.__name__, transitions, context,
               self.state_machine.line),
            line=self.state_machine.abs_line_number())
        return context, None, []

    def bof(self, context):
        """Called at beginning of file."""
        return [], []

    def nested_parse(self, block, input_offset, node, match_titles=0,
                     state_machine_class=None, state_machine_kwargs=None):
        """
        Create a new StateMachine rooted at `node` and run it over the input
        `block`.

        Returns the new absolute line offset after parsing.
        """
        use_default = 0
        if state_machine_class is None:
            state_machine_class = self.nested_sm
            use_default += 1
        if state_machine_kwargs is None:
            state_machine_kwargs = self.nested_sm_kwargs
            use_default += 1
        block_length = len(block)

        state_machine = None
        # Only fully-default state machines are interchangeable, so only
        # those may be taken from (and returned to) the cache:
        if use_default == 2:
            try:
                state_machine = self.nested_sm_cache.pop()
            except IndexError:
                pass
        if not state_machine:
            state_machine = state_machine_class(debug=self.debug,
                                                **state_machine_kwargs)
        state_machine.run(block, input_offset, memo=self.memo,
                          node=node, match_titles=match_titles)
        if use_default == 2:
            self.nested_sm_cache.append(state_machine)
        else:
            state_machine.unlink()
        new_offset = state_machine.abs_line_offset()
        # No `block.parent` implies disconnected -- lines aren't in sync:
        if block.parent and (len(block) - block_length) != 0:
            # Adjustment for block if modified in nested parse:
            self.state_machine.next_line(len(block) - block_length)
        return new_offset

    def nested_list_parse(self, block, input_offset, node, initial_state,
                          blank_finish,
                          blank_finish_state=None,
                          extra_settings={},
                          match_titles=0,
                          state_machine_class=None,
                          state_machine_kwargs=None):
        """
        Create a new StateMachine rooted at `node` and run it over the input
        `block`.  Also keep track of optional intermediate blank lines and the
        required final one.

        Returns (new absolute line offset, blank_finish flag).
        """
        # NOTE: `extra_settings` has a mutable default, but it is only read
        # (via .items()) below, never mutated.
        if state_machine_class is None:
            state_machine_class = self.nested_sm
        if state_machine_kwargs is None:
            state_machine_kwargs = self.nested_sm_kwargs.copy()
        state_machine_kwargs['initial_state'] = initial_state
        state_machine = state_machine_class(debug=self.debug,
                                            **state_machine_kwargs)
        if blank_finish_state is None:
            blank_finish_state = initial_state
        state_machine.states[blank_finish_state].blank_finish = blank_finish
        for key, value in extra_settings.items():
            setattr(state_machine.states[initial_state], key, value)
        state_machine.run(block, input_offset, memo=self.memo,
                          node=node, match_titles=match_titles)
        blank_finish = state_machine.states[blank_finish_state].blank_finish
        state_machine.unlink()
        return state_machine.abs_line_offset(), blank_finish

    def section(self, title, source, style, lineno, messages):
        """Check for a valid subsection and create one if it checks out."""
        if self.check_subsection(source, style, lineno):
            self.new_subsection(title, lineno, messages)

    def check_subsection(self, source, style, lineno):
        """
        Check for a valid subsection header.  Return 1 (true) or None (false).

        When a new section is reached that isn't a subsection of the current
        section, back up the line count (use ``previous_line(-x)``), then
        ``raise EOFError``.  The current StateMachine will finish, then the
        calling StateMachine can re-examine the title.  This will work its way
        back up the calling chain until the correct section level is reached.

        @@@ Alternative: Evaluate the title, store the title info & level, and
        back up the chain until that level is reached.  Store in memo?  Or
        return in results?

        :Exception: `EOFError` when a sibling or supersection encountered.
        """
        memo = self.memo
        title_styles = memo.title_styles
        mylevel = memo.section_level
        try:                            # check for existing title style
            level = title_styles.index(style) + 1
        except ValueError:              # new title style
            if len(title_styles) == memo.section_level: # new subsection
                title_styles.append(style)
                return 1
            else:                       # not at lowest level
                self.parent += self.title_inconsistent(source, lineno)
                return None
        if level <= mylevel:            # sibling or supersection
            memo.section_level = level  # bubble up to parent section
            if len(style) == 2:
                memo.section_bubble_up_kludge = 1
            # back up 2 lines for underline title, 3 for overline title
            self.state_machine.previous_line(len(style) + 1)
            raise EOFError              # let parent section re-evaluate
        if level == mylevel + 1:        # immediate subsection
            return 1
        else:                           # invalid subsection
            self.parent += self.title_inconsistent(source, lineno)
            return None

    def title_inconsistent(self, sourcetext, lineno):
        """Return a severe system_message reporting a bad title level."""
        error = self.reporter.severe(
            'Title level inconsistent:', nodes.literal_block('', sourcetext),
            line=lineno)
        return error

    def new_subsection(self, title, lineno, messages):
        """Append new subsection to document tree. On return, check level."""
        memo = self.memo
        mylevel = memo.section_level
        memo.section_level += 1
        section_node = nodes.section()
        self.parent += section_node
        textnodes, title_messages = self.inline_text(title, lineno)
        titlenode = nodes.title(title, '', *textnodes)
        name = normalize_name(titlenode.astext())
        section_node['names'].append(name)
        section_node += titlenode
        section_node += messages
        section_node += title_messages
        self.document.note_implicit_target(section_node, section_node)
        # Parse the remainder of the input inside the new section node:
        offset = self.state_machine.line_offset + 1
        absoffset = self.state_machine.abs_line_offset() + 1
        newabsoffset = self.nested_parse(
            self.state_machine.input_lines[offset:], input_offset=absoffset,
            node=section_node, match_titles=1)
        self.goto_line(newabsoffset)
        if memo.section_level <= mylevel: # can't handle next section?
            raise EOFError              # bubble up to supersection
        # reset section_level; next pass will detect it properly
        memo.section_level = mylevel

    def paragraph(self, lines, lineno):
        """
        Return a list (paragraph & messages) & a boolean: literal_block next?
        """
        data = '\n'.join(lines).rstrip()
        # An unescaped '::' at the end of the paragraph announces a
        # following literal block:
        if re.search(r'(?<!\\)(\\\\)*::$', data):
            if len(data) == 2:          # paragraph is '::' only
                return [], 1
            elif data[-3] in ' \n':     # ' ::' -> drop the marker entirely
                text = data[:-3].rstrip()
            else:                       # 'text::' -> keep one colon
                text = data[:-1]
            literalnext = 1
        else:
            text = data
            literalnext = 0
        textnodes, messages = self.inline_text(text, lineno)
        p = nodes.paragraph(data, '', *textnodes)
        p.line = lineno
        return [p] + messages, literalnext

    def inline_text(self, text, lineno):
        """
        Return 2 lists: nodes (text and inline elements), and system_messages.
        """
        return self.inliner.parse(text, lineno, self.memo, self.parent)

    def unindent_warning(self, node_name):
        """Return a system_message warning about an unexpected unindent."""
        return self.reporter.warning(
            '%s ends without a blank line; unexpected unindent.' % node_name,
            line=(self.state_machine.abs_line_number() + 1))
def build_regexp(definition, compile=1):
    """
    Build, compile and return a regular expression based on `definition`.

    :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
        where "parts" is a list of regular expressions and/or regular
        expression definitions to be joined into an or-group.

    If `compile` is false, return the pattern string instead of a compiled
    pattern object (used for nested sub-definitions).
    """
    name, prefix, suffix, parts = definition
    # Expand nested definitions recursively; plain strings pass through.
    part_strings = [build_regexp(part, None) if type(part) is tuple else part
                    for part in parts]
    regexp = '%s(?P<%s>%s)%s' % (prefix, name, '|'.join(part_strings), suffix)
    if compile:
        return re.compile(regexp, re.UNICODE)
    return regexp
- class Inliner:
- """
- Parse inline markup; call the `parse()` method.
- """
- def __init__(self):
- self.implicit_dispatch = [(self.patterns.uri, self.standalone_uri),]
- """List of (pattern, bound method) tuples, used by
- `self.implicit_inline`."""
- def init_customizations(self, settings):
- """Setting-based customizations; run when parsing begins."""
- if settings.pep_references:
- self.implicit_dispatch.append((self.patterns.pep,
- self.pep_reference))
- if settings.rfc_references:
- self.implicit_dispatch.append((self.patterns.rfc,
- self.rfc_reference))
- def parse(self, text, lineno, memo, parent):
- # Needs to be refactored for nested inline markup.
- # Add nested_parse() method?
- """
- Return 2 lists: nodes (text and inline elements), and system_messages.
- Using `self.patterns.initial`, a pattern which matches start-strings
- (emphasis, strong, interpreted, phrase reference, literal,
- substitution reference, and inline target) and complete constructs
- (simple reference, footnote reference), search for a candidate. When
- one is found, check for validity (e.g., not a quoted '*' character).
- If valid, search for the corresponding end string if applicable, and
- check it for validity. If not found or invalid, generate a warning
- and ignore the start-string. Implicit inline markup (e.g. standalone
- URIs) is found last.
- """
- self.reporter = memo.reporter
- self.document = memo.document
- self.language = memo.language
- self.parent = parent
- pattern_search = self.patterns.initial.search
- dispatch = self.dispatch
- remaining = escape2null(text)
- processed = []
- unprocessed = []
- messages = []
- while remaining:
- match = pattern_search(remaining)
- if match:
- groups = match.groupdict()
- method = dispatch[groups['start'] or groups['backquote']
- or groups['refend'] or groups['fnend']]
- before, inlines, remaining, sysmessages = method(self, match,
- lineno)
- unprocessed.append(before)
- messages += sysmessages
- if inlines:
- processed += self.implicit_inline(''.join(unprocessed),
- lineno)
- processed += inlines
- unprocessed = []
- else:
- break
- remaining = ''.join(unprocessed) + remaining
- if remaining:
- processed += self.implicit_inline(remaining, lineno)
- return processed, messages
- openers = u'\'"([{<\u2018\u201c\xab\u00a1\u00bf' # see quoted_start below
- closers = u'\'")]}>\u2019\u201d\xbb!?'
- unicode_delimiters = u'\u2010\u2011\u2012\u2013\u2014\u00a0'
- start_string_prefix = (u'((?<=^)|(?<=[-/: \\n\u2019%s%s]))'
- % (re.escape(unicode_delimiters),
- re.escape(openers)))
- end_string_suffix = (r'((?=$)|(?=[-/:.,; \n\x00%s%s]))'
- % (re.escape(unicode_delimiters),
- re.escape(closers)))
- non_whitespace_before = r'(?<![ \n])'
- non_whitespace_escape_before = r'(?<![ \n\x00])'
- non_whitespace_after = r'(?![ \n])'
- # Alphanumerics with isolated internal [-._+:] chars (i.e. not 2 together):
- simplename = r'(?:(?!_)\w)+(?:[-._+:](?:(?!_)\w)+)*'
- # Valid URI characters (see RFC 2396 & RFC 2732);
- # final \x00 allows backslash escapes in URIs:
- uric = r"""[-_.!~*'()[\];/:@&=+$,%a-zA-Z0-9\x00]"""
- # Delimiter indicating the end of a URI (not part of the URI):
- uri_end_delim = r"""[>]"""
- # Last URI character; same as uric but no punctuation:
- urilast = r"""[_~*/=+a-zA-Z0-9]"""
- # End of a URI (either 'urilast' or 'uric followed by a
- # uri_end_delim'):
- uri_end = r"""(?:%(urilast)s|%(uric)s(?=%(uri_end_delim)s))""" % locals()
- emailc = r"""[-_!~*'{|}/#?^`&=+$%a-zA-Z0-9\x00]"""
- email_pattern = r"""
- %(emailc)s+(?:\.%(emailc)s+)* # name
- (?<!\x00)@ # at
- %(emailc)s+(?:\.%(emailc)s*)* # host
- %(uri_end)s # final URI char
- """
- parts = ('initial_inline', start_string_prefix, '',
- [('start', '', non_whitespace_after, # simple start-strings
- [r'\*\*', # strong
- r'\*(?!\*)', # emphasis but not strong
- r'``', # literal
- r'_`', # inline internal target
- r'\|(?!\|)'] # substitution reference
- ),
- ('whole', '', end_string_suffix, # whole constructs
- [# reference name & end-string
- r'(?P<refname>%s)(?P<refend>__?)' % simplename,
- ('footnotelabel', r'\[', r'(?P<fnend>\]_)',
- [r'[0-9]+', # manually numbered
- r'\#(%s)?' % simplename, # auto-numbered (w/ label?)
- r'\*', # auto-symbol
- r'(?P<citationlabel>%s)' % simplename] # citation reference
- )
- ]
- ),
- ('backquote', # interpreted text or phrase reference
- '(?P<role>(:%s:)?)' % simplename, # optional role
- non_whitespace_after,
- ['`(?!`)'] # but not literal
- )
- ]
- )
- patterns = Struct(
- initial=build_regexp(parts),
- emphasis=re.compile(non_whitespace_escape_before
- + r'(\*)' + end_string_suffix),
- strong=re.compile(non_whitespace_escape_before
- + r'(\*\*)' + end_string_suffix),
- interpreted_or_phrase_ref=re.compile(
- r"""
- %(non_whitespace_escape_before)s
- (
- `
- (?P<suffix>
- (?P<role>:%(simplename)s:)?
- (?P<refend>__?)?
- )
- )
- %(end_string_suffix)s
- """ % locals(), re.VERBOSE | re.UNICODE),
- embedded_uri=re.compile(
- r"""
- (
- (?:[ \n]+|^) # spaces or beginning of line/string
- < # open bracket
- %(non_whitespace_after)s
- ([^<>\x00]+) # anything but angle brackets & nulls
- %(non_whitespace_before)s
- > # close bracket w/o whitespace before
- )
- $ # end of string
- """ % locals(), re.VERBOSE),
- literal=re.compile(non_whitespace_before + '(``)'
- + end_string_suffix),
- target=re.compile(non_whitespace_escape_before
- + r'(`)' + end_string_suffix),
- substitution_ref=re.compile(non_whitespace_escape_before
- + r'(\|_{0,2})'
- + end_string_suffix),
- email=re.compile(email_pattern % locals() + '$', re.VERBOSE),
- uri=re.compile(
- (r"""
- %(start_string_prefix)s
- (?P<whole>
- (?P<absolute> # absolute URI
- (?P<scheme> # scheme (http, ftp, mailto)
- [a-zA-Z][a-zA-Z0-9.+-]*
- )
- :
- (
- ( # either:
- (//?)? # hierarchical URI
- %(uric)s* # URI characters
- %(uri_end)s # final URI char
- )
- ( # optional query
- \?%(uric)s*
- %(uri_end)s
- )?
- ( # optional fragment
- \#%(uric)s*
- %(uri_end)s
- )?
- )
- )
- | # *OR*
- (?P<email> # email address
- """ + email_pattern + r"""
- )
- )
- %(end_string_suffix)s
- """) % locals(), re.VERBOSE),
- pep=re.compile(
- r"""
- %(start_string_prefix)s
- (
- (pep-(?P<pepnum1>\d+)(.txt)?) # reference to source file
- |
- (PEP\s+(?P<pepnum2>\d+)) # reference by name
- )
- %(end_string_suffix)s""" % locals(), re.VERBOSE),
- rfc=re.compile(
- r"""
- %(start_string_prefix)s
- (RFC(-|\s+)?(?P<rfcnum>\d+))
- %(end_string_suffix)s""" % locals(), re.VERBOSE))
- def quoted_start(self, match):
- """Return 1 if inline markup start-string is 'quoted', 0 if not."""
- string = match.string
- start = match.start()
- end = match.end()
- if start == 0: # start-string at beginning of text
- return 0
- prestart = string[start - 1]
- try:
- poststart = string[end]
- if self.openers.index(prestart) \
- == self.closers.index(poststart): # quoted
- return 1
- except IndexError: # start-string at end of text
- return 1
- except ValueError: # not quoted
- pass
- return 0
- def inline_obj(self, match, lineno, end_pattern, nodeclass,
- restore_backslashes=0):
- string = match.string
- matchstart = match.start('start')
- matchend = match.end('start')
- if self.quoted_start(match):
- return (string[:matchend], [], string[matchend:], [], '')
- endmatch = end_pattern.search(string[matchend:])
- if endmatch and endmatch.start(1): # 1 or more chars
- text = unescape(endmatch.string[:endmatch.start(1)],
- restore_backslashes)
- textend = matchend + endmatch.end(1)
- rawsource = unescape(string[matchstart:textend], 1)
- return (string[:matchstart], [nodeclass(rawsource, text)],
- string[textend:], [], endmatch.group(1))
- msg = self.reporter.warning(
- 'Inline %s start-string without end-string.'
- % nodeclass.__name__, line=lineno)
- text = unescape(string[matchstart:matchend], 1)
- rawsource = unescape(string[matchstart:matchend], 1)
- prb = self.problematic(text, rawsource, msg)
- return string[:matchstart], [prb], string[matchend:], [msg], ''
- def problematic(self, text, rawsource, message):
- msgid = self.document.set_id(message, self.parent)
- problematic = nodes.problematic(rawsource, text, refid=msgid)
- prbid = self.document.set_id(problematic)
- message.add_backref(prbid)
- return problematic
- def emphasis(self, match, lineno):
- before, inlines, remaining, sysmessages, endstring = self.inline_obj(
- match, lineno, self.patterns.emphasis, nodes.emphasis)
- return before, inlines, remaining, sysmessages
- def strong(self, match, lineno):
- before, inlines, remaining, sysmessages, endstring = self.inline_obj(
- match, lineno, self.patterns.strong, nodes.strong)
- return before, inlines, remaining, sysmessages
- def interpreted_or_phrase_ref(self, match, lineno):
- end_pattern = self.patterns.interpreted_or_phrase_ref
- string = match.string
- matchstart = match.start('backquote')
- matchend = match.end('backquote')
- rolestart = match.start('role')
- role = match.group('role')
- position = ''
- if role:
- role = role[1:-1]
- position = 'prefix'
- elif self.quoted_start(match):
- return (string[:matchend], [], string[matchend:], [])
- endmatch = end_pattern.search(string[matchend:])
- if endmatch and endmatch.start(1): # 1 or more chars
- textend = matchend + endmatch.end()
- if endmatch.group('role'):
- if role:
- msg = self.reporter.warning(
- 'Multiple roles in interpreted text (both '
- 'prefix and suffix present; only one allowed).',
- line=lineno)
- text = unescape(string[rolestart:textend], 1)
- prb = self.problematic(text, text, msg)
- return string[:rolestart], [prb], string[textend:], [msg]
- role = endmatch.group('suffix')[1:-1]
- position = 'suffix'
- escaped = endmatch.string[:endmatch.start(1)]
- rawsource = unescape(string[matchstart:textend], 1)
- if rawsource[-1:] == '_':
- if role:
- msg = self.reporter.warning(
- 'Mismatch: both interpreted text role %s and '
- 'reference suffix.' % position, line=lineno)
- text = unescape(string[rolestart:textend], 1)
- prb = self.problematic(text, text, msg)
- return string[:rolestart], [prb], string[textend:], [msg]
- return self.phrase_ref(string[:matchstart], string[textend:],
- rawsource, escaped, unescape(escaped))
- else:
- rawsource = unescape(string[rolestart:textend], 1)
- nodelist, messages = self.interpreted(rawsource, escaped, role,
- lineno)
- return (string[:rolestart], nodelist,
- string[textend:], messages)
- msg = self.reporter.warning(
- 'Inline interpreted text or phrase reference start-string '
- 'without end-string.', line=lineno)
- text = unescape(string[matchstart:matchend], 1)
- prb = self.problematic(text, text, msg)
- return string[:matchstart], [prb], string[matchend:], [msg]
- def phrase_ref(self, before, after, rawsource, escaped, text):
- match = self.patterns.embedded_uri.search(escaped)
- if match:
- text = unescape(escaped[:match.start(0)])
- uri_text = match.group(2)
- uri = ''.join(uri_text.split())
- uri = self.adjust_uri(uri)
- if uri:
- target = nodes.target(match.group(1), refuri=uri)
- else:
- raise ApplicationError('problem with URI: %r' % uri_text)
- if not text:
- text = uri
- else:
- target = None
- refname = normalize_name(text)
- reference = nodes.reference(rawsource, text,
- name=whitespace_normalize_name(text))
- node_list = [reference]
- if rawsource[-2:] == '__':
- if target:
- reference['refuri'] = uri
- else:
- reference['anonymous'] = 1
- else:
- if target:
- reference['refuri'] = uri
- target['names'].append(refname)
- self.document.note_explicit_target(target, self.parent)
- node_list.append(target)
- else:
- reference['refname'] = refname
- self.document.note_refname(reference)
- return before, node_list, after, []
- def adjust_uri(self, uri):
- match = self.patterns.email.match(uri)
- if match:
- return 'mailto:' + uri
- else:
- return uri
- def interpreted(self, rawsource, text, role, lineno):
- role_fn, messages = roles.role(role, self.language, lineno,
- self.reporter)
- if role_fn:
- nodes, messages2 = role_fn(role, rawsource, text, lineno, self)
- return nodes, messages + messages2
- else:
- msg = self.reporter.error(
- 'Unknown interpreted text role "%s".' % role,
- line=lineno)
- return ([self.problematic(rawsource, rawsource, msg)],
- messages + [msg])
- def literal(self, match, lineno):
- before, inlines, remaining, sysmessages, endstring = self.inline_obj(
- match, lineno, self.patterns.literal, nodes.literal,
- restore_backslashes=1)
- return before, inlines, remaining, sysmessages
- def inline_internal_target(self, match, lineno):
- before, inlines, remaining, sysmessages, endstring = self.inline_obj(
- match, lineno, self.patterns.target, nodes.target)
- if inlines and isinstance(inlines[0], nodes.target):
- assert len(inlines) == 1
- target = inlines[0]
- name = normalize_name(target.astext())
- target['names'].append(name)
- self.document.note_explicit_target(target, self.parent)
- return before, inlines, remaining, sysmessages
- def substitution_reference(self, match, lineno):
- before, inlines, remaining, sysmessages, endstring = self.inline_obj(
- match, lineno, self.patterns.substitution_ref,
- nodes.substitution_reference)
- if len(inlines) == 1:
- subref_node = inlines[0]
- if isinstance(subref_node, nodes.substitution_reference):
- subref_text = subref_node.astext()
- self.document.note_substitution_ref(subref_node, subref_text)
- if endstring[-1:] == '_':
- reference_node = nodes.reference(
- '|%s%s' % (subref_text, endstring), '')
- if endstring[-2:] == '__':
- reference_node['anonymous'] = 1
- else:
- reference_node['refname'] = normalize_name(subref_text)
- self.document.note_refname(reference_node)
- reference_node += subref_node
- inlines = [reference_node]
- return before, inlines, remaining, sysmessages
- def footnote_reference(self, match, lineno):
- """
- Handles `nodes.footnote_reference` and `nodes.citation_reference`
- elements.
- """
- label = match.group('footnotelabel')
- refname = normalize_name(label)
- string = match.string
- before = string[:match.start('whole')]
- remaining = string[match.end('whole'):]
- if match.group('citationlabel'):
- refnode = nodes.citation_reference('[%s]_' % label,
- refname=refname)
- refnode += nodes.Text(label)
- self.document.note_citation_ref(refnode)
- else:
- refnode = nodes.footnote_reference('[%s]_' % label)
- if refname[0] == '#':
- refname = refname[1:]
- refnode['auto'] = 1
- self.document.note_autofootnote_ref(refnode)
- elif refname == '*':
- refname = ''
- refnode['auto'] = '*'
- self.document.note_symbol_footnote_ref(
- refnode)
- else:
- refnode += nodes.Text(label)
- if refname:
- refnode['refname'] = refname
- self.document.note_footnote_ref(refnode)
- if utils.get_trim_footnote_ref_space(self.document.settings):
- before = before.rstrip()
- return (before, [refnode], remaining, [])
- def reference(self, match, lineno, anonymous=None):
- referencename = match.group('refname')
- refname = normalize_name(referencename)
- referencenode = nodes.reference(
- referencename + match.group('refend'), referencename,
- name=whitespace_normalize_name(referencename))
- if anonymous:
- referencenode['anonymous'] = 1
- else:
- referencenode['refname'] = refname
- self.document.note_refname(referencenode)
- string = match.string
- matchstart = match.start('whole')
- matchend = match.end('whole')
- return (string[:matchstart], [referencenode], string[matchend:], [])
- def anonymous_reference(self, match, lineno):
- return self.reference(match, lineno, anonymous=1)
- def standalone_uri(self, match, lineno):
- if (not match.group('scheme')
- or match.group('scheme').lower() in urischemes.schemes):
- if match.group('email'):
- addscheme = 'mailto:'
- else:
- addscheme = ''
- text = match.group('whole')
- unescaped = unescape(text, 0)
- return [nodes.reference(unescape(text, 1), unescaped,
- refuri=addscheme + unescaped)]
- else: # not a valid scheme
- raise MarkupMismatch
- def pep_reference(self, match, lineno):
- text = match.group(0)
- if text.startswith('pep-'):
- pepnum = int(match.group('pepnum1'))
- elif text.startswith('PEP'):
- pepnum = int(match.group('pepnum2'))
- else:
- raise MarkupMismatch
- ref = (self.document.settings.pep_base_url
- + self.document.settings.pep_file_url_template % pepnum)
- unescaped = unescape(text, 0)
- return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
- rfc_url = 'rfc%d.html'
- def rfc_reference(self, match, lineno):
- text = match.group(0)
- if text.startswith('RFC'):
- rfcnum = int(match.group('rfcnum'))
- ref = self.document.settings.rfc_base_url + self.rfc_url % rfcnum
- else:
- raise MarkupMismatch
- unescaped = unescape(text, 0)
- return [nodes.reference(unescape(text, 1), unescaped, refuri=ref)]
- def implicit_inline(self, text, lineno):
- """
- Check each of the patterns in `self.implicit_dispatch` for a match,
- and dispatch to the stored method for the pattern. Recursively check
- the text before and after the match. Return a list of `nodes.Text`
- and inline element nodes.
- """
- if not text:
- return []
- for pattern, method in self.implicit_dispatch:
- match = pattern.search(text)
- if match:
- try:
- # Must recurse on strings before *and* after the match;
- # there may be multiple patterns.
- return (self.implicit_inline(text[:match.start()], lineno)
- + method(match, lineno) +
- self.implicit_inline(text[match.end():], lineno))
- except MarkupMismatch:
- pass
- return [nodes.Text(unescape(text), rawsource=unescape(text, 1))]
    # Inline-markup dispatch table: maps each markup start-string (as
    # captured by the initial inline pattern) to the parser method for
    # that construct (all defined above in this class).
    dispatch = {'*': emphasis,
                '**': strong,
                '`': interpreted_or_phrase_ref,
                '``': literal,
                '_`': inline_internal_target,
                ']_': footnote_reference,
                '|': substitution_reference,
                '_': reference,
                '__': anonymous_reference}
- def _loweralpha_to_int(s, _zero=(ord('a')-1)):
- return ord(s) - _zero
- def _upperalpha_to_int(s, _zero=(ord('A')-1)):
- return ord(s) - _zero
def _lowerroman_to_int(s):
    """Convert a lowercase Roman numeral to an int via the `roman` module
    (which only accepts uppercase numerals)."""
    return roman.fromRoman(s.upper())
class Body(RSTState):

    """
    Generic classifier of the first line of a block.
    """

    # Padding character used by the table parser to normalize East Asian
    # double-width text to single-width columns.
    double_width_pad_char = tableparser.TableParser.double_width_pad_char
    """Padding character for East Asian double-width text."""

    enum = Struct()
    """Enumerated list parsing information."""

    # Per-format enumerator punctuation, plus slice indices (start/end)
    # that strip that punctuation from a matched enumerator string.
    enum.formatinfo = {
          'parens': Struct(prefix='(', suffix=')', start=1, end=-1),
          'rparen': Struct(prefix='', suffix=')', start=0, end=-1),
          'period': Struct(prefix='', suffix='.', start=0, end=-1)}
    enum.formats = enum.formatinfo.keys()
    enum.sequences = ['arabic', 'loweralpha', 'upperalpha',
                      'lowerroman', 'upperroman'] # ORDERED!
    # Regex fragment recognizing one enumerator of each sequence type.
    enum.sequencepats = {'arabic': '[0-9]+',
                         'loweralpha': '[a-z]',
                         'upperalpha': '[A-Z]',
                         'lowerroman': '[ivxlcdm]+',
                         'upperroman': '[IVXLCDM]+',}
    # Callables converting an enumerator string to its ordinal value.
    enum.converters = {'arabic': int,
                       'loweralpha': _loweralpha_to_int,
                       'upperalpha': _upperalpha_to_int,
                       'lowerroman': _lowerroman_to_int,
                       'upperroman': roman.fromRoman}

    # Anchored regexps for validating a complete enumerator string.
    enum.sequenceregexps = {}
    for sequence in enum.sequences:
        enum.sequenceregexps[sequence] = re.compile(
              enum.sequencepats[sequence] + '$')

    grid_table_top_pat = re.compile(r'\+-[-+]+-\+ *$')
    """Matches the top (& bottom) of a full table."""

    simple_table_top_pat = re.compile('=+( +=+)+ *$')
    """Matches the top of a simple table."""

    simple_table_border_pat = re.compile('=+[ =]*$')
    """Matches the bottom & header bottom of a simple table."""

    pats = {}
    """Fragments of patterns used by transitions."""

    pats['nonalphanum7bit'] = '[!-/:-@[-`{-~]'
    pats['alpha'] = '[a-zA-Z]'
    pats['alphanum'] = '[a-zA-Z0-9]'
    pats['alphanumplus'] = '[a-zA-Z0-9_-]'
    pats['enum'] = ('(%(arabic)s|%(loweralpha)s|%(upperalpha)s|%(lowerroman)s'
                    '|%(upperroman)s|#)' % enum.sequencepats)
    pats['optname'] = '%(alphanum)s%(alphanumplus)s*' % pats
    # @@@ Loosen up the pattern?  Allow Unicode?
    pats['optarg'] = '(%(alpha)s%(alphanumplus)s*|<[^<>]+>)' % pats
    pats['shortopt'] = r'(-|\+)%(alphanum)s( ?%(optarg)s)?' % pats
    pats['longopt'] = r'(--|/)%(optname)s([ =]%(optarg)s)?' % pats
    pats['option'] = r'(%(shortopt)s|%(longopt)s)' % pats

    # One named-group pattern per enumerator format, e.g. pats['parens']
    # matches "(1)", "(a)", "(iv)", "(#)", etc.
    for format in enum.formats:
        pats[format] = '(?P<%s>%s%s%s)' % (
              format, re.escape(enum.formatinfo[format].prefix),
              pats['enum'], re.escape(enum.formatinfo[format].suffix))

    # Transition patterns, tried in `initial_transitions` order against
    # the first line of each block; 'text' (empty pattern) is the
    # catch-all that always matches.
    patterns = {
          'bullet': u'[-+*\u2022\u2023\u2043]( +|$)',
          'enumerator': r'(%(parens)s|%(rparen)s|%(period)s)( +|$)' % pats,
          'field_marker': r':(?![: ])([^:\\]|\\.)*(?<! ):( +|$)',
          'option_marker': r'%(option)s(, %(option)s)*( +| ?$)' % pats,
          'doctest': r'>>>( +|$)',
          'line_block': r'\|( +|$)',
          'grid_table_top': grid_table_top_pat,
          'simple_table_top': simple_table_top_pat,
          'explicit_markup': r'\.\.( +|$)',
          'anonymous': r'__( +|$)',
          'line': r'(%(nonalphanum7bit)s)\1* *$' % pats,
          'text': r''}
    initial_transitions = (
          'bullet',
          'enumerator',
          'field_marker',
          'option_marker',
          'doctest',
          'line_block',
          'grid_table_top',
          'simple_table_top',
          'explicit_markup',
          'anonymous',
          'line',
          'text')
- def indent(self, match, context, next_state):
- """Block quote."""
- indented, indent, line_offset, blank_finish = \
- self.state_machine.get_indented()
- elements = self.block_quote(indented, line_offset)
- self.parent += elements
- if not blank_finish:
- self.parent += self.unindent_warning('Block quote')
- return context, next_state, []
- def block_quote(self, indented, line_offset):
- elements = []
- while indented:
- (blockquote_lines,
- attribution_lines,
- attribution_offset,
- indented,
- new_line_offset) = self.split_attribution(indented, line_offset)
- blockquote = nodes.block_quote()
- self.nested_parse(blockquote_lines, line_offset, blockquote)
- elements.append(blockquote)
- if attribution_lines:
- attribution, messages = self.parse_attribution(
- attribution_lines, attribution_offset)
- blockquote += attribution
- elements += messages
- line_offset = new_line_offset
- while indented and not indented[0]:
- indented = indented[1:]
- line_offset += 1
- return elements
    # U+2014 is an em-dash:
    attribution_pattern = re.compile(u'(---?(?!-)|\u2014) *(?=[^ \\n])')

    def split_attribution(self, indented, line_offset):
        """
        Check for a block quote attribution and split it off:

        * First line after a blank line must begin with a dash ("--", "---",
          em-dash; matches `self.attribution_pattern`).
        * Every line after that must have consistent indentation.
        * Attributions must be preceded by block quote content.

        Return a tuple of: (block quote content lines, attribution lines,
        attribution offset, remaining indented lines, remaining lines
        offset).  The last four items are None when no attribution found.
        """
        blank = None                  # index of the most recent blank line
        nonblank_seen = False
        for i in range(len(indented)):
            line = indented[i].rstrip()
            if line:
                # A candidate attribution must follow quote content and
                # come immediately after a blank line.
                if nonblank_seen and blank == i - 1: # last line blank
                    match = self.attribution_pattern.match(line)
                    if match:
                        attribution_end, indent = self.check_attribution(
                            indented, i)
                        if attribution_end:
                            a_lines = indented[i:attribution_end]
                            # Strip the dash from line 1 and the hanging
                            # indent from the rest.
                            a_lines.trim_left(match.end(), end=1)
                            a_lines.trim_left(indent, start=1)
                            return (indented[:i], a_lines,
                                    i, indented[attribution_end:],
                                    line_offset + attribution_end)
                nonblank_seen = True
            else:
                blank = i
        else:
            # No attribution found; everything is quote content.
            return (indented, None, None, None, None)
- def check_attribution(self, indented, attribution_start):
- """
- Check attribution shape.
- Return the index past the end of the attribution, and the indent.
- """
- indent = None
- i = attribution_start + 1
- for i in range(attribution_start + 1, len(indented)):
- line = indented[i].rstrip()
- if not line:
- break
- if indent is None:
- indent = len(line) - len(line.lstrip())
- elif len(line) - len(line.lstrip()) != indent:
- return None, None # bad shape; not an attribution
- else:
- # return index of line after last attribution line:
- i += 1
- return i, (indent or 0)
- def parse_attribution(self, indented, line_offset):
- text = '\n'.join(indented).rstrip()
- lineno = self.state_machine.abs_line_number() + line_offset
- textnodes, messages = self.inline_text(text, lin…
Large files files are truncated, but you can click here to view the full file