/sahriswiki/creoleparser/elements.py
- # elements.py
- # -*- coding: utf-8 -*-
- #
- # Copyright © Stephen Day
- #
- # This module is part of Creoleparser and is released under
- # the MIT License: http://www.opensource.org/licenses/mit-license.php
- #
-
- import re
- import urlparse
- import urllib
- import keyword
- import sys
-
- import genshi.builder as bldr
- from genshi.core import Stream, Markup
-
- from core import (escape_char, esc_neg_look, fragmentize, ImplicitList, P3Template)
-
- BLOCK_ONLY_TAGS = ['h1','h2','h3','h4','h5','h6',
- 'ul','ol','dl',
- 'pre','hr','blockquote','address',
- 'p','div','form','fieldset','table',
- 'noscript']
-
- BLOCK_TAGS = BLOCK_ONLY_TAGS + ['ins','del','script']
-
-
- MACRO_NAME = r'(?P<name>[a-zA-Z]+([-.]?[a-zA-Z0-9]+)*)'
- """allows any number of non-repeating hyphens or periods.
- Underscore is not included because hyphen is"""
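- # Illustrative matches (examples added for clarity; not in the original source):
- # >>> re.match(MACRO_NAME + '$', 'span-2') is None
- # False
- # >>> re.match(MACRO_NAME + '$', 'a--b') is None
- # True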
-
-
- # use Genshi's HTMLSanitizer if possible (i.e., not on Google App Engine)
- try:
- from genshi.filters import HTMLSanitizer
- except ImportError:
- SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])
- class HTMLSanitizer(object):
- def is_safe_uri(self,uri):
- if ':' not in uri:
- return True # This is a relative URI
- chars = [char for char in uri.split(':', 1)[0] if char.isalnum()]
- return ''.join(chars).lower() in SAFE_SCHEMES
-
- sanitizer = HTMLSanitizer()
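- # Illustrative checks (added; the behavior is the same whether genshi's
- # sanitizer or the fallback above is in use):
- # >>> sanitizer.is_safe_uri('http://example.com/page')
- # True
- # >>> sanitizer.is_safe_uri('javascript:alert(1)')
- # False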
-
- __docformat__ = 'restructuredtext en'
-
- class WikiElement(object):
-
- """Baseclass for all wiki elements."""
-
- append_newline = False
- """Determines if newlines are appended to Element(s) during processing.
- Should only affect readability of source xml.
- """
-
- def __init__(self, tag, token, child_elements=None):
- """Constructor for WikiElement objects.
-
- Subclasses may have other keyword arguments.
-
- :parameters:
- tag
- The xhtml tag associated with the element.
- token
- The character string (or strings) that identifies the element
- in wiki markup.
- child_elements
- A list of wiki_elements that will be searched for in the body of the
- element. The order of these elements matters, because if an element is
- found before the element that encloses it, the enclosing element will
- never be found. In cases where this imposes limits (e.g., ``strong`` and
- ``em`` should be allowed to nest each other), place the conflicting
- elements in a sublist. The parser will then find which comes first.
- """
- self.tag = tag
- self.token = token
- if child_elements is None:
- child_elements = []
- self.child_elements = child_elements
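- # For example (hypothetical element names, added for illustration):
- # child_elements = [no_wiki, link, [strong, em], line_break]
- # finds no_wiki and link first, and lets strong/em nest each other
- # because they are grouped in a sublist.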
-
-
- def _build(self,mo,element_store, environ):
- """Returns a genshi Element that has ``self.tag`` as the
- outermost tag.
-
- This method is called exclusively by ``_process``.
-
- :parameters:
- mo
- match object, usually the one returned by
- self.regexp.search(s)
- """
- return bldr.tag.__getattr__(self.tag)(fragmentize(mo.group(1),
- self.child_elements,
- element_store, environ))
-
- def re_string(self):
- """The regular expression pattern that is compiled into ``self.regexp``.
-
- The regular expression must consume the entire wiki element,
- including the tokens. For block elements, the newline on the last
- line must be consumed also. group(1) should normally be the
- entire string inside the tokens. If not, a custom ``_build``
- method will be needed.
- """
- pass
-
-
- def __repr__(self):
- return "<"+self.__class__.__name__ + " " + str(self.tag)+">"
-
-
-
- def _process(self, mos, text, wiki_elements,element_store, environ):
- """Returns genshi Fragments (Elements and text)
-
- This is mainly for block level markup. See InlineElement
- for the other method.
- """
- frags = []
- end = 0
- for mo in mos:
- if end != mo.start():
- # call again for leading text and extend the result list
- frags.extend(fragmentize(text[end:mo.start()],wiki_elements[1:],
- element_store, environ))
- # append the found wiki element to the result list
- built = self._build(mo,element_store, environ)
- if built is not None:
- frags.append(built)
- # make the source output easier to read
- if self.append_newline:
- frags.append('\n')
- end = mo.end()
- # call again for trailing text and extend the result list
- if end < len(text):
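- # drop the current element before searching the trailing text so it
- # is not matched again; a leading list/tuple (grouped alternatives)
- # is kept intact so the parser can still pick whichever member comes first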
- if not isinstance(wiki_elements[0],(list,tuple)):
- wiki_elements = wiki_elements[1:]
- frags.extend(fragmentize(text[end:],wiki_elements,
- element_store, environ))
-
- return frags
-
-
- class BlockElement(WikiElement):
-
- """Block elements inherit form this class
-
- Wiki elements wanting ``append_newline = True`` should use this
- as the base also.
-
- """
-
- append_newline = True
-
-
- class InlineElement(WikiElement):
-
- r"""For finding generic inline elements
-
- >>> em = InlineElement('em','//')
- >>> mo1 = em.regexp.search('a //word// in a line')
- >>> mo2 = em.regexp.search('a //word in a line\n or two\n')
- >>> mo1.group(0),mo1.group(1)
- ('//word//', 'word')
- >>> mo2.group(0),mo2.group(1)
- ('//word in a line\n or two', 'word in a line\n or two')
-
- Use a two-item list for the ``token`` argument to set different start
- and end strings. Elements with distinct start and end tokens must be
- explicitly closed.
-
- >>> foo = InlineElement('foo',['<<','>>'])
- >>> mo = foo.regexp.search('blaa <<here it is >>\n')
- >>> mo.group(1)
- 'here it is '
-
- """
-
- def __init__(self, tag='', token=''):
- super(InlineElement,self).__init__(tag,token)
- self.regexp = re.compile(self.re_string(),re.DOTALL)
-
- def re_string(self):
- if isinstance(self.token,str):
- content = '(.+?)'
- end = '(' + esc_neg_look + re.escape(self.token) + r'|$)'
- return esc_neg_look + re.escape(self.token) + content + end
- else:
- content = '(.+?)'
- return esc_neg_look + re.escape(self.token[0]) + content + esc_neg_look + re.escape(self.token[1])
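- # e.g., for token '//' this compiles to roughly
- # (?<!~)//(.+?)((?<!~)//|$), assuming the default '~' escape character,
- # so an unclosed token still matches to the end of the text (see the
- # doctests above)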
-
- def _process(self, mos, text, wiki_elements, element_store, environ):
- """Returns genshi Fragments (Elements and text)"""
- parts = []
- end = 0
- for mo in mos:
- processed = self._build(mo,element_store, environ)
- store_id = str(id(processed))
- element_store[store_id] = processed
- parts.append(''.join([text[end:mo.start()],'<<<',store_id,'>>>']))
- end = mo.end()
- # call again for trailing text and extend the result list
- if end < len(text):
- parts.append(text[end:])
- new_text = ''.join(parts)
- if not isinstance(wiki_elements[0],(list,tuple)):
- wiki_elements = wiki_elements[1:]
- frags = fragmentize(new_text,wiki_elements,element_store, environ)
- return frags
-
-
- class SimpleElement(InlineElement):
-
- r"""For finding generic inline elements like ``strong`` and ``em``.
-
- >>> em = SimpleElement({'//':'em'})
- >>> mo1 = em.regexp.search('a //word// in a line')
- >>> mo2 = em.regexp.search('a //word in a line\n or two\n')
- >>> mo1.group(0),mo1.group(2)
- ('//word//', 'word')
- >>> mo2.group(0),mo2.group(2)
- ('//word in a line\n or two', 'word in a line\n or two')
-
- """
-
- def __init__(self, token_dict={}):
- self.token_dict = token_dict
- self.tokens = token_dict.keys()
- super(SimpleElement,self).__init__('','')
- self.regexp = re.compile(self.re_string(),re.DOTALL)
-
- def re_string(self):
- if isinstance(self.token,basestring):
- tokens = '(' + '|'.join([re.escape(token) for token in self.tokens]) + ')'
- content = '(.+?)'
- end = '(' + esc_neg_look + r'\1|$)'
- return esc_neg_look + tokens + content + end
-
- def _build(self,mo,element_store, environ):
- return bldr.tag.__getattr__(self.token_dict[mo.group(1)])(fragmentize(mo.group(2),
- self.child_elements,
- element_store, environ))
-
-
- class LinkElement(InlineElement):
-
- """Superclass for AnchorLinks and ImageLinks. Parses internal, external,
- and interwiki links.
-
- """
-
- def __init__(self,tag, token, delimiter,
- interwiki_delimiter,base_urls,links_funcs,default_space_char,space_chars,
- base_url,space_char,class_func,path_func):
- super(LinkElement,self).__init__(tag,token)
- self.regexp = re.compile(self.re_string(),re.DOTALL)
- self.delimiter = delimiter
- self.interwiki_delimiter = interwiki_delimiter
- self.base_urls = base_urls
- self.links_funcs = links_funcs
- self.default_space_char = default_space_char
- self.space_chars = space_chars
- self.base_url = base_url
- self.space_char = space_char
- self.class_func = class_func
- self.path_func = path_func
- self.content_regexp = re.compile(self.content_re_string(),re.DOTALL)
- ## self.arg_regexp = re.compile(self.arg_re_string(),re.DOTALL)
- self.interwikilink_regexp = re.compile(self.interwikilink_re_string())
- self.urllink_regexp = re.compile(self.urllink_re_string(), re.DOTALL)
- self.wikilink_regexp = re.compile(self.wikilink_re_string())
-
- ## def arg_re_string(self):
- ## key = r'((?P<key>\w+)\s*\=)?'
- ## value = r'(?P<value>.*?)'
- ## return r'\s*' + key + r'\s*' + value + r'\s*(?P<delimiter>' + \
- ## re.escape(self.delimiter) + r'|$)(?P<tail>.*)'
-
- def content_re_string(self):
- return r'(?P<body>.*?)(' + re.escape(self.delimiter) + '(?P<arg_string>.*?))?$'
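- # e.g., with delimiter '|' (illustrative): 'Home Page|go home' yields
- # body='Home Page' and arg_string='go home'; without a delimiter,
- # arg_string is None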
-
- def interwikilink_re_string(self):
- all_wikis = set(self.links_funcs.keys() + self.base_urls.keys())
- wiki_id = '(?P<wiki_id>' + '|'.join(all_wikis) + ')'
- optional_spaces = ' *'
- page_name = r'(?P<page_name>\S+?( \S+?)*)' #allows any number of single spaces
- return '^' + optional_spaces + wiki_id + \
- re.escape(self.interwiki_delimiter) + ' *' + page_name + \
- optional_spaces + '$'#+ alias
-
- def urllink_re_string(self):
- protocol = r'^\s*((\w+?:|/)'
- rest_of_url = r'[\S\n]*?)\s*$'
- return protocol + rest_of_url #+ alias
-
- def wikilink_re_string(self):
- optional_spaces = ' *'
- page_name = r'(?P<page_name>\S+?( \S+?)*?)' #allows any number of single spaces
- return '^' + optional_spaces + page_name + optional_spaces + '$'#+ \
-
- ## def parse_args(self, arg_string):
- ## args = []
- ## delimiter = True
- ## while delimiter:
- ## mo = self.arg_regexp.match(arg_string)
- ## key, value, delimiter, tail = mo.group('key'),mo.group('value'),mo.group('delimiter'), mo.group('tail')
- ## if key:
- ## args.append((key, value))
- ## else:
- ## args.append(value)
- ## arg_string = tail
- ## positional_args = []
- ## kw_args = {}
- ## for arg in args:
- ## if isinstance(arg,tuple):
- ## k, v = arg
- ## k = str(k).lower()
- ## if k in keyword.kwlist:
- ## k = k + '_'
- ## if k in kw_args:
- ## if isinstance(v,list):
- ## try:
- ## kw_args[k].extend(v)
- ## except AttributeError:
- ## v.insert(0,kw_args[k])
- ## kw_args[k] = v
- ## elif isinstance(kw_args[k],list):
- ## kw_args[k].append(v)
- ## else:
- ## kw_args[k] = [kw_args[k], v]
- ## kw_args[k] = ImplicitList(kw_args[k])
- ## else:
- ## kw_args[k] = v
- ## if isinstance(kw_args[k],ImplicitList):
- ## kw_args[k] = ','.join(kw_args[k])
- ## else:
- ## positional_args.append(arg)
- ##
- ## return (positional_args, kw_args)
-
-
- def page_name(self,mo):
- if 'wiki_id' in mo.groupdict():
- space_char = self.space_chars.get(mo.group('wiki_id'),self.default_space_char)
- else:
- space_char = self.space_char
- return mo.group('page_name').replace(' ',space_char)
-
- def _build(self,mo,element_store, environ):
- content_mo = self.content_regexp.match(mo.group(1))
- body = content_mo.group('body')
- arg_string = content_mo.group('arg_string')
- the_class = None
- page_name = None
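- # try link forms in priority order: interwiki ('wiki:Page'), then raw
- # url, then plain wiki page name; anything else leaves url=None and
- # the original markup is returned unchanged below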
- if self.interwikilink_regexp.match(body):
- interwikilink_mo = self.interwikilink_regexp.match(body)
- link_type = 'interwiki'
- base_url = self.base_urls.get(interwikilink_mo.group('wiki_id'))
- link_func = self.links_funcs.get(interwikilink_mo.group('wiki_id'))
- page_name = self.page_name(interwikilink_mo)
- if link_func:
- url = link_func(page_name)
- else:
- url = urllib.quote(page_name.encode('utf-8'))
- if base_url:
- url = urlparse.urljoin(base_url, url)
- elif self.urllink_regexp.match(body):
- urllink_mo = self.urllink_regexp.match(body)
- link_type = 'url'
- if sanitizer.is_safe_uri(urllink_mo.group(1)):
- url = urllink_mo.group(1)
- else:
- url = None
- elif self.wikilink_regexp.match(body):
- wikilink_mo = self.wikilink_regexp.match(body)
- link_type = 'wiki'
- page_name = self.page_name(wikilink_mo)
- if self.path_func:
- the_path = self.path_func(self.tag, page_name, environ)
- else:
- the_path = urllib.quote(page_name.encode('utf-8'))
- url = urlparse.urljoin(self.base_url, the_path)
- else:
- url = None
-
- if not url:
- return mo.group(0)
- else:
- if arg_string is not None:
- args, kw_args = [arg_string.strip()], {} #self.parse_args(arg_string)
- else:
- args, kw_args = [], {}
-
- try:
- if self.class_func:
- the_class = self.class_func(link_type, url, body, page_name)
- return self.emit(element_store, environ,link_type,body,url,the_class, *args, **kw_args)
- except TypeError:
- return mo.group(0)
-
-
-
- class AnchorElement(LinkElement):
-
- """Finds and builds internal, external, and interwiki links.
-
- >>> link = AnchorElement('a',('[[',']]'),'|',
- ... interwiki_delimiter=':',
- ... base_urls=dict(somewiki='http://somewiki.org/',
- ... bigwiki='http://bigwiki.net/'),
- ... links_funcs={},default_space_char='-',
- ... space_chars={'bigwiki':' '},base_url='http://somewiki.org/',
- ... space_char='_',class_func=None,path_func=None)
-
- >>> mo = link.regexp.search("[[http://www.google.com| here]]")
- >>> link._build(mo,{},None).generate().render()
- '<a href="http://www.google.com">here</a>'
-
- >>> mo = link.regexp.search(" [[somewiki:Home Page|steve]] ")
- >>> link._build(mo,{},None).generate().render()
- '<a href="http://somewiki.org/Home-Page">steve</a>'
-
- >>> mo = link.regexp.search(" [[bigwiki:Home Page]] ")
- >>> link._build(mo,{},None).generate().render()
- '<a href="http://bigwiki.net/Home%20Page">bigwiki:Home Page</a>'
-
- >>> mo = link.regexp.search(" [[Home Page |Home]]")
- >>> link._build(mo,{},None).generate().render()
- '<a href="http://somewiki.org/Home_Page">Home</a>'
-
- """
-
-
- def __init__(self, *args, **kw_args):
- super(AnchorElement,self).__init__(*args, **kw_args)
-
- def emit(self,element_store, environ,link_type,body,url,the_class, alias=None):
- if alias:
- alias = fragmentize(alias,self.child_elements,element_store, environ)
- else:
- alias = body.strip()
- return bldr.tag.__getattr__(self.tag)(alias,
- href=url,
- class_=the_class)
-
-
- class ImageElement(LinkElement):
-
- def __init__(self, *args, **kw_args):
- super(ImageElement,self).__init__(*args, **kw_args)
-
- def emit(self,element_store, environ,link_type,body,url,the_class, alt=None):
- if alt is None:
- if link_type == 'url':
- alt = urlparse.urlsplit(url).path.split('/')[-1]
- else:
- alt = body.strip()
- return bldr.tag.__getattr__(self.tag)(src=url ,alt=alt, title=alt,
- #class_=the_class
- )
-
-
- class Link(InlineElement):
-
- """Finds and builds links."""
-
- def __init__(self,tag, token):
- super(Link,self).__init__(tag,token)
- self.regexp = re.compile(self.re_string(),re.DOTALL)
-
- def _build(self,mo,element_store, environ):
-
- for tag in self.child_elements:
- m = tag.regexp.search(mo.group(1))
- if m:
- link = tag._build(m,element_store, environ)
- if link:
- break
- else:
- link = None
-
- if link:
- return bldr.tag(link)
- else:
- return mo.group(0)
-
-
- class Macro(WikiElement):
- r"""Finds and processes inline macro elements."""
-
- def __init__(self, tag, token, func):
- super(Macro,self).__init__(tag,token , [])
- self.func = func
- self.regexp = re.compile(self.re_string())
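-
- # Illustrative match (added; dialects normally pass a macro lookup
- # function as ``func``):
- # >>> m = Macro('', ('<<','>>'), func=None)
- # >>> mo = m.regexp.search('x <<hello world>> y')
- # >>> mo.group('name'), mo.group(4)
- # ('hello', ' world')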
-
-
- def _process(self, mos, text, wiki_elements,element_store, environ):
- """Returns genshi Fragments (Elements and text)"""
- assert len(mos) == 1
- mo = mos[0]
- processed = self._build(mo,element_store, environ)
- if isinstance(processed, list):
- tail = processed[1]
- processed = processed[0]
- else:
- tail = ''
- if isinstance(processed, basestring) and not isinstance(processed,Markup):
- text = ''.join([text[:mo.start()],processed,tail,
- text[mo.end():]])
- else:
- store_id = str(id(processed))
- element_store[store_id] = processed
- text = ''.join([text[:mo.start()],'<<<',store_id,'>>>',tail,
- text[mo.end():]])
- frags = fragmentize(text,wiki_elements,element_store, environ)
- return frags
-
-
- def re_string(self):
- content = '(.*?)'
- return esc_neg_look + re.escape(self.token[0]) + r'(' + MACRO_NAME + \
- content + ')' + esc_neg_look + re.escape(self.token[1])
-
- trailing_slash = re.compile(r'(?<=[ "\'\]])/$')
- def _build(self,mo,element_store, environ):
- arg_string = re.sub(self.trailing_slash,'',mo.group(4))
- if self.func:
- value = self.func(mo.group('name'),arg_string,None,False,environ)
- else:
- value = None
- if value is None:
- return bldr.tag.code(self.token[0],bldr.tag.span(mo.group('name'),class_="macro_name"),
- bldr.tag.span(arg_string,class_="macro_arg_string"),
- self.token[1],class_="unknown_macro")
- elif isinstance(value, (basestring,bldr.Fragment,bldr.Element, Stream)):
- return value
- else:
- raise Exception("macros can only return strings and genshi objects")
-
-
- class BodiedMacro(Macro):
- """Finds and processes macros with bodies.
-
- Does not span across top-level block markup
- (see BodiedBlockMacro for that)."""
-
- def __init__(self, tag, token, func):
- super(BodiedMacro,self).__init__(tag,token , func)
- self.func = func
- self.regexp = re.compile(self.re_string(),re.DOTALL)
-
- def re_string(self):
- content = r'(?P<arg_string>[ \S]*?)'
- body = '(?P<body>.+)'
- return esc_neg_look + re.escape(self.token[0]) + MACRO_NAME + \
- content + '(?<!/)' + re.escape(self.token[1]) + \
- body + esc_neg_look + re.escape(self.token[0]) + \
- r'/(?P=name)' + '(?<!/)' + re.escape(self.token[1])
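-
- # e.g. (illustrative, assuming tokens ('<<','>>')): '<<box>>text<</box>>'
- # yields name='box', arg_string='', and body='text'; nested same-name
- # macros are rebalanced in _build below using a token count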
-
- def _build(self,mo,element_store, environ):
- start = ''.join([esc_neg_look, re.escape(self.token[0]), re.escape(mo.group('name')),
- r'(?P<arg_string>[ \S]*?)', re.escape(self.token[1])])
- end = ''.join([esc_neg_look, re.escape(self.token[0]), '/', re.escape(mo.group('name')),
- re.escape(self.token[1])])
- count = 0
- for mo2 in re.finditer(start + '|' + end, mo.group('body')):
- if re.match(end,mo2.group(0)):
- count = count + 1
- else:
- count = count - 1
- if count > 0:
- body = mo.group('body')[:mo2.start()]
- tail = ''.join([mo.group('body')[mo2.end():], self.token[0],
- '/', mo.group('name'), self.token[1]])
- break
- else:
- body = mo.group('body')
- tail = ''
-
- if self.func:
- value = self.func(mo.group('name'),mo.group('arg_string'),body,False,environ)
- else:
- value = None
- if value is None:
- content_out = [self.token[0],bldr.tag.span(mo.group('name'),class_="macro_name"),
- bldr.tag.span(mo.group('arg_string'),class_="macro_arg_string"),
- self.token[1],bldr.tag.span(mo.group('body'),class_="macro_body"),
- self.token[0] + '/' + mo.group('name') + self.token[1]]
- return [bldr.tag.code(content_out,class_="unknown_macro", style="white-space:pre-wrap"),tail]
-
- elif isinstance(value, (basestring,bldr.Fragment, Stream)):
- return [value,tail]
- else:
- raise Exception("macros can only return strings and genshi objects")
-
-
-
- class BodiedBlockMacro(WikiElement):
- """Finds and processes block macros with bodies.
-
- The opening and closing tokens must each be on a line alone, without
- leading spaces. These macros can enclose other block-level markup,
- including pre blocks and other BodiedBlockMacros."""
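-
- # Illustrative wiki input (assuming tokens ('<<','>>')):
- #
- # <<code python>>
- # print 'hi'
- # <</code>>
- #
- # matches with name='code', arg_string=' python', and body="print 'hi'\n"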
-
-
- def __init__(self, tag, token, func):
- # WikiElement.__init__ expects child_elements here, not a macro function
- super(BodiedBlockMacro,self).__init__(tag, token, [])
- self.func = func
- self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
-
- def re_string(self):
- arg_string = r'(?P<arg_string>(?![^\n]*>>[^\n]*>>)[ \S]*?)'
- start = '^' + re.escape(self.token[0])
- body = r'(?P<body>.*\n)'
- end = re.escape(self.token[0]) + \
- r'/(?P=name)' + '(?<!/)' + re.escape(self.token[1]) + r'\s*?$'
-
- return start + '(' + MACRO_NAME + arg_string + ')' + '(?<!/)' + \
- re.escape(self.token[1]) + r'\s*?\n' + body + end
-
- def _process(self, mos, text, wiki_elements,element_store, environ):
- """Returns genshi Fragments (Elements and text)
-
- This is mainly for block level markup. See InlineElement
- for the other method.
- """
- assert len(mos) == 1
- mo = mos[0]
- processed = self._build(mo,element_store, environ)
- if isinstance(processed, list):
- tail = processed[1]
- processed = processed[0]
- else:
- tail = ''
- if isinstance(processed, basestring) and not isinstance(processed,Markup):
- #print '_process', repr(processed)
- text = ''.join([text[:mo.start()],processed,tail,
- text[mo.end():]])
- frags = fragmentize(text,wiki_elements,element_store, environ)
- else:
-
- frags = []
- # call again for leading text and extend the result list
- if mo.start():
- frags.extend(fragmentize(text[:mo.start()],wiki_elements[1:],
- element_store, environ))
- # append the found wiki element to the result list
- frags.append(processed)
- # make the source output easier to read
- if self.append_newline:
- frags.append('\n')
- # call again for trailing text and extend the result list
- if tail or mo.end() < len(text):
- frags.extend(fragmentize(tail + text[mo.end():],wiki_elements,
- element_store, environ))
- return frags
-
- def _build(self,mo,element_store, environ):
- start = ''.join(['^', re.escape(self.token[0]), re.escape(mo.group('name')),
- r'(?P<arg_string>(?![^\n]*>>[^\n]*>>)[ \S]*?)', re.escape(self.token[1]),r'\s*?\n'])
- end = ''.join(['^', re.escape(self.token[0]), '/', re.escape(mo.group('name')),
- re.escape(self.token[1]),r'\s*?$'])
- count = 0
- for mo2 in re.finditer(start + '|' + end, mo.group('body'),re.MULTILINE):
- if re.match(end,mo2.group(0)):
- count = count + 1
- else:
- count = count - 1
- if count > 0:
- body = mo.group('body')[:mo2.start()]
- tail = ''.join([mo.group('body')[mo2.end():], self.token[0],
- '/', mo.group('name'), self.token[1],'\n'])
- break
- else:
- body = mo.group('body')
- tail = ''
-
- if self.func:
- value = self.func(mo.group('name'),mo.group('arg_string'),body,True,environ)
- else:
- value = None
- if value is None:
- return [bldr.tag.pre(self.token[0],bldr.tag.span(mo.group('name'),class_="macro_name"),
- bldr.tag.span(mo.group('arg_string'),class_="macro_arg_string"),
- self.token[1],'\n',bldr.tag.span(mo.group('body'),class_="macro_body"),
- self.token[0] + '/' + mo.group('name') + self.token[1],
- class_="unknown_macro"), tail]
- elif (isinstance(value, (Stream, basestring)) or
- (isinstance(value,bldr.Element) and value.tag in BLOCK_TAGS)):
- return [value, tail]
- # Add a p tag if the value is a Fragment or Element that needs one
- elif isinstance(value, bldr.Fragment):
- return [bldr.tag.p(value), tail]
- else:
- raise Exception("macros can only return strings and genshi objects")
-
-
- class RawLink(InlineElement):
-
- """Used to find raw urls in wiki text and build xml from them.
-
- >>> raw_link = RawLink(tag='a')
- >>> mo = raw_link.regexp.search(" a http://www.google.com url ")
- >>> raw_link.href(mo)
- 'http://www.google.com'
- >>> raw_link._build(mo,{},None).generate().render()
- '<a href="http://www.google.com">http://www.google.com</a>'
-
- """
- linking_protocols = ['http','https']
-
- def __init__(self, tag):
- super(RawLink,self).__init__(tag=tag, token=None)
- self.regexp = re.compile(self.re_string())
-
- def re_string(self):
- escape = '(' + re.escape(escape_char) + ')?'
- #protocol = '((https?|ftp)://'
- protocol = '((https?)://'
- rest_of_url = r'\S+?)'
- #allow one punctuation character or '**' or '//'. Don't include a placeholder.
- look_ahead = r'(?=([>)}\]]?[,.?!:;"\']?(([^a-zA-Z0-9])\6)?(\s|$))|<<<)'
- return escape + protocol + rest_of_url + look_ahead
-
- def _build(self,mo,element_store, environ):
- if (not mo.group(1)) and (mo.group(3) in self.linking_protocols):
- return bldr.tag.__getattr__(self.tag)(self.alias(mo,element_store),
- href=self.href(mo))
- else:
- return self.href(mo)
-
- def href(self,mo):
- """Returns the string for the href attribute of the Element."""
- if sanitizer.is_safe_uri(mo.group(2)):
- return mo.group(2)
- else:
- return "unsafe_uri_detected"
-
- def alias(self,mo,element_store):
- """Returns the string for the content of the Element."""
- return self.href(mo)
-
-
- class URLLink(WikiElement):
-
- """Used to find url type links inside a link.
-
- The scope of these is within link markup only (i.e., [[url]]).
-
- >>> url_link = URLLink('a','|')
- >>> mo = url_link.regexp.search(" http://www.google.com| here ")
- >>> url_link.href(mo)
- 'http://www.google.com'
- >>> url_link._build(mo,{},None).generate().render()
- '<a href="http://www.google.com">here</a>'
-
- """
-
- def __init__(self, tag,delimiter):
- super(URLLink,self).__init__(tag, '')
- self.delimiter = delimiter
- self.regexp = re.compile(self.re_string(),re.DOTALL)
-
- def re_string(self):
- protocol = r'^\s*((\w+?:|/)'
- rest_of_url = r'[\S\n]*?)\s*'
- alias = r'(' + re.escape(self.delimiter) + r' *(.*?))? *$'
- return protocol + rest_of_url + alias
-
- def _build(self,mo,element_store, environ):
- if not self.href(mo):
- return None
- return bldr.tag.__getattr__(self.tag)(self.alias(mo,element_store, environ),
- href=self.href(mo))
-
- def href(self,mo):
- """Returns the string for the href attribute of the Element."""
- if sanitizer.is_safe_uri(mo.group(1)):
- return mo.group(1)
- else:
- return None
-
-
- def alias(self,mo,element_store, environ):
- """Returns the string for the content of the Element."""
- if not mo.group(4):
- return self.href(mo)
- else:
- return fragmentize(mo.group(4),self.child_elements,element_store, environ)
-
-
-
- class InterWikiLink(WikiElement):
-
- """Used to match interwiki links inside a link.
-
- The search scope for these is only inside links.
-
- >>> interwiki_link = InterWikiLink('a',
- ... delimiter1=':', delimiter2 = '|',
- ... base_urls=dict(somewiki='http://somewiki.org/',
- ... bigwiki='http://bigwiki.net/'),
- ... links_funcs={},default_space_char='_',
- ... space_chars={})
- >>> mo = interwiki_link.regexp.search(" somewiki:Home Page|steve ")
- >>> interwiki_link.href(mo)
- 'http://somewiki.org/Home_Page'
- >>> interwiki_link.alias(mo,{},None)
- ['steve']
-
- """
-
- def __init__(self, tag, delimiter1,
- delimiter2,base_urls,links_funcs,default_space_char,space_chars):
- super(InterWikiLink,self).__init__(tag, '')
- self.delimiter1 = delimiter1
- self.delimiter2 = delimiter2
- #self.regexp = re.compile(self.re_string())
- self.base_urls = base_urls
- self.links_funcs = links_funcs
- self.default_space_char = default_space_char
- self.space_chars = space_chars
- self.regexp = re.compile(self.re_string())
-
- def re_string(self):
- #all_wikis = set(self.links_funcs.keys() + self.base_urls.keys())
- #wiki_id = '(' + '|'.join(all_wikis) + ')'
-
- wiki_id = r'(\w+)'
- optional_spaces = ' *'
- page_name = r'(\S+?( \S+?)*)' #allows any number of single spaces
- alias = r'(' + re.escape(self.delimiter2) + r' *(.*?))? *$'
- return '^' + optional_spaces + wiki_id + optional_spaces + \
- re.escape(self.delimiter1) + optional_spaces + page_name + \
- optional_spaces + alias
-
- def page_name(self,mo):
- space_char = self.space_chars.get(mo.group(1),self.default_space_char)
- return mo.group(2).replace(' ',space_char)
-
- def href(self,mo):
- linktype = mo.group(1)
- base_url = self.base_urls.get(linktype)
- link_func = self.links_funcs.get(linktype)
- if not (link_func or base_url):
- return None
- else:
- href = self.page_name(mo)
- if link_func:
- href = link_func(href)
- else:
- href = urllib.quote(href.encode('utf-8'))
- if base_url:
- href = urlparse.urljoin(base_url, href)
- return href
-
- def _build(self,mo,element_store, environ):
- if not self.href(mo):
- return '[[' + mo.group(0) + ']]'
- return bldr.tag.__getattr__(self.tag)(self.alias(mo,element_store, environ),
- href=self.href(mo))
- def alias(self,mo,element_store, environ):
- """Returns the string for the content of the Element."""
- if not mo.group(5):
- return ''.join([mo.group(1),self.delimiter1,mo.group(2)])
- else:
- return fragmentize(mo.group(5),self.child_elements,element_store, environ)
-
-
-
- class WikiLink(WikiElement):
-
- """Used to match wiki links inside a link.
-
- The search scope for these is only inside links.
-
- >>> wiki_link = WikiLink('a','|',base_url='http://somewiki.org/',
- ... space_char='_',class_func=None, path_func=None)
- >>> mo = wiki_link.regexp.search(" Home Page |Home")
- >>> wiki_link.href(mo)
- 'http://somewiki.org/Home_Page'
- >>> wiki_link.alias(mo,{},None)
- ['Home']
-
- """
-
- def __init__(self, tag, delimiter,
- base_url,space_char,class_func,path_func):
- super(WikiLink,self).__init__(tag, '')
- self.delimiter = delimiter
- self.base_url = base_url
- self.space_char = space_char
- self.class_func = class_func
- self.path_func = path_func
- self.regexp = re.compile(self.re_string())
-
- def re_string(self):
- optional_spaces = ' *'
- page_name = r'(\S+?( \S+?)*?)' #allows any number of single spaces
- alias = r'(' + re.escape(self.delimiter) + r' *(.*?))? *$'
- return '^' + optional_spaces + page_name + optional_spaces + \
- alias
-
- def page_name(self,mo):
- return mo.group(1).replace(' ',self.space_char)
-
- def href(self,mo,environ):
- if self.path_func:
- the_path = self.path_func(self.tag, self.page_name(mo), environ)
- else:
- the_path = urllib.quote(self.page_name(mo).encode('utf-8'))
- return urlparse.urljoin(self.base_url, the_path)
-
- def _build(self,mo,element_store, environ):
- if self.class_func:
- the_class = self.class_func(self.page_name(mo))
- else:
- the_class = None
- return bldr.tag.__getattr__(self.tag)(self.alias(mo,element_store, environ),
- href=self.href(mo, environ),
- class_=the_class)
-
- def alias(self,mo,element_store, environ):
- """Returns the string for the content of the Element."""
- if not mo.group(3):
- return mo.group(1)
- else:
- return fragmentize(mo.group(4),self.child_elements,element_store, environ)
-
-
- class List(BlockElement):
-
- """Finds list (ordered, unordered, and definition) wiki elements.
-
- group(1) of the match object includes all lines from the list
- including newline characters.
-
- """
-
- def __init__(self, tag, token,stop_tokens=None):
- self.stop_tokens = stop_tokens
- super(List,self).__init__(tag, token)
- #self.stop_tokens = stop_tokens
- self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
-
- def re_string(self):
- """This re_string is for finding generic block elements like
- lists (ordered, unordered, and definition) that start with a
- single token.
- """
- leading_whitespace = r'^([ \t]*'
- only_one_token = re.escape(self.token)+ '(?!' + re.escape(self.token) + ')'
- rest_of_list = r'.*?(?:\n|\Z))'
- only_one_stop_token = '([' + re.escape(self.stop_tokens) + r'])(?!\3)'
- look_ahead = '(?=([ \t]*' + only_one_stop_token + '|$))'
- return leading_whitespace + only_one_token + rest_of_list + \
- look_ahead
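-
- # Illustrative match (added; the stop_tokens value is hypothetical):
- # >>> ul = List('ul','*',stop_tokens='#')
- # >>> ul.regexp.search('*one\n*two\n\nafter').group(1)
- # '*one\n*two\n'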
-
-
-
- class ListItem(BlockElement):
- r"""Matches the current list item.
-
- Everything up to the next same-level list item is matched.
-
- >>> list_item = ListItem('li','#*')
- >>> mo = list_item.regexp.search("*one\n**one.1\n**one.2\n*two\n")
- >>> mo.group(3)
- 'one\n**one.1\n**one.2\n'
- >>> mo.group(0)
- '*one\n**one.1\n**one.2\n'
-
- """
-
- append_newline = False
-
- def __init__(self, tag, list_tokens):
- """Constructor for list items.
-
- :parameters"
- list_tokens
- A string that includes the tokens used for lists
- """
- self.list_tokens = list_tokens
- super(ListItem,self).__init__(tag, None)
- self.regexp = re.compile(self.re_string(),re.DOTALL)
-
- def re_string(self):
- whitespace = r'[ \t]*'
- item_start = '(([' + self.list_tokens + r'])\2*)'
- rest_of_item = r'(.*?(?:\n|\Z))'
- start_of_same_level_item = r'\1(?!\2)'
- look_ahead = r'(?=(' + whitespace + start_of_same_level_item + '|$))'
- return whitespace + item_start + whitespace + \
- rest_of_item + look_ahead
-
- def _build(self,mo,element_store, environ):
- return bldr.tag.__getattr__(self.tag)(fragmentize(mo.group(3),
- self.child_elements,
- element_store, environ))
-
-
- class NestedList(WikiElement):
-
- r"""Finds a list in the current list item.
-
- >>> nested_ul = NestedList('ul','*')
- >>> mo = nested_ul.regexp.search('one\n**one.1\n**one.2\n')
- >>> mo.group(1)
- '**one.1\n**one.2\n'
- >>> mo.group(0) == mo.group(1)
- True
-
- """
-
- def __init__(self, tag, token):
- super(NestedList,self).__init__(tag, token)
- self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
-
- def re_string(self):
- look_behind = r'(?<=\n)' # have to avoid finding a list on the first line
- whitespace = r'(\s*'
- rest_of_list = '.*$)'
- return look_behind + '^' + whitespace + re.escape(self.token) + \
- rest_of_list
-
-
- class DefinitionTerm(BlockElement):
-
- r"""Processes definition terms.
-
- >>> term = DefinitionTerm('dt',';',stop_token=':')
- >>> mo1,mo2 = term.regexp.finditer(";term1\n:def1\n;term2:def2\n")
- >>> mo1.group(1), mo2.group(1)
- ('term1', 'term2')
- >>> mo1.group(0), mo2.group(0)
- (';term1\n', ';term2')
-
- group(1) of the match object is the term: the rest of the line, or
- the text up to the first ':'
-
- """
-
- def __init__(self, tag, token,stop_token):
- super(DefinitionTerm,self).__init__(tag, token)
- self.stop_token = stop_token
- self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
-
- def re_string(self):
- look_ahead = r'(\n|(?=(' + esc_neg_look + re.escape(self.stop_token) + r'|$)))'
- return r'^[ \t]*' + re.escape(self.token) + r'[ \t]*(.*?' + \
- re.escape(self.stop_token) + '?)\s*' + look_ahead
-
-
- class DefinitionDef(BlockElement):
-
- r"""Processes definitions.
-
- >>> definition = DefinitionDef('dd',':')
- >>> mo1,mo2 = definition.regexp.finditer(":def1a\ndef1b\n:def2\n")
- >>> mo1.group(1), mo2.group(1)
- ('def1a\ndef1b', 'def2')
- >>> mo1.group(0), mo2.group(0)
- (':def1a\ndef1b\n', ':def2\n')
-
- group(1) of the match object includes all lines from the definition
- up to the next definition.
-
- """
-
- def __init__(self, tag, token):
- super(DefinitionDef,self).__init__(tag, token)
- self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
-
- def re_string(self):
- look_ahead = r'(?=(^[ \t]*' + re.escape(self.token) + r')|\Z)'
- return r'^[ \t]*' + re.escape(self.token) + r'?[ \t]*(.+?)\s*' + look_ahead
-
- class Paragraph(BlockElement):
- """"This should be the last outer level wiki element to be searched.
-
- Anything that is left over will be placed in a paragraphs unless it looks
- like block content according to xhtml1 strict. Block content is defined
- here as valid children of the <body> element (see BLOCK_TAGS). Only genshi
- Element objects will be evaluated (see BLOCK_TAGS). Fragments and stings
- are treated as inline while Streams are block content.
-
- """
-
- def __init__(self, tag):
- super(Paragraph,self).__init__(tag,None)
- self.regexp = re.compile(self.re_string(),re.DOTALL)#+re.MULTILINE)
-
- def re_string(self):
- return r'^(.*?)\n?$'
-
- def _build(self,mo,element_store, environ):
- content = fragmentize(mo.group(1), self.child_elements, element_store, environ)
- # Check each list item and record those that are block only
- block_only_frags = []
- for i,element in enumerate(content):
- if ((isinstance(element, bldr.Element) and
- element.tag in BLOCK_ONLY_TAGS) or
- isinstance(element,(Stream,Markup))):
- block_only_frags.append(i)
- # Build a new result list if needed
- if block_only_frags:
- new_content = []
- last_i = -1
- for i in block_only_frags:
- if content[last_i+1:i]:
- if not (len(content[last_i+1:i])==1 and
- content[last_i+1] == '\n'):
- new_content.append(bldr.tag.__getattr__(self.tag)(content[last_i+1:i]))
- else:
- new_content.append('\n')
- new_content.append(content[i])
- last_i = i
- if content[last_i+1:]:
- new_content.append(bldr.tag.__getattr__(self.tag)(content[last_i+1:]))
- return bldr.tag(new_content)
- else:
- return bldr.tag.__getattr__(self.tag)(content)
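-
- # Sketch of the splitting above (illustrative): if content is
- # ['intro ', table, ' outro'] where table is a block-only Element,
- # the result is tag(p('intro '), table, p(' outro')) rather than one
- # enclosing <p> around everything.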
-
-
- class Heading(BlockElement):
-
- r"""Finds heading wiki elements.
-
- >>> h1 = Heading(['h1','h2'],'=')
- >>> mo = h1.regexp.search('before\n = An important thing = \n after')
- >>> mo.group(2)
- 'An important thing'
- >>> mo.group(0)
- ' = An important thing = \n'
-
- """
-
- def __init__(self, tag, token):
- super(Heading,self).__init__('',token)
- self.tags = tag
- self.regexp = re.compile(self.re_string(),re.MULTILINE)
-
- def re_string(self):
- whitespace = r'[ \t]*'
- tokens = '(' + re.escape(self.token) + '{1,' + str(len(self.tags)) +'})'
- content = '(.*?)'
- trailing_markup = '(' + re.escape(self.token) + r'+[ \t]*)?(\n|\Z)'
- return '^' + whitespace + tokens + \
- whitespace + content + whitespace + trailing_markup
-
- def _build(self,mo,element_store, environ):
- heading_tag = self.tags[len(mo.group(1))-1]
- return bldr.tag.__getattr__(heading_tag)(fragmentize(mo.group(2),
- self.child_elements,
- element_store, environ))
-
-
- class Table(BlockElement):
-
- r"""Find tables.
-
- >>> table = Table('table','|')
- >>> mo = table.regexp.search("before\n | one | two |\n|one|two \n hi")
- >>> mo.group(1)
- ' | one | two |\n|one|two \n'
- >>> mo.group(0) == mo.group(1)
- True
-
- """
-
- def __init__(self, tag, token):
- super(Table,self).__init__(tag,token)
- self.regexp = re.compile(self.re_string(),re.MULTILINE)
-
- def re_string(self):
- whitespace = r'[ \t]*'
- rest_of_line = r'.*?(\n|\Z)'
- return '^((' + whitespace + re.escape(self.token) + \
- rest_of_line + ')+)'
-
-
-
- class TableRow(BlockElement):
-
- r"""Finds rows in a table.
-
- >>> row = TableRow('tr','|')
- >>> mo = row.regexp.search(' | one | two |\n|one|two \n')
- >>> mo.group(1)
- '| one | two '
- >>> mo.group(0)
- ' | one | two |\n'
-
- """
-
- def __init__(self, tag, token):
- super(TableRow,self).__init__(tag,token)
- self.regexp = re.compile(self.re_string(),re.MULTILINE)
-
- def re_string(self):
- whitespace = r'[ \t]*'
- content = '(' + re.escape(self.token) + '.*?)'
- trailing_token = re.escape(self.token) + '?'
- return '^' + whitespace + content + trailing_token + \
- whitespace + r'(\n|\Z)'
-
-
- class TableCell(WikiElement):
-
- r"""Finds cells in a table row.
-
- >>> cell = TableCell('td','|')
- >>> mo = cell.regexp.search('| one | two ')
- >>> mo.group(1)
- 'one'
- >>> mo.group(0)
- '| one '
-
- """
-
- def __init__(self, tag, token):
- super(TableCell,self).__init__(tag,token )
- self.regexp = re.compile(self.re_string())
-
- def re_string(self):
- whitespace = r'[ \t]*'
- content = '(.*?)'
- look_ahead = '((?=' + esc_neg_look + re.escape(self.token[0]) + ')|$)'
- return esc_neg_look + re.escape(self.token) + whitespace + \
- content + whitespace + look_ahead
-
-
-
- ##class Link(InlineElement):
- ##
- ## """Finds and builds links."""
- ##
- ## def __init__(self,tag, token):
- ## super(Link,self).__init__(tag,token)
- ##
- ## def _build(self,mo,element_store, environ):
- ##
- ## for tag in self.child_elements:
- ## m = tag.regexp.search(mo.group(1))
- ## if m:
- ## link = tag._build(m,element_store, environ)
- ## if link:
- ## break
- ## else:
- ## link = None
- ##
- ## if link:
- ## return bldr.tag(link)
- ## else:
- ## return mo.group(0)
-
- class Image(InlineElement):
-
- """Processes image elements.
-
- >>> img = Image('img',('{{','}}'), delimiter='|')
- >>> mo = img.regexp.search('{{ picture.jpg | An image of a house }}')
- >>> img._build(mo,{},None).generate().render()
- '<img src="picture.jpg" alt="An image of a house" title="An image of a house"/>'
-
- """
-
- def __init__(self, tag, token, delimiter):
- super(Image,self).__init__(tag,token )
- self.regexp = re.compile(self.re_string())
- self.delimiter = delimiter
- self.src_regexp = re.compile(r'^\s*(\S+)\s*$')
-
- def _build(self,mo,element_store, environ):
- body = mo.group(1).split(self.delimiter,1)
- src_mo = self.src_regexp.search(body[0])
- if not src_mo:
- return bldr.tag.span('Bad Image src')
- if sanitizer.is_safe_uri(src_mo.group(1)):
- link = src_mo.group(1)
- else:
- link = "unsafe_uri_detected"
- if len(body) == 1:
- alias = link
- else:
- alias = body[1].strip()
- return bldr.tag.__getattr__(self.tag)(src=link ,alt=alias, title=alias)
-
-
- class NoWikiElement(InlineElement):
-
- """Inline no-wiki.
-
- When two or more end tokens are found together, only the last one
- marks the end of the element.
-
- """
-
- def __init__(self, tag, token):
- super(NoWikiElement,self).__init__(tag,token )
- self.regexp = re.compile(self.re_string(),re.DOTALL)
-
- def _build(self,mo,element_store, environ):
- if self.tag:
- return bldr.tag.__getattr__(self.tag)(
- fragmentize(mo.group(1), self.child_elements,
- element_store,environ,
- remove_escapes=False))
- else:
- return bldr.tag(fragmentize(mo.group(1),self.child_elements,
- element_store, environ,
- remove_escapes=False))
-
- def re_string(self):
- if isinstance(self.token,str):
- content = '(.+?' + re.escape(self.token[-1]) + '*)'
- return esc_neg_look + re.escape(self.token) + \
- content + re.escape(self.token)
- else:
- content = '(.+?' + re.escape(self.token[1][-1]) + '*)'
- return esc_neg_look + re.escape(self.token[0]) + \
- content + re.escape(self.token[1])
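-
- # Illustrative (assuming tokens ('{{{','}}}')): in '{{{x}}}}' the
- # trailing '}*' lets the content group absorb the extra brace, so
- # group(1) is 'x}' and only the last three braces close the element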
-
-
- class PreBlock(BlockElement):
- """A preformatted block.
-
- If a closing token is found on a line with a space as the first
- character, the space will be removed from the output.
-
- """
- …