notation3.py - Python standard libraries

/SmartObjectFramework/src/rdflib/plugins/parsers/notation3.py

https://github.com/mjkoster/SmartObject · Python · 2427 lines · 1845 code · 313 blank · 269 comment · 339 complexity · 825ecb377cb8c00574f56dbdcc5a06e0 MD5 · raw file
Large files are truncated click here to view the full file

#!/usr/bin/env python
u"""
notation3.py - Standalone Notation3 Parser
Derived from CWM, the Closed World Machine

Authors of the original suite:

* Dan Connolly <@@>
* Tim Berners-Lee <@@>
* Yosi Scharf <@@>
* Joseph M. Reagle Jr. <reagle@w3.org>
* Rich Salz <rsalz@zolera.com>

http://www.w3.org/2000/10/swap/notation3.py

Copyright 2000-2007, World Wide Web Consortium.
Copyright 2001, MIT.
Copyright 2001, Zolera Systems Inc.

License: W3C Software License
http://www.w3.org/Consortium/Legal/copyright-software

Modified by Sean B. Palmer
Copyright 2007, Sean B. Palmer. \u32E1

Modified to work with rdflib by Gunnar Aastrand Grimnes
Copyright 2010, Gunnar A. Grimnes

"""

# Python standard libraries
import types
import sys
import os
import re
import StringIO
import codecs

from binascii import a2b_hex
from decimal import Decimal

from rdflib.term import URIRef, BNode, Literal, Variable, _XSD_PFX, _unique_id
from rdflib.graph import QuotedGraph, ConjunctiveGraph
from rdflib import py3compat
b = py3compat.b

__all__ = [
    'URISyntaxError', 'BadSyntax', 'N3Parser', "verbosity", "setVerbosity",
    "progress", "splitFrag", "splitFragP", "join", "refTo", "base",
    "canonical", "runNamespace", "uniqueURI", "Canonicalize", "stripCR",
    "dummyWrite", "toBool", "stringToN3", "backslashUify", "hexify"]

from rdflib.parser import Parser


# Incestuous.. would be nice to separate N3 and XML
# from sax2rdf import XMLtoDOM
def XMLtoDOM(*args, **kargs):
     # print >> sys.stderr, args, kargs
    pass


# SWAP http://www.w3.org/2000/10/swap
# from diag import verbosity, setVerbosity, progress
def verbosity(*args, **kargs):
     # print >> sys.stderr, args, kargs
    pass


def setVerbosity(*args, **kargs):
     # print >> sys.stderr, args, kargs
    pass


def progress(*args, **kargs):
     # print >> sys.stderr, args, kargs
    pass


def splitFrag(uriref):
    """split a URI reference between the fragment and the rest.

    Punctuation is thrown away.

    e.g.

    >>> splitFrag("abc#def")
    ('abc', 'def')

    >>> splitFrag("abcdef")
    ('abcdef', None)

    """

    i = uriref.rfind("#")
    if i >= 0:
        return uriref[:i], uriref[i + 1:]
    else:
        return uriref, None


def splitFragP(uriref, punct=0):
    """split a URI reference before the fragment

    Punctuation is kept.

    e.g.

    >>> splitFragP("abc#def")
    ('abc', '#def')

    >>> splitFragP("abcdef")
    ('abcdef', '')

    """

    i = uriref.rfind("#")
    if i >= 0:
        return uriref[:i], uriref[i:]
    else:
        return uriref, ''


@py3compat.format_doctest_out
def join(here, there):
    """join an absolute URI and URI reference
    (non-ascii characters are supported/doctested;
    haven't checked the details of the IRI spec though)

    here is assumed to be absolute.
    there is URI reference.

    >>> join('http://example/x/y/z', '../abc')
    'http://example/x/abc'

    Raise ValueError if there uses relative path
    syntax but here has no hierarchical path.

    >>> join('mid:foo@example', '../foo')
    Traceback (most recent call last):
        raise ValueError, here
    ValueError: Base <mid:foo@example> has no slash after colon - with relative '../foo'.

    >>> join('http://example/x/y/z', '')
    'http://example/x/y/z'

    >>> join('mid:foo@example', '#foo')
    'mid:foo@example#foo'

    We grok IRIs

    >>> len(%(u)s'Andr\\xe9')
    5

    >>> join('http://example.org/', %(u)s'#Andr\\xe9')
    %(u)s'http://example.org/#Andr\\xe9'
    """

    assert(here.find("#") < 0), "Base may not contain hash: '%s'" % here  # caller must splitFrag (why?)

    slashl = there.find('/')
    colonl = there.find(':')

     # join(base, 'foo:/') -- absolute
    if colonl >= 0 and (slashl < 0 or colonl < slashl):
        return there

    bcolonl = here.find(':')
    assert(bcolonl >= 0), "Base uri '%s' is not absolute" % here  # else it's not absolute

    path, frag = splitFragP(there)
    if not path:
        return here + frag

     # join('mid:foo@example', '../foo') bzzt
    if here[bcolonl + 1:bcolonl + 2] != '/':
        raise ValueError("Base <%s> has no slash after colon - with relative '%s'." % (here, there))

    if here[bcolonl + 1:bcolonl + 3] == '//':
        bpath = here.find('/', bcolonl + 3)
    else:
        bpath = bcolonl + 1

     # join('http://xyz', 'foo')
    if bpath < 0:
        bpath = len(here)
        here = here + '/'

     # join('http://xyz/', '//abc') => 'http://abc'
    if there[:2] == '//':
        return here[:bcolonl + 1] + there

     # join('http://xyz/', '/abc') => 'http://xyz/abc'
    if there[:1] == '/':
        return here[:bpath] + there

    slashr = here.rfind('/')

    while 1:
        if path[:2] == './':
            path = path[2:]
        if path == '.':
            path = ''
        elif path[:3] == '../' or path == '..':
            path = path[3:]
            i = here.rfind('/', bpath, slashr)
            if i >= 0:
                here = here[:i + 1]
                slashr = i
        else:
            break

    return here[:slashr + 1] + path + frag


commonHost = re.compile(r'^[-_a-zA-Z0-9.]+:(//[^/]*)?/[^/]*$')


def refTo(base, uri):
    """figure out a relative URI reference from base to uri

    >>> refTo('http://example/x/y/z', 'http://example/x/abc')
    '../abc'

    >>> refTo('file:/ex/x/y', 'file:/ex/x/q/r#s')
    'q/r#s'

    >>> refTo(None, 'http://ex/x/y')
    'http://ex/x/y'

    >>> refTo('http://ex/x/y', 'http://ex/x/y')
    ''

    Note the relationship between refTo and join:
    join(x, refTo(x, y)) == y
    which points out certain strings which cannot be URIs. e.g.
    >>> x='http://ex/x/y';y='http://ex/x/q:r';join(x, refTo(x, y)) == y
    0

    So 'http://ex/x/q:r' is not a URI. Use 'http://ex/x/q%3ar' instead:
    >>> x='http://ex/x/y';y='http://ex/x/q%3ar';join(x, refTo(x, y)) == y
    1

    This one checks that it uses a root-realtive one where that is
    all they share.  Now uses root-relative where no path is shared.
    This is a matter of taste but tends to give more resilience IMHO
    -- and shorter paths

    Note that base may be None, meaning no base.  In some situations, there
    just ain't a base. Slife. In these cases, relTo returns the absolute value.
    The axiom abs(,rel(b,x))=x still holds.
    This saves people having to set the base to "bogus:".

    >>> refTo('http://ex/x/y/z', 'http://ex/r')
    '/r'

    """

     # assert base  # don't mask bugs -danc  # not a bug. -tim
    if not base:
        return uri
    if base == uri:
        return ""

     # Find how many path segments in common
    i = 0
    while i < len(uri) and i < len(base):
        if uri[i] == base[i]:
            i = i + 1
        else:
            break
     # print "# relative", base, uri, "   same up to ", i
     # i point to end of shortest one or first difference

    m = commonHost.match(base[:i])
    if m:
        k = uri.find("//")
        if k < 0:
            k = -2  # no host
        l = uri.find("/", k + 2)
        if uri[l + 1:l + 2] != "/" and base[l + 1:l + 2] != "/" and uri[:l] == base[:l]:
            return uri[l:]

    if uri[i:i + 1] == "#" and len(base) == i:
        return uri[i:]  # fragment of base

    while i > 0 and uri[i - 1] != '/':
        i = i - 1   # scan for slash

    if i < 3:
        return uri   # No way.
    if base.find("//", i - 2) > 0 or uri.find("//", i - 2) > 0:
        return uri  # An unshared "//"
    if base.find(":", i) > 0:
        return uri   # An unshared ":"
    n = base.count("/", i)
    if n == 0 and i < len(uri) and uri[i] == '#':
        return "./" + uri[i:]
    elif n == 0 and i == len(uri):
        return "./"
    else:
        return ("../" * n) + uri[i:]


def base():
    """The base URI for this process - the Web equiv of cwd

    Relative or abolute unix-standard filenames parsed relative to
    this yeild the URI of the file.
    If we had a reliable way of getting a computer name,
    we should put it in the hostname just to prevent ambiguity

    """
     # return "file://" + hostname + os.getcwd() + "/"
    return "file://" + _fixslash(os.getcwd()) + "/"


def _fixslash(argstr):
    """ Fix windowslike filename to unixlike - (#ifdef WINDOWS)"""
    s = argstr
    for i in range(len(s)):
        if s[i] == "\\":
            s = s[:i] + "/" + s[i + 1:]
    if s[0] != "/" and s[1] == ":":
        s = s[2:]   # @@@ Hack when drive letter present
    return s

URI_unreserved = b("ABCDEFGHIJJLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~")
     # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"


@py3compat.format_doctest_out
def canonical(str_in):
    """Convert equivalent URIs (or parts) to the same string

    There are many differenet levels of URI canonicalization
    which are possible.  See http://www.ietf.org/rfc/rfc3986.txt
    Done:
    - Converfting unicode IRI to utf-8
    - Escaping all non-ASCII
    - De-escaping, if escaped, ALPHA (%%41-%%5A and %%61-%%7A), DIGIT (%%30-%%39),
    hyphen (%%2D), period (%%2E), underscore (%%5F), or tilde (%%7E) (Sect 2.4)
    - Making all escapes uppercase hexadecimal

    Not done:
    - Making URI scheme lowercase
    - changing /./ or  /foo/../ to / with care not to change host part


    >>> canonical("foo bar")
    %(b)s'foo%%20bar'

    >>> canonical(%(u)s'http:')
    %(b)s'http:'

    >>> canonical('fran%%c3%%83%%c2%%a7ois')
    %(b)s'fran%%C3%%83%%C2%%A7ois'

    >>> canonical('a')
    %(b)s'a'

    >>> canonical('%%4e')
    %(b)s'N'

    >>> canonical('%%9d')
    %(b)s'%%9D'

    >>> canonical('%%2f')
    %(b)s'%%2F'

    >>> canonical('%%2F')
    %(b)s'%%2F'

    """
    if type(str_in) == type(u''):
        s8 = str_in.encode('utf-8')
    else:
        s8 = str_in
    s = b('')
    i = 0
    while i < len(s8):
        if py3compat.PY3:
            n = s8[i]
            ch = bytes([n])
        else:
            ch = s8[i]
            n = ord(ch)
        if (n > 126) or (n < 33):    # %-encode controls, SP, DEL, and utf-8
            s += b("%%%02X" % ord(ch))
        elif ch == b('%') and i + 2 < len(s8):
            ch2 = a2b_hex(s8[i + 1:i + 3])
            if ch2 in URI_unreserved:
                s += ch2
            else:
                s += b("%%%02X" % ord(ch2))
            i = i + 3
            continue
        else:
            s += ch
        i = i + 1
    return s


CONTEXT = 0
PRED = 1
SUBJ = 2
OBJ = 3

PARTS = PRED, SUBJ, OBJ
ALL4 = CONTEXT, PRED, SUBJ, OBJ

SYMBOL = 0
FORMULA = 1
LITERAL = 2
LITERAL_DT = 21
LITERAL_LANG = 22
ANONYMOUS = 3
XMLLITERAL = 25

Logic_NS = "http://www.w3.org/2000/10/swap/log#"
NODE_MERGE_URI = Logic_NS + "is"   # Pseudo-property indicating node merging
forSomeSym = Logic_NS + "forSome"
forAllSym = Logic_NS + "forAll"

RDF_type_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
RDF_NS_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
OWL_NS = "http://www.w3.org/2002/07/owl#"
DAML_sameAs_URI = OWL_NS + "sameAs"
parsesTo_URI = Logic_NS + "parsesTo"
RDF_spec = "http://www.w3.org/TR/REC-rdf-syntax/"

List_NS = RDF_NS_URI      # From 20030808
_Old_Logic_NS = "http://www.w3.org/2000/10/swap/log.n3#"

N3_first = (SYMBOL, List_NS + "first")
N3_rest = (SYMBOL, List_NS + "rest")
N3_li = (SYMBOL, List_NS + "li")
N3_nil = (SYMBOL, List_NS + "nil")
N3_List = (SYMBOL, List_NS + "List")
N3_Empty = (SYMBOL, List_NS + "Empty")


runNamespaceValue = None


def runNamespace():
    "Return a URI suitable as a namespace for run-local objects"
     # @@@ include hostname (privacy?) (hash it?)
    global runNamespaceValue
    if runNamespaceValue == None:
        runNamespaceValue = join(base(), _unique_id()) + '#'
    return runNamespaceValue

nextu = 0


def uniqueURI():
    "A unique URI"
    global nextu
    nextu += 1
    # return runNamespace() + "u_" + `nextu`
    return runNamespace() + "u_" + str(nextu)


class URISyntaxError(ValueError):
    """A parameter is passed to a routine that requires a URI reference"""
    pass


tracking = False
chatty_flag = 50


from xml.dom import Node
try:
    from xml.ns import XMLNS
except:
    class XMLNS:
        BASE = "http://www.w3.org/2000/xmlns/"
        XML = "http://www.w3.org/XML/1998/namespace"


_attrs = lambda E: (E.attributes and E.attributes.values()) or []
_children = lambda E: E.childNodes or []
_IN_XML_NS = lambda n: n.namespaceURI == XMLNS.XML
_inclusive = lambda n: n.unsuppressedPrefixes == None

# Does a document/PI has lesser/greater document order than the
# first element?
_LesserElement, _Element, _GreaterElement = range(3)


def _sorter(n1, n2):
    '''_sorter(n1, n2) -> int
    Sorting predicate for non-NS attributes.'''

    i = cmp(n1.namespaceURI, n2.namespaceURI)
    if i:
        return i
    return cmp(n1.localName, n2.localName)


def _sorter_ns(n1, n2):
    '''_sorter_ns((n,v),(n,v)) -> int
    "(an empty namespace URI is lexicographically least)."'''

    if n1[0] == 'xmlns':
        return -1
    if n2[0] == 'xmlns':
        return 1
    return cmp(n1[0], n2[0])


def _utilized(n, node, other_attrs, unsuppressedPrefixes):
    '''_utilized(n, node, other_attrs, unsuppressedPrefixes) -> boolean
    Return true if that nodespace is utilized within the node'''

    if n.startswith('xmlns:'):
        n = n[6:]
    elif n.startswith('xmlns'):
        n = n[5:]
    if (n == "" and node.prefix in ["#default", None]) or \
          n == node.prefix or n in unsuppressedPrefixes:
        return 1
    for attr in other_attrs:
        if n == attr.prefix:
            return 1
    return 0


#_in_subset = lambda subset, node: not subset or node in subset
_in_subset = lambda subset, node: subset is None or node in subset  # rich's tweak


class _implementation:
    '''Implementation class for C14N. This accompanies a node during it's
    processing and includes the parameters and processing state.'''

     # Handler for each node type; populated during module instantiation.
    handlers = {}

    def __init__(self, node, write, **kw):
        '''Create and run the implementation.'''
        self.write = write
        self.subset = kw.get('subset')
        self.comments = kw.get('comments', 0)
        self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes')
        nsdict = kw.get('nsdict', {'xml': XMLNS.XML, 'xmlns': XMLNS.BASE})

         # Processing state.
        self.state = (nsdict, {'xml': ''}, {})  # 0422

        if node.nodeType == Node.DOCUMENT_NODE:
            self._do_document(node)
        elif node.nodeType == Node.ELEMENT_NODE:
            self.documentOrder = _Element         # At document element
            if not _inclusive(self):
                self._do_element(node)
            else:
                inherited = self._inherit_context(node)
                self._do_element(node, inherited)
        elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
            pass
        elif node.nodeType == Node.TEXT_NODE:
            self._do_text(node)
        else:
            raise TypeError(str(node))

    def _inherit_context(self, node):
        '''_inherit_context(self, node) -> list
        Scan ancestors of attribute and namespace context.  Used only
        for single element node canonicalization, not for subset
        canonicalization.'''

         # Collect the initial list of xml:foo attributes.
        xmlattrs = filter(_IN_XML_NS, _attrs(node))

         # Walk up and get all xml:XXX attributes we inherit.
        inherited, parent = [], node.parentNode
        while parent and parent.nodeType == Node.ELEMENT_NODE:
            for a in filter(_IN_XML_NS, _attrs(parent)):
                n = a.localName
                if n not in xmlattrs:
                    xmlattrs.append(n)
                    inherited.append(a)
            parent = parent.parentNode
        return inherited

    def _do_document(self, node):
        '''_do_document(self, node) -> None
        Process a document node. documentOrder holds whether the document
        element has been encountered such that PIs/comments can be written
        as specified.'''

        self.documentOrder = _LesserElement
        for child in node.childNodes:
            if child.nodeType == Node.ELEMENT_NODE:
                self.documentOrder = _Element         # At document element
                self._do_element(child)
                self.documentOrder = _GreaterElement  # After document element
            elif child.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
                self._do_pi(child)
            elif child.nodeType == Node.COMMENT_NODE:
                self._do_comment(child)
            elif child.nodeType == Node.DOCUMENT_TYPE_NODE:
                pass
            else:
                raise TypeError(str(child))
    handlers[Node.DOCUMENT_NODE] = _do_document

    def _do_text(self, node):
        '''_do_text(self, node) -> None
        Process a text or CDATA node.  Render various special characters
        as their C14N entity representations.'''
        if not _in_subset(self.subset, node):
            return
        s = node.data.replace("&", "&amp;")
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")
        s = s.replace("\015", "&#xD;")
        if s:
            self.write(s)
    handlers[Node.TEXT_NODE] = _do_text
    handlers[Node.CDATA_SECTION_NODE] = _do_text

    def _do_pi(self, node):
        '''_do_pi(self, node) -> None
        Process a PI node. Render a leading or trailing  # xA if the
        document order of the PI is greater or lesser (respectively)
        than the document element.
        '''
        if not _in_subset(self.subset, node):
            return
        W = self.write
        if self.documentOrder == _GreaterElement:
            W('\n')
        W('<?')
        W(node.nodeName)
        s = node.data
        if s:
            W(' ')
            W(s)
        W('?>')
        if self.documentOrder == _LesserElement:
            W('\n')
    handlers[Node.PROCESSING_INSTRUCTION_NODE] = _do_pi

    def _do_comment(self, node):
        '''_do_comment(self, node) -> None
        Process a comment node. Render a leading or trailing  # xA if the
        document order of the comment is greater or lesser (respectively)
        than the document element.
        '''
        if not _in_subset(self.subset, node):
            return
        if self.comments:
            W = self.write
            if self.documentOrder == _GreaterElement:
                W('\n')
            W('<!--')
            W(node.data)
            W('-->')
            if self.documentOrder == _LesserElement:
                W('\n')
    handlers[Node.COMMENT_NODE] = _do_comment

    def _do_attr(self, n, value):
        ''''_do_attr(self, node) -> None
        Process an attribute.'''

        W = self.write
        W(' ')
        W(n)
        W('="')
        s = value.replace(value, "&", "&amp;")
        s = s.replace("<", "&lt;")
        s = s.replace('"', '&quot;')
        s = s.replace('\011', '&#x9')
        s = s.replace('\012', '&#xA')
        s = s.replace('\015', '&#xD')
        W(s)
        W('"')

    def _do_element(self, node, initial_other_attrs=[]):
        '''_do_element(self, node, initial_other_attrs = []) -> None
        Process an element (and its children).'''

        # Get state (from the stack) make local copies.
        #   ns_parent -- NS declarations in parent
        #   ns_rendered -- NS nodes rendered by ancestors
        #        ns_local -- NS declarations relevant to this element
        #   xml_attrs -- Attributes in XML namespace from parent
        #       xml_attrs_local -- Local attributes in XML namespace.
        ns_parent, ns_rendered, xml_attrs = \
                self.state[0], self.state[1].copy(), self.state[2].copy()  # 0422
        ns_local = ns_parent.copy()
        xml_attrs_local = {}

        # progress("_do_element node.nodeName=", node.nodeName)
        # progress("_do_element node.namespaceURI", node.namespaceURI)
        # progress("_do_element node.tocml()", node.toxml())
        # Divide attributes into NS, XML, and others.
        other_attrs = initial_other_attrs[:]
        in_subset = _in_subset(self.subset, node)
        for a in _attrs(node):
            # progress("\t_do_element a.nodeName=", a.nodeName)
            if a.namespaceURI == XMLNS.BASE:
                n = a.nodeName
                if n == "xmlns:":
                    n = "xmlns"         # DOM bug workaround
                ns_local[n] = a.nodeValue
            elif a.namespaceURI == XMLNS.XML:
                if _inclusive(self) or in_subset:
                    xml_attrs_local[a.nodeName] = a  # 0426
            else:
                other_attrs.append(a)
             # add local xml:foo attributes to ancestor's xml:foo attributes
            xml_attrs.update(xml_attrs_local)

         # Render the node
        W, name = self.write, None
        if in_subset:
            name = node.nodeName
            W('<')
            W(name)

             # Create list of NS attributes to render.
            ns_to_render = []
            for n, v in ns_local.items():

                 # If default namespace is XMLNS.BASE or empty,
                 # and if an ancestor was the same
                if n == "xmlns" and v in [XMLNS.BASE, ''] \
                  and ns_rendered.get('xmlns') in [XMLNS.BASE, '', None]:
                    continue

                 # "omit namespace node with local name xml, which defines
                 # the xml prefix, if its string value is
                 # http://www.w3.org/XML/1998/namespace."
                if n in ["xmlns:xml", "xml"] \
                  and v in ['http://www.w3.org/XML/1998/namespace']:
                    continue

                 # If not previously rendered
                 # and it's inclusive  or utilized
                if (n, v) not in ns_rendered.items() \
                  and (_inclusive(self) or \
                  _utilized(n, node, other_attrs, self.unsuppressedPrefixes)):
                    ns_to_render.append((n, v))

             # Sort and render the ns, marking what was rendered.
            ns_to_render.sort(_sorter_ns)
            for n, v in ns_to_render:
                self._do_attr(n, v)
                ns_rendered[n] = v     # 0417

             # If exclusive or the parent is in the subset, add the local xml attributes
             # Else, add all local and ancestor xml attributes
             # Sort and render the attributes.
            if not _inclusive(self) or _in_subset(self.subset, node.parentNode):   # 0426
                other_attrs.extend(xml_attrs_local.values())
            else:
                other_attrs.extend(xml_attrs.values())
            other_attrs.sort(_sorter)
            for a in other_attrs:
                self._do_attr(a.nodeName, a.value)
            W('>')

         # Push state, recurse, pop state.
        state, self.state = self.state, (ns_local, ns_rendered, xml_attrs)
        for c in _children(node):
            _implementation.handlers[c.nodeType](self, c)
        self.state = state

        if name:
            W('</%s>' % name)
    handlers[Node.ELEMENT_NODE] = _do_element


def Canonicalize(node, output=None, **kw):
    '''Canonicalize(node, output=None, **kw) -> UTF-8

    Canonicalize a DOM document/element node and all descendents.
    Return the text; if output is specified then output.write will
    be called to output the text and None will be returned
    Keyword parameters:
    nsdict -- a dictionary of prefix:uri namespace entries
    assumed to exist in the surrounding context
    comments -- keep comments if non-zero (default is 0)
    subset -- Canonical XML subsetting resulting from XPath (default is [])
    unsuppressedPrefixes -- do exclusive C14N, and this specifies the
    prefixes that should be inherited.
    '''
    if output:
        apply(_implementation, (node, output.write), kw)
    else:
        s = StringIO.StringIO()
        apply(_implementation, (node, s.write), kw)
        return s.getvalue()

# end of xmlC14n.py


# from why import BecauseOfData, becauseSubexpression
def BecauseOfData(*args, **kargs):
     # print args, kargs
    pass


def becauseSubexpression(*args, **kargs):
     # print args, kargs
    pass

N3_forSome_URI = forSomeSym
N3_forAll_URI = forAllSym

# Magic resources we know about

ADDED_HASH = "#"   # Stop where we use this in case we want to remove it!
# This is the hash on namespace URIs

RDF_type = (SYMBOL, RDF_type_URI)
DAML_sameAs = (SYMBOL, DAML_sameAs_URI)

LOG_implies_URI = "http://www.w3.org/2000/10/swap/log#implies"

BOOLEAN_DATATYPE = _XSD_PFX + "boolean"
DECIMAL_DATATYPE = _XSD_PFX + "decimal"
DOUBLE_DATATYPE = _XSD_PFX + "double"
FLOAT_DATATYPE = _XSD_PFX + "float"
INTEGER_DATATYPE = _XSD_PFX + "integer"

option_noregen = 0    # If set, do not regenerate genids on output

# @@ I18n - the notname chars need extending for well known unicode non-text
# characters. The XML spec switched to assuming unknown things were name
# characaters.
# _namechars = string.lowercase + string.uppercase + string.digits + '_-'
_notQNameChars = "\t\r\n !\"#$%&'()*.,+/;<=>?@[\\]^`{|}~"  # else valid qname :-/
_notNameChars = _notQNameChars + ":"   # Assume anything else valid name :-/
_rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'


N3CommentCharacter = "#"      # For unix script  # ! compatabilty

########################################## Parse string to sink
#
# Regular expressions:
eol = re.compile(r'[ \t]*(#[^\n]*)?\r?\n')       # end  of line, poss. w/comment
eof = re.compile(r'[ \t]*(#[^\n]*)?$')           # end  of file, poss. w/comment
ws = re.compile(r'[ \t]*')                       # Whitespace not including NL
signed_integer = re.compile(r'[-+]?[0-9]+')      # integer
number_syntax = re.compile(r'(?P<integer>[-+]?[0-9]+)(?P<decimal>\.[0-9]+)?(?P<exponent>(?:e|E)[-+]?[0-9]+)?')
digitstring = re.compile(r'[0-9]+')              # Unsigned integer
interesting = re.compile(r'[\\\r\n\"]')
langcode = re.compile(r'[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?')


class SinkParser:
    def __init__(self, store, openFormula=None, thisDoc="", baseURI=None,
                 genPrefix="", flags="", why=None):
        """ note: namespace names should *not* end in  # ;
        the  # will get added during qname processing """

        self._bindings = {}
        self._flags = flags
        if thisDoc != "":
            assert ':' in thisDoc, "Document URI not absolute: <%s>" % thisDoc
            self._bindings[""] = thisDoc + "#"   # default

        self._store = store
        if genPrefix:
            store.setGenPrefix(genPrefix)  # pass it on

        self._thisDoc = thisDoc
        self.lines = 0               # for error handling
        self.startOfLine = 0         # For calculating character number
        self._genPrefix = genPrefix
        self.keywords = ['a', 'this', 'bind', 'has', 'is', 'of', 'true', 'false']
        self.keywordsSet = 0     # Then only can others be considerd qnames
        self._anonymousNodes = {}  # Dict of anon nodes already declared ln: Term
        self._variables = {}
        self._parentVariables = {}
        self._reason = why       # Why the parser was asked to parse this

        self._reason2 = None     # Why these triples
         # was: diag.tracking
        if tracking:
            self._reason2 = BecauseOfData(
                        store.newSymbol(thisDoc), because=self._reason)

        if baseURI:
            self._baseURI = baseURI
        else:
            if thisDoc:
                self._baseURI = thisDoc
            else:
                self._baseURI = None

        assert not self._baseURI or ':' in self._baseURI

        if not self._genPrefix:
            if self._thisDoc:
                self._genPrefix = self._thisDoc + "#_g"
            else:
                self._genPrefix = uniqueURI()

        if openFormula == None:
            if self._thisDoc:
                self._formula = store.newFormula(thisDoc + "#_formula")
            else:
                self._formula = store.newFormula()
        else:
            self._formula = openFormula

        self._context = self._formula
        self._parentContext = None

    def here(self, i):
        """String generated from position in file

        This is for repeatability when refering people to bnodes in a document.
        This has diagnostic uses less formally, as it should point one to which
        bnode the arbitrary identifier actually is. It gives the
        line and character number of the '[' charcacter or path character
        which introduced the blank node. The first blank node is boringly _L1C1.
        It used to be used only for tracking, but for tests in general
        it makes the canonical ordering of bnodes repeatable."""

        return "%s_L%iC%i" % (self._genPrefix, self.lines,
                                            i - self.startOfLine + 1)

    def formula(self):
        return self._formula

    def loadStream(self, stream):
        return self.loadBuf(stream.read())    # Not ideal

    def loadBuf(self, buf):
        """Parses a buffer and returns its top level formula"""
        self.startDoc()

        self.feed(buf)
        return self.endDoc()     # self._formula

    def feed(self, octets):
        """Feed an octet stream tothe parser

        if BadSyntax is raised, the string
        passed in the exception object is the
        remainder after any statements have been parsed.
        So if there is more data to feed to the
        parser, it should be straightforward to recover."""

        if not isinstance(octets, unicode):
            s = octets.decode('utf-8')
             # NB already decoded, so \ufeff
            if len(s) > 0 and s[0] == codecs.BOM_UTF8.decode('utf-8'):
                s = s[1:]
        else:
            s = octets

        i = 0
        while i >= 0:
            j = self.skipSpace(s, i)
            if j < 0:
                return

            i = self.directiveOrStatement(s, j)
            if i < 0:
                print("# next char: %s" % s[j])
                raise BadSyntax(self._thisDoc, self.lines, s, j,
                                    "expected directive or statement")

    def directiveOrStatement(self, argstr, h):

        i = self.skipSpace(argstr, h)
        if i < 0:
            return i    # EOF

        j = self.directive(argstr, i)
        if j >= 0:
            return  self.checkDot(argstr, j)

        j = self.statement(argstr, i)
        if j >= 0:
            return self.checkDot(argstr, j)

        return j

     # @@I18N
    global _notNameChars
     # _namechars = string.lowercase + string.uppercase + string.digits + '_-'

    def tok(self, tok, argstr, i):
        """Check for keyword.  Space must have been stripped on entry and
        we must not be at end of file."""

        assert tok[0] not in _notNameChars  # not for punctuation
        if argstr[i:i + 1] == "@":
            i = i + 1
        else:
            if tok not in self.keywords:
                return -1    # No, this has neither keywords declaration nor "@"

        if (argstr[i:i + len(tok)] == tok
            and (argstr[i + len(tok)] in  _notQNameChars)):
            i = i + len(tok)
            return i
        else:
            return -1

    def directive(self, argstr, i):
        j = self.skipSpace(argstr, i)
        if j < 0:
            return j  # eof
        res = []

        j = self.tok('bind', argstr, i)         # implied "#". Obsolete.
        if j > 0:
            raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "keyword bind is obsolete: use @prefix")

        j = self.tok('keywords', argstr, i)
        if j > 0:
            i = self.commaSeparatedList(argstr, j, res, self.bareWord)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                    "'@keywords' needs comma separated list of words")
            self.setKeywords(res[:])
             # was: diag.chatty_flag
            if chatty_flag > 80:
                progress("Keywords ", self.keywords)
            return i

        j = self.tok('forAll', argstr, i)
        if j > 0:
            i = self.commaSeparatedList(argstr, j, res, self.uri_ref2)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                        "Bad variable list after @forAll")
            for x in res:
                 # self._context.declareUniversal(x)
                if x not in self._variables or x in self._parentVariables:
                    self._variables[x] = self._context.newUniversal(x)
            return i

        j = self.tok('forSome', argstr, i)
        if j > 0:
            i = self. commaSeparatedList(argstr, j, res, self.uri_ref2)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                    "Bad variable list after @forSome")
            for x in res:
                self._context.declareExistential(x)
            return i

        j = self.tok('prefix', argstr, i)    # no implied "#"
        if j >= 0:
            t = []
            i = self.qname(argstr, j, t)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "expected qname after @prefix")
            j = self.uri_ref2(argstr, i, t)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "expected <uriref> after @prefix _qname_")
            ns = self.uriOf(t[1])

            if self._baseURI:
                ns = join(self._baseURI, ns)
            elif ":" not in ns:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                    "With no base URI, cannot use relative URI in @prefix <" + ns + ">")
            assert ':' in ns  # must be absolute
            self._bindings[t[0][0]] = ns
            self.bind(t[0][0], hexify(ns))
            return j

        j = self.tok('base', argstr, i)       # Added 2007/7/7
        if j >= 0:
            t = []
            i = self.uri_ref2(argstr, j, t)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "expected <uri> after @base ")
            ns = self.uriOf(t[0])

            if self._baseURI:
                ns = join(self._baseURI, ns)
            else:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                    "With no previous base URI, cannot use relative URI in @base  <" + ns + ">")
            assert ':' in ns  # must be absolute
            self._baseURI = ns
            return i

        return -1   # Not a directive, could be something else.

    def bind(self, qn, uri):
        assert isinstance(uri,
                    types.StringType), "Any unicode must be %x-encoded already"
        if qn == "":
            self._store.setDefaultNamespace(uri)
        else:
            self._store.bind(qn, uri)

    def setKeywords(self, k):
        "Takes a list of strings"
        if k == None:
            self.keywordsSet = 0
        else:
            self.keywords = k
            self.keywordsSet = 1

    def startDoc(self):
         # was: self._store.startDoc()
        self._store.startDoc(self._formula)

    def endDoc(self):
        """Signal end of document and stop parsing. returns formula"""
        self._store.endDoc(self._formula)   # don't canonicalize yet
        return self._formula

    def makeStatement(self, quadruple):
         # $$$$$$$$$$$$$$$$$$$$$
         # print "# Parser output: ", `quadruple`
        self._store.makeStatement(quadruple, why=self._reason2)

    def statement(self, argstr, i):
        r = []

        i = self.object(argstr, i, r)   # Allow literal for subject - extends RDF
        if i < 0:
            return i

        j = self.property_list(argstr, i, r[0])

        if j < 0:
            raise BadSyntax(self._thisDoc, self.lines,
                                    argstr, i, "expected propertylist")
        return j

    def subject(self, argstr, i, res):
        return self.item(argstr, i, res)

    def verb(self, argstr, i, res):
        """ has _prop_
        is _prop_ of
        a
        =
        _prop_
        >- prop ->
        <- prop -<
        _operator_"""

        j = self.skipSpace(argstr, i)
        if j < 0:
            return j  # eof

        r = []

        j = self.tok('has', argstr, i)
        if j >= 0:
            i = self.prop(argstr, j, r)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines,
                                argstr, j, "expected property after 'has'")
            res.append(('->', r[0]))
            return i

        j = self.tok('is', argstr, i)
        if j >= 0:
            i = self.prop(argstr, j, r)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "expected <property> after 'is'")
            j = self.skipSpace(argstr, i)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                            "End of file found, expected property after 'is'")
                return j  # eof
            i = j
            j = self.tok('of', argstr, i)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "expected 'of' after 'is' <prop>")
            res.append(('<-', r[0]))
            return j

        j = self.tok('a', argstr, i)
        if j >= 0:
            res.append(('->', RDF_type))
            return j

        if argstr[i:i + 2] == "<=":
            res.append(('<-', self._store.newSymbol(Logic_NS + "implies")))
            return i + 2

        if argstr[i:i + 1] == "=":
            if argstr[i + 1:i + 2] == ">":
                res.append(('->', self._store.newSymbol(Logic_NS + "implies")))
                return i + 2
            res.append(('->', DAML_sameAs))
            return i + 1

        if argstr[i:i + 2] == ":=":
             # patch file relates two formulae, uses this    @@ really?
            res.append(('->', Logic_NS + "becomes"))
            return i + 2

        j = self.prop(argstr, i, r)
        if j >= 0:
            res.append(('->', r[0]))
            return j

        if argstr[i:i + 2] == ">-" or argstr[i:i + 2] == "<-":
            raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                        ">- ... -> syntax is obsolete.")

        return -1

    def prop(self, argstr, i, res):
        return self.item(argstr, i, res)

    def item(self, argstr, i, res):
        return self.path(argstr, i, res)

    def blankNode(self, uri=None):
        if "B" not in self._flags:
            return self._context.newBlankNode(uri, why=self._reason2)
        x = self._context.newSymbol(uri)
        self._context.declareExistential(x)
        return x

    def path(self, argstr, i, res):
        """Parse the path production.
        """
        j = self.nodeOrLiteral(argstr, i, res)
        if j < 0:
            return j   # nope

        while argstr[j:j + 1] in "!^.":   # no spaces, must follow exactly (?)
            ch = argstr[j:j + 1]      # @@ Allow "." followed IMMEDIATELY by a node.
            if ch == ".":
                ahead = argstr[j + 1:j + 2]
                if not ahead or (ahead in _notNameChars
                            and ahead not in ":?<[{("):
                    break
            subj = res.pop()
            obj = self.blankNode(uri=self.here(j))
            j = self.node(argstr, j + 1, res)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                            "EOF found in middle of path syntax")
            pred = res.pop()
            if ch == "^":  # Reverse traverse
                self.makeStatement((self._context, pred, obj, subj))
            else:
                self.makeStatement((self._context, pred, subj, obj))
            res.append(obj)
        return j

    def anonymousNode(self, ln):
        """Remember or generate a term for one of these _: anonymous nodes"""
        term = self._anonymousNodes.get(ln, None)
        if term != None:
            return term
        term = self._store.newBlankNode(self._context, why=self._reason2)
        self._anonymousNodes[ln] = term
        return term

    def node(self, argstr, i, res, subjectAlready=None):
        """Parse the <node> production.
        Space is now skipped once at the beginning
        instead of in multipe calls to self.skipSpace().
        """
        subj = subjectAlready

        j = self.skipSpace(argstr, i)
        if j < 0:
            return j  # eof
        i = j
        ch = argstr[i:i + 1]   # Quick 1-character checks first:

        if ch == "[":
            bnodeID = self.here(i)
            j = self.skipSpace(argstr, i + 1)
            if j < 0:
                raise BadSyntax(self._thisDoc,
                                    self.lines, argstr, i, "EOF after '['")
            if argstr[j:j + 1] == "=":      # Hack for "is"  binding name to anon node
                i = j + 1
                objs = []
                j = self.objectList(argstr, i, objs)
                if j >= 0:
                    subj = objs[0]
                    if len(objs) > 1:
                        for obj in objs:
                            self.makeStatement((self._context,
                                                DAML_sameAs, subj, obj))
                    j = self.skipSpace(argstr, j)
                    if j < 0:
                        raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                        "EOF when objectList expected after [ = ")
                    if argstr[j:j + 1] == ";":
                        j = j + 1
                else:
                    raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                        "objectList expected after [= ")

            if subj is None:
                subj = self.blankNode(uri=bnodeID)

            i = self.property_list(argstr, j, subj)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "property_list expected")

            j = self.skipSpace(argstr, i)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                "EOF when ']' expected after [ <propertyList>")
            if argstr[j:j + 1] != "]":
                raise BadSyntax(self._thisDoc,
                                    self.lines, argstr, j, "']' expected")
            res.append(subj)
            return j + 1

        if ch == "{":
            ch2 = argstr[i + 1:i + 2]
            if ch2 == '$':
                i += 1
                j = i + 1
                List = []
                first_run = True
                while 1:
                    i = self.skipSpace(argstr, j)
                    if i < 0:
                        raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                                    "needed '$}', found end.")
                    if argstr[i:i + 2] == '$}':
                        j = i + 2
                        break

                    if not first_run:
                        if argstr[i:i + 1] == ',':
                            i += 1
                        else:
                            raise BadSyntax(self._thisDoc, self.lines,
                                                argstr, i, "expected: ','")
                    else:
                        first_run = False

                    item = []
                    j = self.item(argstr, i, item)  # @@@@@ should be path, was object
                    if j < 0:
                        raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                            "expected item in set or '$}'")
                    List.append(self._store.intern(item[0]))
                res.append(self._store.newSet(List, self._context))
                return j
            else:
                j = i + 1
                oldParentContext = self._parentContext
                self._parentContext = self._context
                parentAnonymousNodes = self._anonymousNodes
                grandParentVariables = self._parentVariables
                self._parentVariables = self._variables
                self._anonymousNodes = {}
                self._variables = self._variables.copy()
                reason2 = self._reason2
                self._reason2 = becauseSubexpression
                if subj is None:
                    subj = self._store.newFormula()
                self._context = subj

                while 1:
                    i = self.skipSpace(argstr, j)
                    if i < 0:
                        raise BadSyntax(self._thisDoc, self.lines,
                                    argstr, i, "needed '}', found end.")

                    if argstr[i:i + 1] == "}":
                        j = i + 1
                        break

                    j = self.directiveOrStatement(argstr, i)
                    if j < 0:
                        raise BadSyntax(self._thisDoc, self.lines,
                                    argstr, i, "expected statement or '}'")

                self._anonymousNodes = parentAnonymousNodes
                self._variables = self._parentVariables
                self._parentVariables = grandParentVariables
                self._context = self._parentContext
                self._reason2 = reason2
                self._parentContext = oldParentContext
                res.append(subj.close())    # No use until closed
                return j

        if ch == "(":
            thing_type = self._store.newList
            ch2 = argstr[i + 1:i + 2]
            if ch2 == '$':
                thing_type = self._store.newSet
                i += 1
            j = i + 1

            List = []
            while 1:
                i = self.skipSpace(argstr, j)
                if i < 0:
                    raise BadSyntax(self._thisDoc, self.lines,
                                    argstr, i, "needed ')', found end.")
                if argstr[i:i + 1] == ')':
                    j = i + 1
                    break

                item = []
                j = self.item(argstr, i, item)  # @@@@@ should be path, was object
                if j < 0:
                    raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                        "expected item in list or ')'")
                List.append(self._store.intern(item[0]))
            res.append(thing_type(List, self._context))
            return j

        j = self.tok('this', argstr, i)    # This context
        if j >= 0:
            raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                "Keyword 'this' was ancient N3. Now use @forSome and @forAll keywords.")
            res.append(self._context)
            return j

         # booleans
        j = self.tok('true', argstr, i)
        if j >= 0:
            res.append(True)
            return j
        j = self.tok('false', argstr, i)
        if j >= 0:
            res.append(False)
            return j

        if subj is None:    # If this can be a named node, then check for a name.
            j = self.uri_ref2(argstr, i, res)
            if j >= 0:
                return j

        return -1

    def property_list(self, argstr, i, subj):
        """Parse property list
        Leaves the terminating punctuation in the buffer
        """
        while 1:
            j = self.skipSpace(argstr, i)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                            "EOF found when expected verb in property list")
                return j  # eof

            if argstr[j:j + 2] == ":-":
                i = j + 2
                res = []
                j = self.node(argstr, i, res, subj)
                if j < 0:
                    raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                        "bad {} or () or [] node after :- ")
                i = j
                continue
            i = j
            v = []
            j = self.verb(argstr, i, v)
            if j <= 0:
                return i  # void but valid

            objs = []
            i = self.objectList(argstr, j, objs)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                                        "objectList expected")
            for obj in objs:
                dira, sym = v[0]
                if dira == '->':
                    self.makeStatement((self._context, sym, subj, obj))
                else:
                    self.makeStatement((self._context, sym, obj, subj))

            j = self.skipSpace(argstr, i)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                                "EOF found in list of objects")
                return j  # eof
            if argstr[i:i + 1] != ";":
                return i
            i = i + 1  # skip semicolon and continue

    def commaSeparatedList(self, argstr, j, res, what):
        """return value: -1 bad syntax; >1 new position in argstr
        res has things found appended
        """
        i = self.skipSpace(argstr, j)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                    "EOF found expecting comma sep list")
            return i
        if argstr[i] == ".":
            return j   # empty list is OK
        i = what(argstr, i, res)
        if i < 0:
            return -1

        while 1:
            j = self.skipSpace(argstr, i)
            if j < 0:
                return j  # eof
            ch = argstr[j:j + 1]
            if ch != ",":…
Tech Fingerprint

Alerts (26)

'def' Ensure functions have docstrings for documentation
58 65 70 75 319 805 810
Complexity hotspot; line 165 (total complexity: 3)
165
Complexity hotspot; lines 267 to 268 (total complexity: 3)
267 268
Complexity hotspot; line 281 (total complexity: 3)
281
Complexity hotspot; line 297 (total complexity: 3)
297
Complexity hotspot; lines 483 to 484 (total complexity: 3)
483 484
'lambda' Avoid complex 'lambda' functions; prefer named functions for clarity and debugging
531 532
Complexity hotspot; lines 571 to 572 (total complexity: 3)
571 572
Complexity hotspot; lines 712 to 713 (total complexity: 3)
712 713
Complexity hotspot; lines 733 to 734 (total complexity: 3)
733 734
Complexity hotspot; lines 746 to 747 (total complexity: 3)
746 747
Complexity hotspot; lines 1248 to 1249 (total complexity: 3)
1248 1249