/SmartObjectFramework/src/rdflib/plugins/parsers/notation3.py
Python | 2427 lines | 2111 code | 176 blank | 140 comment | 224 complexity | 825ecb377cb8c00574f56dbdcc5a06e0 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0
Large files files are truncated, but you can click here to view the full file
- #!/usr/bin/env python
- u"""
- notation3.py - Standalone Notation3 Parser
- Derived from CWM, the Closed World Machine
- Authors of the original suite:
- * Dan Connolly <@@>
- * Tim Berners-Lee <@@>
- * Yosi Scharf <@@>
- * Joseph M. Reagle Jr. <reagle@w3.org>
- * Rich Salz <rsalz@zolera.com>
- http://www.w3.org/2000/10/swap/notation3.py
- Copyright 2000-2007, World Wide Web Consortium.
- Copyright 2001, MIT.
- Copyright 2001, Zolera Systems Inc.
- License: W3C Software License
- http://www.w3.org/Consortium/Legal/copyright-software
- Modified by Sean B. Palmer
- Copyright 2007, Sean B. Palmer. \u32E1
- Modified to work with rdflib by Gunnar Aastrand Grimnes
- Copyright 2010, Gunnar A. Grimnes
- """
- # Python standard libraries
- import types
- import sys
- import os
- import re
- import StringIO
- import codecs
- from binascii import a2b_hex
- from decimal import Decimal
- from rdflib.term import URIRef, BNode, Literal, Variable, _XSD_PFX, _unique_id
- from rdflib.graph import QuotedGraph, ConjunctiveGraph
- from rdflib import py3compat
- b = py3compat.b
- __all__ = [
- 'URISyntaxError', 'BadSyntax', 'N3Parser', "verbosity", "setVerbosity",
- "progress", "splitFrag", "splitFragP", "join", "refTo", "base",
- "canonical", "runNamespace", "uniqueURI", "Canonicalize", "stripCR",
- "dummyWrite", "toBool", "stringToN3", "backslashUify", "hexify"]
- from rdflib.parser import Parser
- # Incestuous.. would be nice to separate N3 and XML
- # from sax2rdf import XMLtoDOM
def XMLtoDOM(*args, **kargs):
    """No-op stand-in for sax2rdf.XMLtoDOM; XML literals are not
    converted here. Accepts and ignores any arguments."""
    return None
- # SWAP http://www.w3.org/2000/10/swap
- # from diag import verbosity, setVerbosity, progress
def verbosity(*args, **kargs):
    """No-op stand-in for diag.verbosity from the SWAP toolkit."""
    return None
def setVerbosity(*args, **kargs):
    """No-op stand-in for diag.setVerbosity from the SWAP toolkit."""
    return None
def progress(*args, **kargs):
    """No-op stand-in for diag.progress from the SWAP toolkit."""
    return None
def splitFrag(uriref):
    """split a URI reference between the fragment and the rest.

    Punctuation is thrown away.

    e.g.

    >>> splitFrag("abc#def")
    ('abc', 'def')

    >>> splitFrag("abcdef")
    ('abcdef', None)
    """
    doc, hash_, frag = uriref.rpartition("#")
    if hash_:
        return doc, frag
    return uriref, None
def splitFragP(uriref, punct=0):
    """split a URI reference before the fragment

    Punctuation is kept.

    e.g.

    >>> splitFragP("abc#def")
    ('abc', '#def')

    >>> splitFragP("abcdef")
    ('abcdef', '')
    """
    doc, hash_, frag = uriref.rpartition("#")
    if hash_:
        return doc, hash_ + frag
    return uriref, ''
@py3compat.format_doctest_out
def join(here, there):
    """join an absolute URI and URI reference
    (non-ascii characters are supported/doctested;
    haven't checked the details of the IRI spec though)

    here is assumed to be absolute.
    there is URI reference.

    >>> join('http://example/x/y/z', '../abc')
    'http://example/x/abc'

    Raise ValueError if there uses relative path
    syntax but here has no hierarchical path.

    >>> join('mid:foo@example', '../foo')
    Traceback (most recent call last):
        raise ValueError, here
    ValueError: Base <mid:foo@example> has no slash after colon - with relative '../foo'.

    >>> join('http://example/x/y/z', '')
    'http://example/x/y/z'

    >>> join('mid:foo@example', '#foo')
    'mid:foo@example#foo'

    We grok IRIs

    >>> len(%(u)s'Andr\\xe9')
    5

    >>> join('http://example.org/', %(u)s'#Andr\\xe9')
    %(u)s'http://example.org/#Andr\\xe9'
    """
    # Caller must have removed any fragment from the base first (see
    # splitFrag); a base with '#' would make resolution ambiguous.
    assert(here.find("#") < 0), "Base may not contain hash: '%s'" % here  # caller must splitFrag (why?)

    slashl = there.find('/')
    colonl = there.find(':')

    # join(base, 'foo:/') -- 'there' has a scheme, so it is already absolute.
    if colonl >= 0 and (slashl < 0 or colonl < slashl):
        return there

    bcolonl = here.find(':')
    assert(bcolonl >= 0), "Base uri '%s' is not absolute" % here  # else it's not absolute

    path, frag = splitFragP(there)
    if not path:
        # 'there' is only a fragment (or empty): attach it to the base.
        return here + frag

    # join('mid:foo@example', '../foo') bzzt -- base has no hierarchical part.
    if here[bcolonl + 1:bcolonl + 2] != '/':
        raise ValueError("Base <%s> has no slash after colon - with relative '%s'." % (here, there))

    if here[bcolonl + 1:bcolonl + 3] == '//':
        # Base has an authority (host); hierarchical path starts after it.
        bpath = here.find('/', bcolonl + 3)
    else:
        bpath = bcolonl + 1

    # join('http://xyz', 'foo') -- authority but no path: treat path as "/".
    if bpath < 0:
        bpath = len(here)
        here = here + '/'

    # join('http://xyz/', '//abc') => 'http://abc' -- network-path reference.
    if there[:2] == '//':
        return here[:bcolonl + 1] + there

    # join('http://xyz/', '/abc') => 'http://xyz/abc' -- absolute-path reference.
    if there[:1] == '/':
        return here[:bpath] + there

    slashr = here.rfind('/')

    # Strip './' and resolve '../' segments against the base path.
    while 1:
        if path[:2] == './':
            path = path[2:]
        if path == '.':
            path = ''
        elif path[:3] == '../' or path == '..':
            path = path[3:]
            i = here.rfind('/', bpath, slashr)
            if i >= 0:
                here = here[:i + 1]
                slashr = i
        else:
            break

    return here[:slashr + 1] + path + frag
# Matches a base of the form scheme:(//host)?/single-segment -- used by
# refTo to decide when a root-relative reference ('/r') is appropriate.
commonHost = re.compile(r'^[-_a-zA-Z0-9.]+:(//[^/]*)?/[^/]*$')


def refTo(base, uri):
    """figure out a relative URI reference from base to uri

    >>> refTo('http://example/x/y/z', 'http://example/x/abc')
    '../abc'

    >>> refTo('file:/ex/x/y', 'file:/ex/x/q/r#s')
    'q/r#s'

    >>> refTo(None, 'http://ex/x/y')
    'http://ex/x/y'

    >>> refTo('http://ex/x/y', 'http://ex/x/y')
    ''

    Note the relationship between refTo and join:
    join(x, refTo(x, y)) == y
    which points out certain strings which cannot be URIs. e.g.
    >>> x='http://ex/x/y';y='http://ex/x/q:r';join(x, refTo(x, y)) == y
    0

    So 'http://ex/x/q:r' is not a URI. Use 'http://ex/x/q%3ar' instead:
    >>> x='http://ex/x/y';y='http://ex/x/q%3ar';join(x, refTo(x, y)) == y
    1

    This one checks that it uses a root-realtive one where that is
    all they share. Now uses root-relative where no path is shared.
    This is a matter of taste but tends to give more resilience IMHO
    -- and shorter paths

    Note that base may be None, meaning no base. In some situations, there
    just ain't a base. Slife. In these cases, relTo returns the absolute value.
    The axiom abs(,rel(b,x))=x still holds.
    This saves people having to set the base to "bogus:".

    >>> refTo('http://ex/x/y/z', 'http://ex/r')
    '/r'
    """
    # assert base # don't mask bugs -danc # not a bug. -tim
    if not base:
        return uri
    if base == uri:
        return ""

    # Find how many characters (hence path segments) are in common.
    i = 0
    while i < len(uri) and i < len(base):
        if uri[i] == base[i]:
            i = i + 1
        else:
            break
    # i points to end of shortest one or first difference

    # If all they share is scheme://host/first-segment, prefer a
    # root-relative reference.
    m = commonHost.match(base[:i])
    if m:
        k = uri.find("//")
        if k < 0:
            k = -2  # no host
        l = uri.find("/", k + 2)
        if uri[l + 1:l + 2] != "/" and base[l + 1:l + 2] != "/" and uri[:l] == base[:l]:
            return uri[l:]

    if uri[i:i + 1] == "#" and len(base) == i:
        return uri[i:]  # fragment of base

    while i > 0 and uri[i - 1] != '/':
        i = i - 1  # scan back to the last shared slash

    if i < 3:
        return uri  # No way.
    if base.find("//", i - 2) > 0 or uri.find("//", i - 2) > 0:
        return uri  # An unshared "//"
    if base.find(":", i) > 0:
        return uri  # An unshared ":" -- result would not parse as relative
    n = base.count("/", i)  # levels to climb with "../"
    if n == 0 and i < len(uri) and uri[i] == '#':
        return "./" + uri[i:]
    elif n == 0 and i == len(uri):
        return "./"
    else:
        return ("../" * n) + uri[i:]
def base():
    """The base URI for this process - the Web equiv of cwd

    Relative or abolute unix-standard filenames parsed relative to
    this yeild the URI of the file.
    If we had a reliable way of getting a computer name,
    we should put it in the hostname just to prevent ambiguity
    """
    # return "file://" + hostname + os.getcwd() + "/"
    # _fixslash normalizes Windows separators and strips a drive letter.
    return "file://" + _fixslash(os.getcwd()) + "/"
- def _fixslash(argstr):
- """ Fix windowslike filename to unixlike - (#ifdef WINDOWS)"""
- s = argstr
- for i in range(len(s)):
- if s[i] == "\\":
- s = s[:i] + "/" + s[i + 1:]
- if s[0] != "/" and s[1] == ":":
- s = s[2:] # @@@ Hack when drive letter present
- return s
# RFC 3986 sect. 2.3:  unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
# (fixed: the alphabet previously read "...IJJL..." -- 'K' was missing and
# 'J' doubled, so an escaped %4B was never de-escaped by canonical()).
URI_unreserved = b("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~")
@py3compat.format_doctest_out
def canonical(str_in):
    """Convert equivalent URIs (or parts) to the same string

    There are many differenet levels of URI canonicalization
    which are possible. See http://www.ietf.org/rfc/rfc3986.txt
    Done:
    - Converfting unicode IRI to utf-8
    - Escaping all non-ASCII
    - De-escaping, if escaped, ALPHA (%%41-%%5A and %%61-%%7A), DIGIT (%%30-%%39),
    hyphen (%%2D), period (%%2E), underscore (%%5F), or tilde (%%7E) (Sect 2.4)
    - Making all escapes uppercase hexadecimal

    Not done:
    - Making URI scheme lowercase
    - changing /./ or /foo/../ to / with care not to change host part

    >>> canonical("foo bar")
    %(b)s'foo%%20bar'

    >>> canonical(%(u)s'http:')
    %(b)s'http:'

    >>> canonical('fran%%c3%%83%%c2%%a7ois')
    %(b)s'fran%%C3%%83%%C2%%A7ois'

    >>> canonical('a')
    %(b)s'a'

    >>> canonical('%%4e')
    %(b)s'N'

    >>> canonical('%%9d')
    %(b)s'%%9D'

    >>> canonical('%%2f')
    %(b)s'%%2F'

    >>> canonical('%%2F')
    %(b)s'%%2F'
    """
    # Work on a UTF-8 byte string throughout; unicode input is encoded first.
    if type(str_in) == type(u''):
        s8 = str_in.encode('utf-8')
    else:
        s8 = str_in
    s = b('')
    i = 0
    while i < len(s8):
        # Indexing bytes yields an int on py3 but a 1-char str on py2,
        # hence the two branches to get both the byte value and the char.
        if py3compat.PY3:
            n = s8[i]
            ch = bytes([n])
        else:
            ch = s8[i]
            n = ord(ch)
        if (n > 126) or (n < 33):   # %-encode controls, SP, DEL, and utf-8
            s += b("%%%02X" % ord(ch))
        elif ch == b('%') and i + 2 < len(s8):
            # Existing escape: de-escape unreserved chars, else re-emit
            # the escape in uppercase hex.
            ch2 = a2b_hex(s8[i + 1:i + 3])
            if ch2 in URI_unreserved:
                s += ch2
            else:
                s += b("%%%02X" % ord(ch2))
            i = i + 3
            continue
        else:
            s += ch
        i = i + 1
    return s
# Indices into the (context, predicate, subject, object) quadruple that
# represents a statement throughout this parser.
CONTEXT = 0
PRED = 1
SUBJ = 2
OBJ = 3

PARTS = PRED, SUBJ, OBJ
ALL4 = CONTEXT, PRED, SUBJ, OBJ

# Term-type tags used in (type, value) pair representations of terms.
SYMBOL = 0
FORMULA = 1
LITERAL = 2
LITERAL_DT = 21
LITERAL_LANG = 22
ANONYMOUS = 3
XMLLITERAL = 25

# Well-known namespace URIs.
Logic_NS = "http://www.w3.org/2000/10/swap/log#"
NODE_MERGE_URI = Logic_NS + "is"  # Pseudo-property indicating node merging

forSomeSym = Logic_NS + "forSome"
forAllSym = Logic_NS + "forAll"

RDF_type_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
RDF_NS_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
OWL_NS = "http://www.w3.org/2002/07/owl#"
DAML_sameAs_URI = OWL_NS + "sameAs"
parsesTo_URI = Logic_NS + "parsesTo"
RDF_spec = "http://www.w3.org/TR/REC-rdf-syntax/"

List_NS = RDF_NS_URI  # From 20030808
_Old_Logic_NS = "http://www.w3.org/2000/10/swap/log.n3#"

# (type, uri) pairs for the RDF collection vocabulary.
N3_first = (SYMBOL, List_NS + "first")
N3_rest = (SYMBOL, List_NS + "rest")
N3_li = (SYMBOL, List_NS + "li")
N3_nil = (SYMBOL, List_NS + "nil")
N3_List = (SYMBOL, List_NS + "List")
N3_Empty = (SYMBOL, List_NS + "Empty")
# Lazily-created namespace URI for run-local objects; see runNamespace().
runNamespaceValue = None


def runNamespace():
    "Return a URI suitable as a namespace for run-local objects"
    # @@@ include hostname (privacy?) (hash it?)
    global runNamespaceValue
    if runNamespaceValue is None:  # idiom fix: identity test, not ==
        runNamespaceValue = join(base(), _unique_id()) + '#'
    return runNamespaceValue
# Monotonic counter backing uniqueURI().
nextu = 0


def uniqueURI():
    "A unique URI"
    global nextu
    nextu = nextu + 1
    return "%su_%s" % (runNamespace(), nextu)
class URISyntaxError(ValueError):
    """Raised when a routine that requires a URI reference is handed
    a malformed parameter."""
    pass
tracking = False   # was diag.tracking: record provenance for each triple
chatty_flag = 50   # was diag.chatty_flag: verbosity threshold

from xml.dom import Node

try:
    from xml.ns import XMLNS
except:
    # Fallback namespace constants when xml.ns (PyXML) is unavailable.
    class XMLNS:
        BASE = "http://www.w3.org/2000/xmlns/"
        XML = "http://www.w3.org/XML/1998/namespace"

# Small DOM helpers used by the C14N implementation below.
_attrs = lambda E: (E.attributes and E.attributes.values()) or []
_children = lambda E: E.childNodes or []
_IN_XML_NS = lambda n: n.namespaceURI == XMLNS.XML
_inclusive = lambda n: n.unsuppressedPrefixes == None

# Does a document/PI has lesser/greater document order than the
# first element?
_LesserElement, _Element, _GreaterElement = range(3)
def _sorter(n1, n2):
    '''_sorter(n1, n2) -> int
    Sorting predicate for non-NS attributes.'''
    # NOTE: uses the Python 2 cmp()/comparator sort protocol; orders by
    # namespace URI first, then local name.
    i = cmp(n1.namespaceURI, n2.namespaceURI)
    if i:
        return i
    return cmp(n1.localName, n2.localName)
def _sorter_ns(n1, n2):
    '''_sorter_ns((n,v),(n,v)) -> int
    "(an empty namespace URI is lexicographically least)."'''
    # NOTE: Python 2 comparator; the bare 'xmlns' (default namespace)
    # declaration always sorts first.
    if n1[0] == 'xmlns':
        return -1
    if n2[0] == 'xmlns':
        return 1
    return cmp(n1[0], n2[0])
- def _utilized(n, node, other_attrs, unsuppressedPrefixes):
- '''_utilized(n, node, other_attrs, unsuppressedPrefixes) -> boolean
- Return true if that nodespace is utilized within the node'''
- if n.startswith('xmlns:'):
- n = n[6:]
- elif n.startswith('xmlns'):
- n = n[5:]
- if (n == "" and node.prefix in ["#default", None]) or \
- n == node.prefix or n in unsuppressedPrefixes:
- return 1
- for attr in other_attrs:
- if n == attr.prefix:
- return 1
- return 0
# Membership test for XPath-subset canonicalization; a subset of None
# means "no subset restriction" (every node is in).
# _in_subset = lambda subset, node: not subset or node in subset
_in_subset = lambda subset, node: subset is None or node in subset  # rich's tweak
- class _implementation:
- '''Implementation class for C14N. This accompanies a node during it's
- processing and includes the parameters and processing state.'''
- # Handler for each node type; populated during module instantiation.
- handlers = {}
- def __init__(self, node, write, **kw):
- '''Create and run the implementation.'''
- self.write = write
- self.subset = kw.get('subset')
- self.comments = kw.get('comments', 0)
- self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes')
- nsdict = kw.get('nsdict', {'xml': XMLNS.XML, 'xmlns': XMLNS.BASE})
- # Processing state.
- self.state = (nsdict, {'xml': ''}, {}) # 0422
- if node.nodeType == Node.DOCUMENT_NODE:
- self._do_document(node)
- elif node.nodeType == Node.ELEMENT_NODE:
- self.documentOrder = _Element # At document element
- if not _inclusive(self):
- self._do_element(node)
- else:
- inherited = self._inherit_context(node)
- self._do_element(node, inherited)
- elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
- pass
- elif node.nodeType == Node.TEXT_NODE:
- self._do_text(node)
- else:
- raise TypeError(str(node))
- def _inherit_context(self, node):
- '''_inherit_context(self, node) -> list
- Scan ancestors of attribute and namespace context. Used only
- for single element node canonicalization, not for subset
- canonicalization.'''
- # Collect the initial list of xml:foo attributes.
- xmlattrs = filter(_IN_XML_NS, _attrs(node))
- # Walk up and get all xml:XXX attributes we inherit.
- inherited, parent = [], node.parentNode
- while parent and parent.nodeType == Node.ELEMENT_NODE:
- for a in filter(_IN_XML_NS, _attrs(parent)):
- n = a.localName
- if n not in xmlattrs:
- xmlattrs.append(n)
- inherited.append(a)
- parent = parent.parentNode
- return inherited
- def _do_document(self, node):
- '''_do_document(self, node) -> None
- Process a document node. documentOrder holds whether the document
- element has been encountered such that PIs/comments can be written
- as specified.'''
- self.documentOrder = _LesserElement
- for child in node.childNodes:
- if child.nodeType == Node.ELEMENT_NODE:
- self.documentOrder = _Element # At document element
- self._do_element(child)
- self.documentOrder = _GreaterElement # After document element
- elif child.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
- self._do_pi(child)
- elif child.nodeType == Node.COMMENT_NODE:
- self._do_comment(child)
- elif child.nodeType == Node.DOCUMENT_TYPE_NODE:
- pass
- else:
- raise TypeError(str(child))
- handlers[Node.DOCUMENT_NODE] = _do_document
- def _do_text(self, node):
- '''_do_text(self, node) -> None
- Process a text or CDATA node. Render various special characters
- as their C14N entity representations.'''
- if not _in_subset(self.subset, node):
- return
- s = node.data.replace("&", "&")
- s = s.replace("<", "<")
- s = s.replace(">", ">")
- s = s.replace("\015", "
")
- if s:
- self.write(s)
- handlers[Node.TEXT_NODE] = _do_text
- handlers[Node.CDATA_SECTION_NODE] = _do_text
- def _do_pi(self, node):
- '''_do_pi(self, node) -> None
- Process a PI node. Render a leading or trailing # xA if the
- document order of the PI is greater or lesser (respectively)
- than the document element.
- '''
- if not _in_subset(self.subset, node):
- return
- W = self.write
- if self.documentOrder == _GreaterElement:
- W('\n')
- W('<?')
- W(node.nodeName)
- s = node.data
- if s:
- W(' ')
- W(s)
- W('?>')
- if self.documentOrder == _LesserElement:
- W('\n')
- handlers[Node.PROCESSING_INSTRUCTION_NODE] = _do_pi
- def _do_comment(self, node):
- '''_do_comment(self, node) -> None
- Process a comment node. Render a leading or trailing # xA if the
- document order of the comment is greater or lesser (respectively)
- than the document element.
- '''
- if not _in_subset(self.subset, node):
- return
- if self.comments:
- W = self.write
- if self.documentOrder == _GreaterElement:
- W('\n')
- W('<!--')
- W(node.data)
- W('-->')
- if self.documentOrder == _LesserElement:
- W('\n')
- handlers[Node.COMMENT_NODE] = _do_comment
- def _do_attr(self, n, value):
- ''''_do_attr(self, node) -> None
- Process an attribute.'''
- W = self.write
- W(' ')
- W(n)
- W('="')
- s = value.replace(value, "&", "&")
- s = s.replace("<", "<")
- s = s.replace('"', '"')
- s = s.replace('\011', '	')
- s = s.replace('\012', '
')
- s = s.replace('\015', '
')
- W(s)
- W('"')
- def _do_element(self, node, initial_other_attrs=[]):
- '''_do_element(self, node, initial_other_attrs = []) -> None
- Process an element (and its children).'''
- # Get state (from the stack) make local copies.
- # ns_parent -- NS declarations in parent
- # ns_rendered -- NS nodes rendered by ancestors
- # ns_local -- NS declarations relevant to this element
- # xml_attrs -- Attributes in XML namespace from parent
- # xml_attrs_local -- Local attributes in XML namespace.
- ns_parent, ns_rendered, xml_attrs = \
- self.state[0], self.state[1].copy(), self.state[2].copy() # 0422
- ns_local = ns_parent.copy()
- xml_attrs_local = {}
- # progress("_do_element node.nodeName=", node.nodeName)
- # progress("_do_element node.namespaceURI", node.namespaceURI)
- # progress("_do_element node.tocml()", node.toxml())
- # Divide attributes into NS, XML, and others.
- other_attrs = initial_other_attrs[:]
- in_subset = _in_subset(self.subset, node)
- for a in _attrs(node):
- # progress("\t_do_element a.nodeName=", a.nodeName)
- if a.namespaceURI == XMLNS.BASE:
- n = a.nodeName
- if n == "xmlns:":
- n = "xmlns" # DOM bug workaround
- ns_local[n] = a.nodeValue
- elif a.namespaceURI == XMLNS.XML:
- if _inclusive(self) or in_subset:
- xml_attrs_local[a.nodeName] = a # 0426
- else:
- other_attrs.append(a)
- # add local xml:foo attributes to ancestor's xml:foo attributes
- xml_attrs.update(xml_attrs_local)
- # Render the node
- W, name = self.write, None
- if in_subset:
- name = node.nodeName
- W('<')
- W(name)
- # Create list of NS attributes to render.
- ns_to_render = []
- for n, v in ns_local.items():
- # If default namespace is XMLNS.BASE or empty,
- # and if an ancestor was the same
- if n == "xmlns" and v in [XMLNS.BASE, ''] \
- and ns_rendered.get('xmlns') in [XMLNS.BASE, '', None]:
- continue
- # "omit namespace node with local name xml, which defines
- # the xml prefix, if its string value is
- # http://www.w3.org/XML/1998/namespace."
- if n in ["xmlns:xml", "xml"] \
- and v in ['http://www.w3.org/XML/1998/namespace']:
- continue
- # If not previously rendered
- # and it's inclusive or utilized
- if (n, v) not in ns_rendered.items() \
- and (_inclusive(self) or \
- _utilized(n, node, other_attrs, self.unsuppressedPrefixes)):
- ns_to_render.append((n, v))
- # Sort and render the ns, marking what was rendered.
- ns_to_render.sort(_sorter_ns)
- for n, v in ns_to_render:
- self._do_attr(n, v)
- ns_rendered[n] = v # 0417
- # If exclusive or the parent is in the subset, add the local xml attributes
- # Else, add all local and ancestor xml attributes
- # Sort and render the attributes.
- if not _inclusive(self) or _in_subset(self.subset, node.parentNode): # 0426
- other_attrs.extend(xml_attrs_local.values())
- else:
- other_attrs.extend(xml_attrs.values())
- other_attrs.sort(_sorter)
- for a in other_attrs:
- self._do_attr(a.nodeName, a.value)
- W('>')
- # Push state, recurse, pop state.
- state, self.state = self.state, (ns_local, ns_rendered, xml_attrs)
- for c in _children(node):
- _implementation.handlers[c.nodeType](self, c)
- self.state = state
- if name:
- W('</%s>' % name)
- handlers[Node.ELEMENT_NODE] = _do_element
def Canonicalize(node, output=None, **kw):
    '''Canonicalize(node, output=None, **kw) -> UTF-8

    Canonicalize a DOM document/element node and all descendents.
    Return the text; if output is specified then output.write will
    be called to output the text and None will be returned
    Keyword parameters:
    nsdict -- a dictionary of prefix:uri namespace entries
    assumed to exist in the surrounding context
    comments -- keep comments if non-zero (default is 0)
    subset -- Canonical XML subsetting resulting from XPath (default is [])
    unsuppressedPrefixes -- do exclusive C14N, and this specifies the
    prefixes that should be inherited.
    '''
    # apply() replaced by a direct call -- identical semantics, and apply()
    # no longer exists in Python 3.
    if output:
        _implementation(node, output.write, **kw)
    else:
        s = StringIO.StringIO()
        _implementation(node, s.write, **kw)
        return s.getvalue()
- # end of xmlC14n.py
- # from why import BecauseOfData, becauseSubexpression
def BecauseOfData(*args, **kargs):
    """No-op stand-in for why.BecauseOfData (provenance tracking)."""
    return None
def becauseSubexpression(*args, **kargs):
    """No-op stand-in for why.becauseSubexpression (provenance tracking)."""
    return None
N3_forSome_URI = forSomeSym
N3_forAll_URI = forAllSym

# Magic resources we know about
ADDED_HASH = "#"  # Stop where we use this in case we want to remove it!
# This is the hash on namespace URIs

RDF_type = (SYMBOL, RDF_type_URI)
DAML_sameAs = (SYMBOL, DAML_sameAs_URI)

LOG_implies_URI = "http://www.w3.org/2000/10/swap/log#implies"

# XSD datatype URIs for typed literals produced by the tokenizer.
BOOLEAN_DATATYPE = _XSD_PFX + "boolean"
DECIMAL_DATATYPE = _XSD_PFX + "decimal"
DOUBLE_DATATYPE = _XSD_PFX + "double"
FLOAT_DATATYPE = _XSD_PFX + "float"
INTEGER_DATATYPE = _XSD_PFX + "integer"

option_noregen = 0  # If set, do not regenerate genids on output

# @@ I18n - the notname chars need extending for well known unicode non-text
# characters. The XML spec switched to assuming unknown things were name
# characters.
# _namechars = string.lowercase + string.uppercase + string.digits + '_-'
_notQNameChars = "\t\r\n !\"#$%&'()*.,+/;<=>?@[\\]^`{|}~"  # else valid qname :-/
_notNameChars = _notQNameChars + ":"  # Assume anything else valid name :-/
_rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'

N3CommentCharacter = "#"  # For unix script  # ! compatibility

########################################## Parse string to sink
#
# Regular expressions:
eol = re.compile(r'[ \t]*(#[^\n]*)?\r?\n')  # end of line, poss. w/comment
eof = re.compile(r'[ \t]*(#[^\n]*)?$')  # end of file, poss. w/comment
ws = re.compile(r'[ \t]*')  # Whitespace not including NL
signed_integer = re.compile(r'[-+]?[0-9]+')  # integer
number_syntax = re.compile(r'(?P<integer>[-+]?[0-9]+)(?P<decimal>\.[0-9]+)?(?P<exponent>(?:e|E)[-+]?[0-9]+)?')
digitstring = re.compile(r'[0-9]+')  # Unsigned integer
interesting = re.compile(r'[\\\r\n\"]')  # chars needing escapes in strings
langcode = re.compile(r'[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?')  # RFC 3066-ish tag
- class SinkParser:
    def __init__(self, store, openFormula=None, thisDoc="", baseURI=None,
                 genPrefix="", flags="", why=None):
        """ note: namespace names should *not* end in # ;
        the # will get added during qname processing """

        self._bindings = {}   # prefix -> namespace URI
        self._flags = flags   # option letters, e.g. "B" for named bnodes
        if thisDoc != "":
            assert ':' in thisDoc, "Document URI not absolute: <%s>" % thisDoc
            self._bindings[""] = thisDoc + "#"  # default

        self._store = store
        if genPrefix:
            store.setGenPrefix(genPrefix)  # pass it on

        self._thisDoc = thisDoc
        self.lines = 0  # for error handling
        self.startOfLine = 0  # For calculating character number
        self._genPrefix = genPrefix
        self.keywords = ['a', 'this', 'bind', 'has', 'is', 'of', 'true', 'false']
        self.keywordsSet = 0  # Then only can others be considerd qnames
        self._anonymousNodes = {}  # Dict of anon nodes already declared ln: Term
        self._variables = {}
        self._parentVariables = {}
        self._reason = why  # Why the parser was asked to parse this
        self._reason2 = None  # Why these triples
        # was: diag.tracking
        if tracking:
            self._reason2 = BecauseOfData(
                store.newSymbol(thisDoc), because=self._reason)

        # Base for resolving relative URIs: explicit baseURI wins, then
        # the document URI, else no base.
        if baseURI:
            self._baseURI = baseURI
        else:
            if thisDoc:
                self._baseURI = thisDoc
            else:
                self._baseURI = None

        assert not self._baseURI or ':' in self._baseURI

        if not self._genPrefix:
            if self._thisDoc:
                self._genPrefix = self._thisDoc + "#_g"
            else:
                self._genPrefix = uniqueURI()

        # Parse into the supplied formula, or create a fresh one.
        if openFormula == None:
            if self._thisDoc:
                self._formula = store.newFormula(thisDoc + "#_formula")
            else:
                self._formula = store.newFormula()
        else:
            self._formula = openFormula

        self._context = self._formula
        self._parentContext = None
- def here(self, i):
- """String generated from position in file
- This is for repeatability when refering people to bnodes in a document.
- This has diagnostic uses less formally, as it should point one to which
- bnode the arbitrary identifier actually is. It gives the
- line and character number of the '[' charcacter or path character
- which introduced the blank node. The first blank node is boringly _L1C1.
- It used to be used only for tracking, but for tests in general
- it makes the canonical ordering of bnodes repeatable."""
- return "%s_L%iC%i" % (self._genPrefix, self.lines,
- i - self.startOfLine + 1)
- def formula(self):
- return self._formula
- def loadStream(self, stream):
- return self.loadBuf(stream.read()) # Not ideal
- def loadBuf(self, buf):
- """Parses a buffer and returns its top level formula"""
- self.startDoc()
- self.feed(buf)
- return self.endDoc() # self._formula
    def feed(self, octets):
        """Feed an octet stream to the parser

        if BadSyntax is raised, the string
        passed in the exception object is the
        remainder after any statements have been parsed.
        So if there is more data to feed to the
        parser, it should be straightforward to recover."""
        # NOTE: 'unicode' is the Python 2 text type; bytes input is
        # decoded as UTF-8 and a leading BOM is stripped.
        if not isinstance(octets, unicode):
            s = octets.decode('utf-8')
            # NB already decoded, so \ufeff
            if len(s) > 0 and s[0] == codecs.BOM_UTF8.decode('utf-8'):
                s = s[1:]
        else:
            s = octets

        # Consume directives/statements until skipSpace signals EOF (< 0).
        i = 0
        while i >= 0:
            j = self.skipSpace(s, i)
            if j < 0:
                return

            i = self.directiveOrStatement(s, j)
            if i < 0:
                # Diagnostic print retained from the original code.
                print("# next char: %s" % s[j])
                raise BadSyntax(self._thisDoc, self.lines, s, j,
                                "expected directive or statement")
- def directiveOrStatement(self, argstr, h):
- i = self.skipSpace(argstr, h)
- if i < 0:
- return i # EOF
- j = self.directive(argstr, i)
- if j >= 0:
- return self.checkDot(argstr, j)
- j = self.statement(argstr, i)
- if j >= 0:
- return self.checkDot(argstr, j)
- return j
- # @@I18N
- global _notNameChars
- # _namechars = string.lowercase + string.uppercase + string.digits + '_-'
    def tok(self, tok, argstr, i):
        """Check for keyword. Space must have been stripped on entry and
        we must not be at end of file.

        Returns the index just past the keyword on a match, else -1."""
        assert tok[0] not in _notNameChars  # not for punctuation
        # An '@' prefix always introduces a keyword; otherwise the bare
        # word only counts if it is in the declared keyword list.
        if argstr[i:i + 1] == "@":
            i = i + 1
        else:
            if tok not in self.keywords:
                return -1  # No, this has neither keywords declaration nor "@"

        # Keyword must be followed by a non-name character so that e.g.
        # 'hasX' is not read as the keyword 'has'.
        if (argstr[i:i + len(tok)] == tok
                and (argstr[i + len(tok)] in _notQNameChars)):
            i = i + len(tok)
            return i
        else:
            return -1
    def directive(self, argstr, i):
        """Try to parse an N3 directive (@prefix, @base, @keywords,
        @forAll, @forSome, obsolete 'bind') at position *i*.

        Returns the index past the directive, or -1 if no directive here."""
        j = self.skipSpace(argstr, i)
        if j < 0:
            return j  # eof
        res = []

        j = self.tok('bind', argstr, i)  # implied "#". Obsolete.
        if j > 0:
            raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                            "keyword bind is obsolete: use @prefix")

        j = self.tok('keywords', argstr, i)
        if j > 0:
            i = self.commaSeparatedList(argstr, j, res, self.bareWord)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "'@keywords' needs comma separated list of words")
            self.setKeywords(res[:])
            # was: diag.chatty_flag
            if chatty_flag > 80:
                progress("Keywords ", self.keywords)
            return i

        j = self.tok('forAll', argstr, i)
        if j > 0:
            i = self.commaSeparatedList(argstr, j, res, self.uri_ref2)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "Bad variable list after @forAll")
            for x in res:
                # self._context.declareUniversal(x)
                # Only introduce a new universal if not already scoped here.
                if x not in self._variables or x in self._parentVariables:
                    self._variables[x] = self._context.newUniversal(x)
            return i

        j = self.tok('forSome', argstr, i)
        if j > 0:
            i = self. commaSeparatedList(argstr, j, res, self.uri_ref2)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "Bad variable list after @forSome")
            for x in res:
                self._context.declareExistential(x)
            return i

        j = self.tok('prefix', argstr, i)  # no implied "#"
        if j >= 0:
            t = []
            i = self.qname(argstr, j, t)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "expected qname after @prefix")
            j = self.uri_ref2(argstr, i, t)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "expected <uriref> after @prefix _qname_")
            ns = self.uriOf(t[1])

            # Resolve the namespace against the base URI if we have one.
            if self._baseURI:
                ns = join(self._baseURI, ns)
            elif ":" not in ns:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "With no base URI, cannot use relative URI in @prefix <" + ns + ">")
            assert ':' in ns  # must be absolute
            self._bindings[t[0][0]] = ns
            self.bind(t[0][0], hexify(ns))
            return j

        j = self.tok('base', argstr, i)  # Added 2007/7/7
        if j >= 0:
            t = []
            i = self.uri_ref2(argstr, j, t)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "expected <uri> after @base ")
            ns = self.uriOf(t[0])

            if self._baseURI:
                ns = join(self._baseURI, ns)
            else:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "With no previous base URI, cannot use relative URI in @base <" + ns + ">")
            assert ':' in ns  # must be absolute
            self._baseURI = ns
            return i

        return -1  # Not a directive, could be something else.
    def bind(self, qn, uri):
        """Register prefix *qn* -> *uri* with the store; an empty prefix
        sets the default namespace."""
        # NOTE: types.StringType is the Python 2 byte-string type; the
        # URI must already be %-encoded ASCII (see hexify).
        assert isinstance(uri,
                          types.StringType), "Any unicode must be %x-encoded already"
        if qn == "":
            self._store.setDefaultNamespace(uri)
        else:
            self._store.bind(qn, uri)
- def setKeywords(self, k):
- "Takes a list of strings"
- if k == None:
- self.keywordsSet = 0
- else:
- self.keywords = k
- self.keywordsSet = 1
- def startDoc(self):
- # was: self._store.startDoc()
- self._store.startDoc(self._formula)
- def endDoc(self):
- """Signal end of document and stop parsing. returns formula"""
- self._store.endDoc(self._formula) # don't canonicalize yet
- return self._formula
- def makeStatement(self, quadruple):
- # $$$$$$$$$$$$$$$$$$$$$
- # print "# Parser output: ", `quadruple`
- self._store.makeStatement(quadruple, why=self._reason2)
- def statement(self, argstr, i):
- r = []
- i = self.object(argstr, i, r) # Allow literal for subject - extends RDF
- if i < 0:
- return i
- j = self.property_list(argstr, i, r[0])
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines,
- argstr, i, "expected propertylist")
- return j
- def subject(self, argstr, i, res):
- return self.item(argstr, i, res)
    def verb(self, argstr, i, res):
        """ has _prop_
        is _prop_ of
        a
        =
        _prop_
        >- prop ->
        <- prop -<
        _operator_

        Appends a single ('->' | '<-', property) pair to *res*; the
        direction tag says whether subject/object are swapped.  Returns
        the index past the verb, or -1 if no verb is found."""
        j = self.skipSpace(argstr, i)
        if j < 0:
            return j  # eof

        r = []

        j = self.tok('has', argstr, i)
        if j >= 0:
            i = self.prop(argstr, j, r)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines,
                                argstr, j, "expected property after 'has'")
            res.append(('->', r[0]))
            return i

        j = self.tok('is', argstr, i)
        if j >= 0:
            i = self.prop(argstr, j, r)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                                "expected <property> after 'is'")
            j = self.skipSpace(argstr, i)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "End of file found, expected property after 'is'")
                return j  # eof -- NOTE: unreachable after raise; kept from original
            i = j
            j = self.tok('of', argstr, i)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "expected 'of' after 'is' <prop>")
            res.append(('<-', r[0]))  # inverse direction
            return j

        j = self.tok('a', argstr, i)
        if j >= 0:
            res.append(('->', RDF_type))
            return j

        if argstr[i:i + 2] == "<=":
            res.append(('<-', self._store.newSymbol(Logic_NS + "implies")))
            return i + 2

        if argstr[i:i + 1] == "=":
            if argstr[i + 1:i + 2] == ">":
                res.append(('->', self._store.newSymbol(Logic_NS + "implies")))
                return i + 2
            res.append(('->', DAML_sameAs))
            return i + 1

        if argstr[i:i + 2] == ":=":
            # patch file relates two formulae, uses this @@ really?
            res.append(('->', Logic_NS + "becomes"))
            return i + 2

        j = self.prop(argstr, i, r)
        if j >= 0:
            res.append(('->', r[0]))
            return j

        if argstr[i:i + 2] == ">-" or argstr[i:i + 2] == "<-":
            raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                            ">- ... -> syntax is obsolete.")

        return -1
def prop(self, argstr, i, res):
    """Parse a property: delegates to item() (i.e. any path is allowed).

    Appends the parsed term to res; returns the new parse position,
    or a negative value on failure/EOF.
    """
    return self.item(argstr, i, res)
def item(self, argstr, i, res):
    """Parse an item: delegates to path().

    Appends the parsed term to res; returns the new parse position,
    or a negative value on failure/EOF.
    """
    return self.path(argstr, i, res)
def blankNode(self, uri=None):
    """Create a blank node in the current context.

    With the "B" parser flag set, blank nodes are modelled instead as
    named symbols declared existential in the context.
    """
    if "B" in self._flags:
        sym = self._context.newSymbol(uri)
        self._context.declareExistential(sym)
        return sym
    return self._context.newBlankNode(uri, why=self._reason2)
- def path(self, argstr, i, res):
- """Parse the path production.
- """
- j = self.nodeOrLiteral(argstr, i, res)
- if j < 0:
- return j # nope
- while argstr[j:j + 1] in "!^.": # no spaces, must follow exactly (?)
- ch = argstr[j:j + 1] # @@ Allow "." followed IMMEDIATELY by a node.
- if ch == ".":
- ahead = argstr[j + 1:j + 2]
- if not ahead or (ahead in _notNameChars
- and ahead not in ":?<[{("):
- break
- subj = res.pop()
- obj = self.blankNode(uri=self.here(j))
- j = self.node(argstr, j + 1, res)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "EOF found in middle of path syntax")
- pred = res.pop()
- if ch == "^": # Reverse traverse
- self.makeStatement((self._context, pred, obj, subj))
- else:
- self.makeStatement((self._context, pred, subj, obj))
- res.append(obj)
- return j
def anonymousNode(self, ln):
    """Remember or generate a term for one of these _: anonymous nodes.

    ln -- the label following "_:" in the source; the same label always
    maps to the same blank node within this document.
    """
    # was `term != None` with a redundant .get(ln, None) default;
    # identity comparison is the correct idiom (PEP 8)
    term = self._anonymousNodes.get(ln)
    if term is not None:
        return term
    term = self._store.newBlankNode(self._context, why=self._reason2)
    self._anonymousNodes[ln] = term
    return term
def node(self, argstr, i, res, subjectAlready=None):
    """Parse the <node> production.

    Handles [ ... ] anonymous nodes with property lists, { ... }
    quoted formulae, {$ ... $} and ($ ... $) sets, ( ... ) lists,
    the booleans true/false, and finally named nodes (URIs/qnames).
    The parsed term is appended to res.

    Space is now skipped once at the beginning
    instead of in multiple calls to self.skipSpace().

    Returns the new parse position, or a negative value on EOF /
    no match; raises BadSyntax on malformed input.
    """
    subj = subjectAlready
    j = self.skipSpace(argstr, i)
    if j < 0:
        return j  # eof
    i = j
    ch = argstr[i:i + 1]  # Quick 1-character checks first:
    if ch == "[":
        # [ ... ]  anonymous node carrying a property list
        bnodeID = self.here(i)
        j = self.skipSpace(argstr, i + 1)
        if j < 0:
            raise BadSyntax(self._thisDoc,
                            self.lines, argstr, i, "EOF after '['")
        if argstr[j:j + 1] == "=":  # Hack for "is" binding name to anon node
            i = j + 1
            objs = []
            j = self.objectList(argstr, i, objs)
            if j >= 0:
                subj = objs[0]
                if len(objs) > 1:
                    # [= a, b, ...]: every listed object asserted sameAs
                    for obj in objs:
                        self.makeStatement((self._context,
                                            DAML_sameAs, subj, obj))
                j = self.skipSpace(argstr, j)
                if j < 0:
                    raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                    "EOF when objectList expected after [ = ")
                if argstr[j:j + 1] == ";":
                    j = j + 1
            else:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "objectList expected after [= ")
        if subj is None:
            subj = self.blankNode(uri=bnodeID)
        i = self.property_list(argstr, j, subj)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                            "property_list expected")
        j = self.skipSpace(argstr, i)
        if j < 0:
            raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                            "EOF when ']' expected after [ <propertyList>")
        if argstr[j:j + 1] != "]":
            raise BadSyntax(self._thisDoc,
                            self.lines, argstr, j, "']' expected")
        res.append(subj)
        return j + 1
    if ch == "{":
        ch2 = argstr[i + 1:i + 2]
        if ch2 == '$':
            # {$ ... $}  explicit set syntax, comma-separated items
            i += 1
            j = i + 1
            List = []
            first_run = True
            while 1:
                i = self.skipSpace(argstr, j)
                if i < 0:
                    raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                    "needed '$}', found end.")
                if argstr[i:i + 2] == '$}':
                    j = i + 2
                    break
                if not first_run:
                    # every element after the first needs a preceding comma
                    if argstr[i:i + 1] == ',':
                        i += 1
                    else:
                        raise BadSyntax(self._thisDoc, self.lines,
                                        argstr, i, "expected: ','")
                else:
                    first_run = False
                item = []
                j = self.item(argstr, i, item)  # @@@@@ should be path, was object
                if j < 0:
                    raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                    "expected item in set or '$}'")
                List.append(self._store.intern(item[0]))
            res.append(self._store.newSet(List, self._context))
            return j
        else:
            # { ... }  quoted formula: statements parsed into a fresh
            # child context, with the parser state saved and restored.
            j = i + 1
            oldParentContext = self._parentContext
            self._parentContext = self._context
            parentAnonymousNodes = self._anonymousNodes
            grandParentVariables = self._parentVariables
            self._parentVariables = self._variables
            self._anonymousNodes = {}
            self._variables = self._variables.copy()
            reason2 = self._reason2
            self._reason2 = becauseSubexpression
            if subj is None:
                subj = self._store.newFormula()
            self._context = subj
            while 1:
                i = self.skipSpace(argstr, j)
                if i < 0:
                    raise BadSyntax(self._thisDoc, self.lines,
                                    argstr, i, "needed '}', found end.")
                if argstr[i:i + 1] == "}":
                    j = i + 1
                    break
                j = self.directiveOrStatement(argstr, i)
                if j < 0:
                    raise BadSyntax(self._thisDoc, self.lines,
                                    argstr, i, "expected statement or '}'")
            # restore the parser state saved above (reverse order)
            self._anonymousNodes = parentAnonymousNodes
            self._variables = self._parentVariables
            self._parentVariables = grandParentVariables
            self._context = self._parentContext
            self._reason2 = reason2
            self._parentContext = oldParentContext
            res.append(subj.close())  # No use until closed
            return j
    if ch == "(":
        # ( ... ) list, or ($ ... $) set
        thing_type = self._store.newList
        ch2 = argstr[i + 1:i + 2]
        if ch2 == '$':
            thing_type = self._store.newSet
            i += 1
        j = i + 1
        List = []
        while 1:
            i = self.skipSpace(argstr, j)
            if i < 0:
                raise BadSyntax(self._thisDoc, self.lines,
                                argstr, i, "needed ')', found end.")
            if argstr[i:i + 1] == ')':
                j = i + 1
                break
            item = []
            j = self.item(argstr, i, item)  # @@@@@ should be path, was object
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "expected item in list or ')'")
            List.append(self._store.intern(item[0]))
        res.append(thing_type(List, self._context))
        return j
    j = self.tok('this', argstr, i)  # This context
    if j >= 0:
        raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                        "Keyword 'this' was ancient N3. Now use @forSome and @forAll keywords.")
        # NOTE(review): the two lines below are unreachable after the
        # raise -- left over from when 'this' was still supported.
        res.append(self._context)
        return j
    # booleans
    j = self.tok('true', argstr, i)
    if j >= 0:
        res.append(True)
        return j
    j = self.tok('false', argstr, i)
    if j >= 0:
        res.append(False)
        return j
    if subj is None:  # If this can be a named node, then check for a name.
        j = self.uri_ref2(argstr, i, res)
        if j >= 0:
            return j
    return -1
def property_list(self, argstr, i, subj):
    """Parse a property list (verb + objects, ';'-separated) for subj.

    Also consumes the ":- <node>" production, which re-parses a
    {} / () / [] node with subj as the already-known subject.

    Leaves the terminating punctuation in the buffer.  Returns the
    new parse position; raises BadSyntax on malformed input.
    """
    while 1:
        j = self.skipSpace(argstr, i)
        if j < 0:
            # was followed by an unreachable `return j` -- removed (dead code)
            raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                            "EOF found when expected verb in property list")
        if argstr[j:j + 2] == ":-":
            i = j + 2
            res = []
            j = self.node(argstr, i, res, subj)
            if j < 0:
                raise BadSyntax(self._thisDoc, self.lines, argstr, i,
                                "bad {} or () or [] node after :- ")
            i = j
            continue
        i = j
        v = []
        j = self.verb(argstr, i, v)
        if j <= 0:
            return i  # void but valid
        objs = []
        i = self.objectList(argstr, j, objs)
        if i < 0:
            raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                            "objectList expected")
        # verb() appended exactly one (direction, predicate) pair;
        # unpack it once rather than per object (loop-invariant).
        dira, sym = v[0]
        for obj in objs:
            if dira == '->':
                self.makeStatement((self._context, sym, subj, obj))
            else:
                # inverted ("is ... of") predicate: swap subject/object
                self.makeStatement((self._context, sym, obj, subj))
        j = self.skipSpace(argstr, i)
        if j < 0:
            # was followed by an unreachable `return j` -- removed (dead code)
            raise BadSyntax(self._thisDoc, self.lines, argstr, j,
                            "EOF found in list of objects")
        if argstr[i:i + 1] != ";":
            return i
        i = i + 1  # skip semicolon and continue
- def commaSeparatedList(self, argstr, j, res, what):
- """return value: -1 bad syntax; >1 new position in argstr
- res has things found appended
- """
- i = self.skipSpace(argstr, j)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "EOF found expecting comma sep list")
- return i
- if argstr[i] == ".":
- return j # empty list is OK
- i = what(argstr, i, res)
- if i < 0:
- return -1
- while 1:
- j = self.skipSpace(argstr, i)
- if j < 0:
- return j # eof
- ch = argstr[j:j + 1]
- if ch != ",":
- …
Large files files are truncated, but you can click here to view the full file