/SmartObjectFramework/src/rdflib/plugins/parsers/notation3.py
Python | 2427 lines | 2111 code | 176 blank | 140 comment | 224 complexity | 825ecb377cb8c00574f56dbdcc5a06e0 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-3.0
- #!/usr/bin/env python
- u"""
- notation3.py - Standalone Notation3 Parser
- Derived from CWM, the Closed World Machine
- Authors of the original suite:
- * Dan Connolly <@@>
- * Tim Berners-Lee <@@>
- * Yosi Scharf <@@>
- * Joseph M. Reagle Jr. <reagle@w3.org>
- * Rich Salz <rsalz@zolera.com>
- http://www.w3.org/2000/10/swap/notation3.py
- Copyright 2000-2007, World Wide Web Consortium.
- Copyright 2001, MIT.
- Copyright 2001, Zolera Systems Inc.
- License: W3C Software License
- http://www.w3.org/Consortium/Legal/copyright-software
- Modified by Sean B. Palmer
- Copyright 2007, Sean B. Palmer. \u32E1
- Modified to work with rdflib by Gunnar Aastrand Grimnes
- Copyright 2010, Gunnar A. Grimnes
- """
- # Python standard libraries
- import types
- import sys
- import os
- import re
- import StringIO
- import codecs
- from binascii import a2b_hex
- from decimal import Decimal
- from rdflib.term import URIRef, BNode, Literal, Variable, _XSD_PFX, _unique_id
- from rdflib.graph import QuotedGraph, ConjunctiveGraph
- from rdflib import py3compat
- b = py3compat.b
- __all__ = [
- 'URISyntaxError', 'BadSyntax', 'N3Parser', "verbosity", "setVerbosity",
- "progress", "splitFrag", "splitFragP", "join", "refTo", "base",
- "canonical", "runNamespace", "uniqueURI", "Canonicalize", "stripCR",
- "dummyWrite", "toBool", "stringToN3", "backslashUify", "hexify"]
- from rdflib.parser import Parser
- # Incestuous.. would be nice to separate N3 and XML
- # from sax2rdf import XMLtoDOM
- def XMLtoDOM(*args, **kargs):
- # print >> sys.stderr, args, kargs
- pass
- # SWAP http://www.w3.org/2000/10/swap
- # from diag import verbosity, setVerbosity, progress
- def verbosity(*args, **kargs):
- # print >> sys.stderr, args, kargs
- pass
- def setVerbosity(*args, **kargs):
- # print >> sys.stderr, args, kargs
- pass
- def progress(*args, **kargs):
- # print >> sys.stderr, args, kargs
- pass
- def splitFrag(uriref):
- """split a URI reference between the fragment and the rest.
- Punctuation is thrown away.
- e.g.
- >>> splitFrag("abc#def")
- ('abc', 'def')
- >>> splitFrag("abcdef")
- ('abcdef', None)
- """
- i = uriref.rfind("#")
- if i >= 0:
- return uriref[:i], uriref[i + 1:]
- else:
- return uriref, None
- def splitFragP(uriref, punct=0):
- """split a URI reference before the fragment
- Punctuation is kept.
- e.g.
- >>> splitFragP("abc#def")
- ('abc', '#def')
- >>> splitFragP("abcdef")
- ('abcdef', '')
- """
- i = uriref.rfind("#")
- if i >= 0:
- return uriref[:i], uriref[i:]
- else:
- return uriref, ''
- @py3compat.format_doctest_out
- def join(here, there):
- """join an absolute URI and URI reference
- (non-ascii characters are supported/doctested;
- haven't checked the details of the IRI spec though)
- here is assumed to be absolute.
- there is URI reference.
- >>> join('http://example/x/y/z', '../abc')
- 'http://example/x/abc'
- Raise ValueError if there uses relative path
- syntax but here has no hierarchical path.
- >>> join('mid:foo@example', '../foo')
- Traceback (most recent call last):
- raise ValueError, here
- ValueError: Base <mid:foo@example> has no slash after colon - with relative '../foo'.
- >>> join('http://example/x/y/z', '')
- 'http://example/x/y/z'
- >>> join('mid:foo@example', '#foo')
- 'mid:foo@example#foo'
- We grok IRIs
- >>> len(%(u)s'Andr\\xe9')
- 5
- >>> join('http://example.org/', %(u)s'#Andr\\xe9')
- %(u)s'http://example.org/#Andr\\xe9'
- """
- assert(here.find("#") < 0), "Base may not contain hash: '%s'" % here # caller must splitFrag (why?)
- slashl = there.find('/')
- colonl = there.find(':')
- # join(base, 'foo:/') -- absolute
- if colonl >= 0 and (slashl < 0 or colonl < slashl):
- return there
- bcolonl = here.find(':')
- assert(bcolonl >= 0), "Base uri '%s' is not absolute" % here # else it's not absolute
- path, frag = splitFragP(there)
- if not path:
- return here + frag
- # join('mid:foo@example', '../foo') bzzt
- if here[bcolonl + 1:bcolonl + 2] != '/':
- raise ValueError("Base <%s> has no slash after colon - with relative '%s'." % (here, there))
- if here[bcolonl + 1:bcolonl + 3] == '//':
- bpath = here.find('/', bcolonl + 3)
- else:
- bpath = bcolonl + 1
- # join('http://xyz', 'foo')
- if bpath < 0:
- bpath = len(here)
- here = here + '/'
- # join('http://xyz/', '//abc') => 'http://abc'
- if there[:2] == '//':
- return here[:bcolonl + 1] + there
- # join('http://xyz/', '/abc') => 'http://xyz/abc'
- if there[:1] == '/':
- return here[:bpath] + there
- slashr = here.rfind('/')
- while 1:
- if path[:2] == './':
- path = path[2:]
- if path == '.':
- path = ''
- elif path[:3] == '../' or path == '..':
- path = path[3:]
- i = here.rfind('/', bpath, slashr)
- if i >= 0:
- here = here[:i + 1]
- slashr = i
- else:
- break
- return here[:slashr + 1] + path + frag
- commonHost = re.compile(r'^[-_a-zA-Z0-9.]+:(//[^/]*)?/[^/]*$')
- def refTo(base, uri):
- """figure out a relative URI reference from base to uri
- >>> refTo('http://example/x/y/z', 'http://example/x/abc')
- '../abc'
- >>> refTo('file:/ex/x/y', 'file:/ex/x/q/r#s')
- 'q/r#s'
- >>> refTo(None, 'http://ex/x/y')
- 'http://ex/x/y'
- >>> refTo('http://ex/x/y', 'http://ex/x/y')
- ''
- Note the relationship between refTo and join:
- join(x, refTo(x, y)) == y
- which points out certain strings which cannot be URIs. e.g.
- >>> x='http://ex/x/y';y='http://ex/x/q:r';join(x, refTo(x, y)) == y
- 0
- So 'http://ex/x/q:r' is not a URI. Use 'http://ex/x/q%3ar' instead:
- >>> x='http://ex/x/y';y='http://ex/x/q%3ar';join(x, refTo(x, y)) == y
- 1
- This one checks that it uses a root-realtive one where that is
- all they share. Now uses root-relative where no path is shared.
- This is a matter of taste but tends to give more resilience IMHO
- -- and shorter paths
- Note that base may be None, meaning no base. In some situations, there
- just ain't a base. Slife. In these cases, relTo returns the absolute value.
- The axiom abs(,rel(b,x))=x still holds.
- This saves people having to set the base to "bogus:".
- >>> refTo('http://ex/x/y/z', 'http://ex/r')
- '/r'
- """
- # assert base # don't mask bugs -danc # not a bug. -tim
- if not base:
- return uri
- if base == uri:
- return ""
- # Find how many path segments in common
- i = 0
- while i < len(uri) and i < len(base):
- if uri[i] == base[i]:
- i = i + 1
- else:
- break
- # print "# relative", base, uri, " same up to ", i
- # i point to end of shortest one or first difference
- m = commonHost.match(base[:i])
- if m:
- k = uri.find("//")
- if k < 0:
- k = -2 # no host
- l = uri.find("/", k + 2)
- if uri[l + 1:l + 2] != "/" and base[l + 1:l + 2] != "/" and uri[:l] == base[:l]:
- return uri[l:]
- if uri[i:i + 1] == "#" and len(base) == i:
- return uri[i:] # fragment of base
- while i > 0 and uri[i - 1] != '/':
- i = i - 1 # scan for slash
- if i < 3:
- return uri # No way.
- if base.find("//", i - 2) > 0 or uri.find("//", i - 2) > 0:
- return uri # An unshared "//"
- if base.find(":", i) > 0:
- return uri # An unshared ":"
- n = base.count("/", i)
- if n == 0 and i < len(uri) and uri[i] == '#':
- return "./" + uri[i:]
- elif n == 0 and i == len(uri):
- return "./"
- else:
- return ("../" * n) + uri[i:]
- def base():
- """The base URI for this process - the Web equiv of cwd
- Relative or abolute unix-standard filenames parsed relative to
- this yeild the URI of the file.
- If we had a reliable way of getting a computer name,
- we should put it in the hostname just to prevent ambiguity
- """
- # return "file://" + hostname + os.getcwd() + "/"
- return "file://" + _fixslash(os.getcwd()) + "/"
- def _fixslash(argstr):
- """ Fix windowslike filename to unixlike - (#ifdef WINDOWS)"""
- s = argstr
- for i in range(len(s)):
- if s[i] == "\\":
- s = s[:i] + "/" + s[i + 1:]
- if s[0] != "/" and s[1] == ":":
- s = s[2:] # @@@ Hack when drive letter present
- return s
- URI_unreserved = b("ABCDEFGHIJJLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~")
- # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
- @py3compat.format_doctest_out
- def canonical(str_in):
- """Convert equivalent URIs (or parts) to the same string
- There are many differenet levels of URI canonicalization
- which are possible. See http://www.ietf.org/rfc/rfc3986.txt
- Done:
- - Converfting unicode IRI to utf-8
- - Escaping all non-ASCII
- - De-escaping, if escaped, ALPHA (%%41-%%5A and %%61-%%7A), DIGIT (%%30-%%39),
- hyphen (%%2D), period (%%2E), underscore (%%5F), or tilde (%%7E) (Sect 2.4)
- - Making all escapes uppercase hexadecimal
- Not done:
- - Making URI scheme lowercase
- - changing /./ or /foo/../ to / with care not to change host part
- >>> canonical("foo bar")
- %(b)s'foo%%20bar'
- >>> canonical(%(u)s'http:')
- %(b)s'http:'
- >>> canonical('fran%%c3%%83%%c2%%a7ois')
- %(b)s'fran%%C3%%83%%C2%%A7ois'
- >>> canonical('a')
- %(b)s'a'
- >>> canonical('%%4e')
- %(b)s'N'
- >>> canonical('%%9d')
- %(b)s'%%9D'
- >>> canonical('%%2f')
- %(b)s'%%2F'
- >>> canonical('%%2F')
- %(b)s'%%2F'
- """
- if type(str_in) == type(u''):
- s8 = str_in.encode('utf-8')
- else:
- s8 = str_in
- s = b('')
- i = 0
- while i < len(s8):
- if py3compat.PY3:
- n = s8[i]
- ch = bytes([n])
- else:
- ch = s8[i]
- n = ord(ch)
- if (n > 126) or (n < 33): # %-encode controls, SP, DEL, and utf-8
- s += b("%%%02X" % ord(ch))
- elif ch == b('%') and i + 2 < len(s8):
- ch2 = a2b_hex(s8[i + 1:i + 3])
- if ch2 in URI_unreserved:
- s += ch2
- else:
- s += b("%%%02X" % ord(ch2))
- i = i + 3
- continue
- else:
- s += ch
- i = i + 1
- return s
- CONTEXT = 0
- PRED = 1
- SUBJ = 2
- OBJ = 3
- PARTS = PRED, SUBJ, OBJ
- ALL4 = CONTEXT, PRED, SUBJ, OBJ
- SYMBOL = 0
- FORMULA = 1
- LITERAL = 2
- LITERAL_DT = 21
- LITERAL_LANG = 22
- ANONYMOUS = 3
- XMLLITERAL = 25
- Logic_NS = "http://www.w3.org/2000/10/swap/log#"
- NODE_MERGE_URI = Logic_NS + "is" # Pseudo-property indicating node merging
- forSomeSym = Logic_NS + "forSome"
- forAllSym = Logic_NS + "forAll"
- RDF_type_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
- RDF_NS_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
- OWL_NS = "http://www.w3.org/2002/07/owl#"
- DAML_sameAs_URI = OWL_NS + "sameAs"
- parsesTo_URI = Logic_NS + "parsesTo"
- RDF_spec = "http://www.w3.org/TR/REC-rdf-syntax/"
- List_NS = RDF_NS_URI # From 20030808
- _Old_Logic_NS = "http://www.w3.org/2000/10/swap/log.n3#"
- N3_first = (SYMBOL, List_NS + "first")
- N3_rest = (SYMBOL, List_NS + "rest")
- N3_li = (SYMBOL, List_NS + "li")
- N3_nil = (SYMBOL, List_NS + "nil")
- N3_List = (SYMBOL, List_NS + "List")
- N3_Empty = (SYMBOL, List_NS + "Empty")
- runNamespaceValue = None
- def runNamespace():
- "Return a URI suitable as a namespace for run-local objects"
- # @@@ include hostname (privacy?) (hash it?)
- global runNamespaceValue
- if runNamespaceValue == None:
- runNamespaceValue = join(base(), _unique_id()) + '#'
- return runNamespaceValue
- nextu = 0
- def uniqueURI():
- "A unique URI"
- global nextu
- nextu += 1
- # return runNamespace() + "u_" + `nextu`
- return runNamespace() + "u_" + str(nextu)
- class URISyntaxError(ValueError):
- """A parameter is passed to a routine that requires a URI reference"""
- pass
- tracking = False
- chatty_flag = 50
- from xml.dom import Node
- try:
- from xml.ns import XMLNS
- except:
- class XMLNS:
- BASE = "http://www.w3.org/2000/xmlns/"
- XML = "http://www.w3.org/XML/1998/namespace"
- _attrs = lambda E: (E.attributes and E.attributes.values()) or []
- _children = lambda E: E.childNodes or []
- _IN_XML_NS = lambda n: n.namespaceURI == XMLNS.XML
- _inclusive = lambda n: n.unsuppressedPrefixes == None
- # Does a document/PI has lesser/greater document order than the
- # first element?
- _LesserElement, _Element, _GreaterElement = range(3)
- def _sorter(n1, n2):
- '''_sorter(n1, n2) -> int
- Sorting predicate for non-NS attributes.'''
- i = cmp(n1.namespaceURI, n2.namespaceURI)
- if i:
- return i
- return cmp(n1.localName, n2.localName)
- def _sorter_ns(n1, n2):
- '''_sorter_ns((n,v),(n,v)) -> int
- "(an empty namespace URI is lexicographically least)."'''
- if n1[0] == 'xmlns':
- return -1
- if n2[0] == 'xmlns':
- return 1
- return cmp(n1[0], n2[0])
- def _utilized(n, node, other_attrs, unsuppressedPrefixes):
- '''_utilized(n, node, other_attrs, unsuppressedPrefixes) -> boolean
- Return true if that nodespace is utilized within the node'''
- if n.startswith('xmlns:'):
- n = n[6:]
- elif n.startswith('xmlns'):
- n = n[5:]
- if (n == "" and node.prefix in ["#default", None]) or \
- n == node.prefix or n in unsuppressedPrefixes:
- return 1
- for attr in other_attrs:
- if n == attr.prefix:
- return 1
- return 0
- #_in_subset = lambda subset, node: not subset or node in subset
- _in_subset = lambda subset, node: subset is None or node in subset # rich's tweak
- class _implementation:
- '''Implementation class for C14N. This accompanies a node during it's
- processing and includes the parameters and processing state.'''
- # Handler for each node type; populated during module instantiation.
- handlers = {}
- def __init__(self, node, write, **kw):
- '''Create and run the implementation.'''
- self.write = write
- self.subset = kw.get('subset')
- self.comments = kw.get('comments', 0)
- self.unsuppressedPrefixes = kw.get('unsuppressedPrefixes')
- nsdict = kw.get('nsdict', {'xml': XMLNS.XML, 'xmlns': XMLNS.BASE})
- # Processing state.
- self.state = (nsdict, {'xml': ''}, {}) # 0422
- if node.nodeType == Node.DOCUMENT_NODE:
- self._do_document(node)
- elif node.nodeType == Node.ELEMENT_NODE:
- self.documentOrder = _Element # At document element
- if not _inclusive(self):
- self._do_element(node)
- else:
- inherited = self._inherit_context(node)
- self._do_element(node, inherited)
- elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
- pass
- elif node.nodeType == Node.TEXT_NODE:
- self._do_text(node)
- else:
- raise TypeError(str(node))
- def _inherit_context(self, node):
- '''_inherit_context(self, node) -> list
- Scan ancestors of attribute and namespace context. Used only
- for single element node canonicalization, not for subset
- canonicalization.'''
- # Collect the initial list of xml:foo attributes.
- xmlattrs = filter(_IN_XML_NS, _attrs(node))
- # Walk up and get all xml:XXX attributes we inherit.
- inherited, parent = [], node.parentNode
- while parent and parent.nodeType == Node.ELEMENT_NODE:
- for a in filter(_IN_XML_NS, _attrs(parent)):
- n = a.localName
- if n not in xmlattrs:
- xmlattrs.append(n)
- inherited.append(a)
- parent = parent.parentNode
- return inherited
- def _do_document(self, node):
- '''_do_document(self, node) -> None
- Process a document node. documentOrder holds whether the document
- element has been encountered such that PIs/comments can be written
- as specified.'''
- self.documentOrder = _LesserElement
- for child in node.childNodes:
- if child.nodeType == Node.ELEMENT_NODE:
- self.documentOrder = _Element # At document element
- self._do_element(child)
- self.documentOrder = _GreaterElement # After document element
- elif child.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
- self._do_pi(child)
- elif child.nodeType == Node.COMMENT_NODE:
- self._do_comment(child)
- elif child.nodeType == Node.DOCUMENT_TYPE_NODE:
- pass
- else:
- raise TypeError(str(child))
- handlers[Node.DOCUMENT_NODE] = _do_document
- def _do_text(self, node):
- '''_do_text(self, node) -> None
- Process a text or CDATA node. Render various special characters
- as their C14N entity representations.'''
- if not _in_subset(self.subset, node):
- return
- s = node.data.replace("&", "&")
- s = s.replace("<", "<")
- s = s.replace(">", ">")
- s = s.replace("\015", "
")
- if s:
- self.write(s)
- handlers[Node.TEXT_NODE] = _do_text
- handlers[Node.CDATA_SECTION_NODE] = _do_text
- def _do_pi(self, node):
- '''_do_pi(self, node) -> None
- Process a PI node. Render a leading or trailing # xA if the
- document order of the PI is greater or lesser (respectively)
- than the document element.
- '''
- if not _in_subset(self.subset, node):
- return
- W = self.write
- if self.documentOrder == _GreaterElement:
- W('\n')
- W('<?')
- W(node.nodeName)
- s = node.data
- if s:
- W(' ')
- W(s)
- W('?>')
- if self.documentOrder == _LesserElement:
- W('\n')
- handlers[Node.PROCESSING_INSTRUCTION_NODE] = _do_pi
- def _do_comment(self, node):
- '''_do_comment(self, node) -> None
- Process a comment node. Render a leading or trailing # xA if the
- document order of the comment is greater or lesser (respectively)
- than the document element.
- '''
- if not _in_subset(self.subset, node):
- return
- if self.comments:
- W = self.write
- if self.documentOrder == _GreaterElement:
- W('\n')
- W('<!--')
- W(node.data)
- W('-->')
- if self.documentOrder == _LesserElement:
- W('\n')
- handlers[Node.COMMENT_NODE] = _do_comment
- def _do_attr(self, n, value):
- ''''_do_attr(self, node) -> None
- Process an attribute.'''
- W = self.write
- W(' ')
- W(n)
- W('="')
- s = value.replace(value, "&", "&")
- s = s.replace("<", "<")
- s = s.replace('"', '"')
- s = s.replace('\011', '	')
- s = s.replace('\012', '
')
- s = s.replace('\015', '
')
- W(s)
- W('"')
- def _do_element(self, node, initial_other_attrs=[]):
- '''_do_element(self, node, initial_other_attrs = []) -> None
- Process an element (and its children).'''
- # Get state (from the stack) make local copies.
- # ns_parent -- NS declarations in parent
- # ns_rendered -- NS nodes rendered by ancestors
- # ns_local -- NS declarations relevant to this element
- # xml_attrs -- Attributes in XML namespace from parent
- # xml_attrs_local -- Local attributes in XML namespace.
- ns_parent, ns_rendered, xml_attrs = \
- self.state[0], self.state[1].copy(), self.state[2].copy() # 0422
- ns_local = ns_parent.copy()
- xml_attrs_local = {}
- # progress("_do_element node.nodeName=", node.nodeName)
- # progress("_do_element node.namespaceURI", node.namespaceURI)
- # progress("_do_element node.tocml()", node.toxml())
- # Divide attributes into NS, XML, and others.
- other_attrs = initial_other_attrs[:]
- in_subset = _in_subset(self.subset, node)
- for a in _attrs(node):
- # progress("\t_do_element a.nodeName=", a.nodeName)
- if a.namespaceURI == XMLNS.BASE:
- n = a.nodeName
- if n == "xmlns:":
- n = "xmlns" # DOM bug workaround
- ns_local[n] = a.nodeValue
- elif a.namespaceURI == XMLNS.XML:
- if _inclusive(self) or in_subset:
- xml_attrs_local[a.nodeName] = a # 0426
- else:
- other_attrs.append(a)
- # add local xml:foo attributes to ancestor's xml:foo attributes
- xml_attrs.update(xml_attrs_local)
- # Render the node
- W, name = self.write, None
- if in_subset:
- name = node.nodeName
- W('<')
- W(name)
- # Create list of NS attributes to render.
- ns_to_render = []
- for n, v in ns_local.items():
- # If default namespace is XMLNS.BASE or empty,
- # and if an ancestor was the same
- if n == "xmlns" and v in [XMLNS.BASE, ''] \
- and ns_rendered.get('xmlns') in [XMLNS.BASE, '', None]:
- continue
- # "omit namespace node with local name xml, which defines
- # the xml prefix, if its string value is
- # http://www.w3.org/XML/1998/namespace."
- if n in ["xmlns:xml", "xml"] \
- and v in ['http://www.w3.org/XML/1998/namespace']:
- continue
- # If not previously rendered
- # and it's inclusive or utilized
- if (n, v) not in ns_rendered.items() \
- and (_inclusive(self) or \
- _utilized(n, node, other_attrs, self.unsuppressedPrefixes)):
- ns_to_render.append((n, v))
- # Sort and render the ns, marking what was rendered.
- ns_to_render.sort(_sorter_ns)
- for n, v in ns_to_render:
- self._do_attr(n, v)
- ns_rendered[n] = v # 0417
- # If exclusive or the parent is in the subset, add the local xml attributes
- # Else, add all local and ancestor xml attributes
- # Sort and render the attributes.
- if not _inclusive(self) or _in_subset(self.subset, node.parentNode): # 0426
- other_attrs.extend(xml_attrs_local.values())
- else:
- other_attrs.extend(xml_attrs.values())
- other_attrs.sort(_sorter)
- for a in other_attrs:
- self._do_attr(a.nodeName, a.value)
- W('>')
- # Push state, recurse, pop state.
- state, self.state = self.state, (ns_local, ns_rendered, xml_attrs)
- for c in _children(node):
- _implementation.handlers[c.nodeType](self, c)
- self.state = state
- if name:
- W('</%s>' % name)
- handlers[Node.ELEMENT_NODE] = _do_element
- def Canonicalize(node, output=None, **kw):
- '''Canonicalize(node, output=None, **kw) -> UTF-8
- Canonicalize a DOM document/element node and all descendents.
- Return the text; if output is specified then output.write will
- be called to output the text and None will be returned
- Keyword parameters:
- nsdict -- a dictionary of prefix:uri namespace entries
- assumed to exist in the surrounding context
- comments -- keep comments if non-zero (default is 0)
- subset -- Canonical XML subsetting resulting from XPath (default is [])
- unsuppressedPrefixes -- do exclusive C14N, and this specifies the
- prefixes that should be inherited.
- '''
- if output:
- apply(_implementation, (node, output.write), kw)
- else:
- s = StringIO.StringIO()
- apply(_implementation, (node, s.write), kw)
- return s.getvalue()
- # end of xmlC14n.py
- # from why import BecauseOfData, becauseSubexpression
- def BecauseOfData(*args, **kargs):
- # print args, kargs
- pass
- def becauseSubexpression(*args, **kargs):
- # print args, kargs
- pass
- N3_forSome_URI = forSomeSym
- N3_forAll_URI = forAllSym
- # Magic resources we know about
- ADDED_HASH = "#" # Stop where we use this in case we want to remove it!
- # This is the hash on namespace URIs
- RDF_type = (SYMBOL, RDF_type_URI)
- DAML_sameAs = (SYMBOL, DAML_sameAs_URI)
- LOG_implies_URI = "http://www.w3.org/2000/10/swap/log#implies"
- BOOLEAN_DATATYPE = _XSD_PFX + "boolean"
- DECIMAL_DATATYPE = _XSD_PFX + "decimal"
- DOUBLE_DATATYPE = _XSD_PFX + "double"
- FLOAT_DATATYPE = _XSD_PFX + "float"
- INTEGER_DATATYPE = _XSD_PFX + "integer"
- option_noregen = 0 # If set, do not regenerate genids on output
- # @@ I18n - the notname chars need extending for well known unicode non-text
- # characters. The XML spec switched to assuming unknown things were name
- # characaters.
- # _namechars = string.lowercase + string.uppercase + string.digits + '_-'
- _notQNameChars = "\t\r\n !\"#$%&'()*.,+/;<=>?@[\\]^`{|}~" # else valid qname :-/
- _notNameChars = _notQNameChars + ":" # Assume anything else valid name :-/
- _rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
- N3CommentCharacter = "#" # For unix script # ! compatabilty
- ########################################## Parse string to sink
- #
- # Regular expressions:
- eol = re.compile(r'[ \t]*(#[^\n]*)?\r?\n') # end of line, poss. w/comment
- eof = re.compile(r'[ \t]*(#[^\n]*)?$') # end of file, poss. w/comment
- ws = re.compile(r'[ \t]*') # Whitespace not including NL
- signed_integer = re.compile(r'[-+]?[0-9]+') # integer
- number_syntax = re.compile(r'(?P<integer>[-+]?[0-9]+)(?P<decimal>\.[0-9]+)?(?P<exponent>(?:e|E)[-+]?[0-9]+)?')
- digitstring = re.compile(r'[0-9]+') # Unsigned integer
- interesting = re.compile(r'[\\\r\n\"]')
- langcode = re.compile(r'[a-zA-Z0-9]+(-[a-zA-Z0-9]+)?')
- class SinkParser:
- def __init__(self, store, openFormula=None, thisDoc="", baseURI=None,
- genPrefix="", flags="", why=None):
- """ note: namespace names should *not* end in # ;
- the # will get added during qname processing """
- self._bindings = {}
- self._flags = flags
- if thisDoc != "":
- assert ':' in thisDoc, "Document URI not absolute: <%s>" % thisDoc
- self._bindings[""] = thisDoc + "#" # default
- self._store = store
- if genPrefix:
- store.setGenPrefix(genPrefix) # pass it on
- self._thisDoc = thisDoc
- self.lines = 0 # for error handling
- self.startOfLine = 0 # For calculating character number
- self._genPrefix = genPrefix
- self.keywords = ['a', 'this', 'bind', 'has', 'is', 'of', 'true', 'false']
- self.keywordsSet = 0 # Then only can others be considerd qnames
- self._anonymousNodes = {} # Dict of anon nodes already declared ln: Term
- self._variables = {}
- self._parentVariables = {}
- self._reason = why # Why the parser was asked to parse this
- self._reason2 = None # Why these triples
- # was: diag.tracking
- if tracking:
- self._reason2 = BecauseOfData(
- store.newSymbol(thisDoc), because=self._reason)
- if baseURI:
- self._baseURI = baseURI
- else:
- if thisDoc:
- self._baseURI = thisDoc
- else:
- self._baseURI = None
- assert not self._baseURI or ':' in self._baseURI
- if not self._genPrefix:
- if self._thisDoc:
- self._genPrefix = self._thisDoc + "#_g"
- else:
- self._genPrefix = uniqueURI()
- if openFormula == None:
- if self._thisDoc:
- self._formula = store.newFormula(thisDoc + "#_formula")
- else:
- self._formula = store.newFormula()
- else:
- self._formula = openFormula
- self._context = self._formula
- self._parentContext = None
- def here(self, i):
- """String generated from position in file
- This is for repeatability when refering people to bnodes in a document.
- This has diagnostic uses less formally, as it should point one to which
- bnode the arbitrary identifier actually is. It gives the
- line and character number of the '[' charcacter or path character
- which introduced the blank node. The first blank node is boringly _L1C1.
- It used to be used only for tracking, but for tests in general
- it makes the canonical ordering of bnodes repeatable."""
- return "%s_L%iC%i" % (self._genPrefix, self.lines,
- i - self.startOfLine + 1)
- def formula(self):
- return self._formula
- def loadStream(self, stream):
- return self.loadBuf(stream.read()) # Not ideal
- def loadBuf(self, buf):
- """Parses a buffer and returns its top level formula"""
- self.startDoc()
- self.feed(buf)
- return self.endDoc() # self._formula
- def feed(self, octets):
- """Feed an octet stream tothe parser
- if BadSyntax is raised, the string
- passed in the exception object is the
- remainder after any statements have been parsed.
- So if there is more data to feed to the
- parser, it should be straightforward to recover."""
- if not isinstance(octets, unicode):
- s = octets.decode('utf-8')
- # NB already decoded, so \ufeff
- if len(s) > 0 and s[0] == codecs.BOM_UTF8.decode('utf-8'):
- s = s[1:]
- else:
- s = octets
- i = 0
- while i >= 0:
- j = self.skipSpace(s, i)
- if j < 0:
- return
- i = self.directiveOrStatement(s, j)
- if i < 0:
- print("# next char: %s" % s[j])
- raise BadSyntax(self._thisDoc, self.lines, s, j,
- "expected directive or statement")
- def directiveOrStatement(self, argstr, h):
- i = self.skipSpace(argstr, h)
- if i < 0:
- return i # EOF
- j = self.directive(argstr, i)
- if j >= 0:
- return self.checkDot(argstr, j)
- j = self.statement(argstr, i)
- if j >= 0:
- return self.checkDot(argstr, j)
- return j
- # @@I18N
- global _notNameChars
- # _namechars = string.lowercase + string.uppercase + string.digits + '_-'
- def tok(self, tok, argstr, i):
- """Check for keyword. Space must have been stripped on entry and
- we must not be at end of file."""
- assert tok[0] not in _notNameChars # not for punctuation
- if argstr[i:i + 1] == "@":
- i = i + 1
- else:
- if tok not in self.keywords:
- return -1 # No, this has neither keywords declaration nor "@"
- if (argstr[i:i + len(tok)] == tok
- and (argstr[i + len(tok)] in _notQNameChars)):
- i = i + len(tok)
- return i
- else:
- return -1
- def directive(self, argstr, i):
- j = self.skipSpace(argstr, i)
- if j < 0:
- return j # eof
- res = []
- j = self.tok('bind', argstr, i) # implied "#". Obsolete.
- if j > 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "keyword bind is obsolete: use @prefix")
- j = self.tok('keywords', argstr, i)
- if j > 0:
- i = self.commaSeparatedList(argstr, j, res, self.bareWord)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "'@keywords' needs comma separated list of words")
- self.setKeywords(res[:])
- # was: diag.chatty_flag
- if chatty_flag > 80:
- progress("Keywords ", self.keywords)
- return i
- j = self.tok('forAll', argstr, i)
- if j > 0:
- i = self.commaSeparatedList(argstr, j, res, self.uri_ref2)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "Bad variable list after @forAll")
- for x in res:
- # self._context.declareUniversal(x)
- if x not in self._variables or x in self._parentVariables:
- self._variables[x] = self._context.newUniversal(x)
- return i
- j = self.tok('forSome', argstr, i)
- if j > 0:
- i = self. commaSeparatedList(argstr, j, res, self.uri_ref2)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "Bad variable list after @forSome")
- for x in res:
- self._context.declareExistential(x)
- return i
- j = self.tok('prefix', argstr, i) # no implied "#"
- if j >= 0:
- t = []
- i = self.qname(argstr, j, t)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "expected qname after @prefix")
- j = self.uri_ref2(argstr, i, t)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "expected <uriref> after @prefix _qname_")
- ns = self.uriOf(t[1])
- if self._baseURI:
- ns = join(self._baseURI, ns)
- elif ":" not in ns:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "With no base URI, cannot use relative URI in @prefix <" + ns + ">")
- assert ':' in ns # must be absolute
- self._bindings[t[0][0]] = ns
- self.bind(t[0][0], hexify(ns))
- return j
- j = self.tok('base', argstr, i) # Added 2007/7/7
- if j >= 0:
- t = []
- i = self.uri_ref2(argstr, j, t)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "expected <uri> after @base ")
- ns = self.uriOf(t[0])
- if self._baseURI:
- ns = join(self._baseURI, ns)
- else:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "With no previous base URI, cannot use relative URI in @base <" + ns + ">")
- assert ':' in ns # must be absolute
- self._baseURI = ns
- return i
- return -1 # Not a directive, could be something else.
- def bind(self, qn, uri):
- assert isinstance(uri,
- types.StringType), "Any unicode must be %x-encoded already"
- if qn == "":
- self._store.setDefaultNamespace(uri)
- else:
- self._store.bind(qn, uri)
- def setKeywords(self, k):
- "Takes a list of strings"
- if k == None:
- self.keywordsSet = 0
- else:
- self.keywords = k
- self.keywordsSet = 1
- def startDoc(self):
- # was: self._store.startDoc()
- self._store.startDoc(self._formula)
- def endDoc(self):
- """Signal end of document and stop parsing. returns formula"""
- self._store.endDoc(self._formula) # don't canonicalize yet
- return self._formula
- def makeStatement(self, quadruple):
- # $$$$$$$$$$$$$$$$$$$$$
- # print "# Parser output: ", `quadruple`
- self._store.makeStatement(quadruple, why=self._reason2)
- def statement(self, argstr, i):
- r = []
- i = self.object(argstr, i, r) # Allow literal for subject - extends RDF
- if i < 0:
- return i
- j = self.property_list(argstr, i, r[0])
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines,
- argstr, i, "expected propertylist")
- return j
- def subject(self, argstr, i, res):
- return self.item(argstr, i, res)
- def verb(self, argstr, i, res):
- """ has _prop_
- is _prop_ of
- a
- =
- _prop_
- >- prop ->
- <- prop -<
- _operator_"""
- j = self.skipSpace(argstr, i)
- if j < 0:
- return j # eof
- r = []
- j = self.tok('has', argstr, i)
- if j >= 0:
- i = self.prop(argstr, j, r)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines,
- argstr, j, "expected property after 'has'")
- res.append(('->', r[0]))
- return i
- j = self.tok('is', argstr, i)
- if j >= 0:
- i = self.prop(argstr, j, r)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "expected <property> after 'is'")
- j = self.skipSpace(argstr, i)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "End of file found, expected property after 'is'")
- return j # eof
- i = j
- j = self.tok('of', argstr, i)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "expected 'of' after 'is' <prop>")
- res.append(('<-', r[0]))
- return j
- j = self.tok('a', argstr, i)
- if j >= 0:
- res.append(('->', RDF_type))
- return j
- if argstr[i:i + 2] == "<=":
- res.append(('<-', self._store.newSymbol(Logic_NS + "implies")))
- return i + 2
- if argstr[i:i + 1] == "=":
- if argstr[i + 1:i + 2] == ">":
- res.append(('->', self._store.newSymbol(Logic_NS + "implies")))
- return i + 2
- res.append(('->', DAML_sameAs))
- return i + 1
- if argstr[i:i + 2] == ":=":
- # patch file relates two formulae, uses this @@ really?
- res.append(('->', Logic_NS + "becomes"))
- return i + 2
- j = self.prop(argstr, i, r)
- if j >= 0:
- res.append(('->', r[0]))
- return j
- if argstr[i:i + 2] == ">-" or argstr[i:i + 2] == "<-":
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- ">- ... -> syntax is obsolete.")
- return -1
- def prop(self, argstr, i, res):
- return self.item(argstr, i, res)
- def item(self, argstr, i, res):
- return self.path(argstr, i, res)
- def blankNode(self, uri=None):
- if "B" not in self._flags:
- return self._context.newBlankNode(uri, why=self._reason2)
- x = self._context.newSymbol(uri)
- self._context.declareExistential(x)
- return x
- def path(self, argstr, i, res):
- """Parse the path production.
- """
- j = self.nodeOrLiteral(argstr, i, res)
- if j < 0:
- return j # nope
- while argstr[j:j + 1] in "!^.": # no spaces, must follow exactly (?)
- ch = argstr[j:j + 1] # @@ Allow "." followed IMMEDIATELY by a node.
- if ch == ".":
- ahead = argstr[j + 1:j + 2]
- if not ahead or (ahead in _notNameChars
- and ahead not in ":?<[{("):
- break
- subj = res.pop()
- obj = self.blankNode(uri=self.here(j))
- j = self.node(argstr, j + 1, res)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "EOF found in middle of path syntax")
- pred = res.pop()
- if ch == "^": # Reverse traverse
- self.makeStatement((self._context, pred, obj, subj))
- else:
- self.makeStatement((self._context, pred, subj, obj))
- res.append(obj)
- return j
- def anonymousNode(self, ln):
- """Remember or generate a term for one of these _: anonymous nodes"""
- term = self._anonymousNodes.get(ln, None)
- if term != None:
- return term
- term = self._store.newBlankNode(self._context, why=self._reason2)
- self._anonymousNodes[ln] = term
- return term
- def node(self, argstr, i, res, subjectAlready=None):
- """Parse the <node> production.
- Space is now skipped once at the beginning
- instead of in multipe calls to self.skipSpace().
- """
- subj = subjectAlready
- j = self.skipSpace(argstr, i)
- if j < 0:
- return j # eof
- i = j
- ch = argstr[i:i + 1] # Quick 1-character checks first:
- if ch == "[":
- bnodeID = self.here(i)
- j = self.skipSpace(argstr, i + 1)
- if j < 0:
- raise BadSyntax(self._thisDoc,
- self.lines, argstr, i, "EOF after '['")
- if argstr[j:j + 1] == "=": # Hack for "is" binding name to anon node
- i = j + 1
- objs = []
- j = self.objectList(argstr, i, objs)
- if j >= 0:
- subj = objs[0]
- if len(objs) > 1:
- for obj in objs:
- self.makeStatement((self._context,
- DAML_sameAs, subj, obj))
- j = self.skipSpace(argstr, j)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "EOF when objectList expected after [ = ")
- if argstr[j:j + 1] == ";":
- j = j + 1
- else:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "objectList expected after [= ")
- if subj is None:
- subj = self.blankNode(uri=bnodeID)
- i = self.property_list(argstr, j, subj)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "property_list expected")
- j = self.skipSpace(argstr, i)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "EOF when ']' expected after [ <propertyList>")
- if argstr[j:j + 1] != "]":
- raise BadSyntax(self._thisDoc,
- self.lines, argstr, j, "']' expected")
- res.append(subj)
- return j + 1
- if ch == "{":
- ch2 = argstr[i + 1:i + 2]
- if ch2 == '$':
- i += 1
- j = i + 1
- List = []
- first_run = True
- while 1:
- i = self.skipSpace(argstr, j)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "needed '$}', found end.")
- if argstr[i:i + 2] == '$}':
- j = i + 2
- break
- if not first_run:
- if argstr[i:i + 1] == ',':
- i += 1
- else:
- raise BadSyntax(self._thisDoc, self.lines,
- argstr, i, "expected: ','")
- else:
- first_run = False
- item = []
- j = self.item(argstr, i, item) # @@@@@ should be path, was object
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "expected item in set or '$}'")
- List.append(self._store.intern(item[0]))
- res.append(self._store.newSet(List, self._context))
- return j
- else:
- j = i + 1
- oldParentContext = self._parentContext
- self._parentContext = self._context
- parentAnonymousNodes = self._anonymousNodes
- grandParentVariables = self._parentVariables
- self._parentVariables = self._variables
- self._anonymousNodes = {}
- self._variables = self._variables.copy()
- reason2 = self._reason2
- self._reason2 = becauseSubexpression
- if subj is None:
- subj = self._store.newFormula()
- self._context = subj
- while 1:
- i = self.skipSpace(argstr, j)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines,
- argstr, i, "needed '}', found end.")
- if argstr[i:i + 1] == "}":
- j = i + 1
- break
- j = self.directiveOrStatement(argstr, i)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines,
- argstr, i, "expected statement or '}'")
- self._anonymousNodes = parentAnonymousNodes
- self._variables = self._parentVariables
- self._parentVariables = grandParentVariables
- self._context = self._parentContext
- self._reason2 = reason2
- self._parentContext = oldParentContext
- res.append(subj.close()) # No use until closed
- return j
- if ch == "(":
- thing_type = self._store.newList
- ch2 = argstr[i + 1:i + 2]
- if ch2 == '$':
- thing_type = self._store.newSet
- i += 1
- j = i + 1
- List = []
- while 1:
- i = self.skipSpace(argstr, j)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines,
- argstr, i, "needed ')', found end.")
- if argstr[i:i + 1] == ')':
- j = i + 1
- break
- item = []
- j = self.item(argstr, i, item) # @@@@@ should be path, was object
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "expected item in list or ')'")
- List.append(self._store.intern(item[0]))
- res.append(thing_type(List, self._context))
- return j
- j = self.tok('this', argstr, i) # This context
- if j >= 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "Keyword 'this' was ancient N3. Now use @forSome and @forAll keywords.")
- res.append(self._context)
- return j
- # booleans
- j = self.tok('true', argstr, i)
- if j >= 0:
- res.append(True)
- return j
- j = self.tok('false', argstr, i)
- if j >= 0:
- res.append(False)
- return j
- if subj is None: # If this can be a named node, then check for a name.
- j = self.uri_ref2(argstr, i, res)
- if j >= 0:
- return j
- return -1
- def property_list(self, argstr, i, subj):
- """Parse property list
- Leaves the terminating punctuation in the buffer
- """
- while 1:
- j = self.skipSpace(argstr, i)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "EOF found when expected verb in property list")
- return j # eof
- if argstr[j:j + 2] == ":-":
- i = j + 2
- res = []
- j = self.node(argstr, i, res, subj)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "bad {} or () or [] node after :- ")
- i = j
- continue
- i = j
- v = []
- j = self.verb(argstr, i, v)
- if j <= 0:
- return i # void but valid
- objs = []
- i = self.objectList(argstr, j, objs)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "objectList expected")
- for obj in objs:
- dira, sym = v[0]
- if dira == '->':
- self.makeStatement((self._context, sym, subj, obj))
- else:
- self.makeStatement((self._context, sym, obj, subj))
- j = self.skipSpace(argstr, i)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "EOF found in list of objects")
- return j # eof
- if argstr[i:i + 1] != ";":
- return i
- i = i + 1 # skip semicolon and continue
- def commaSeparatedList(self, argstr, j, res, what):
- """return value: -1 bad syntax; >1 new position in argstr
- res has things found appended
- """
- i = self.skipSpace(argstr, j)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "EOF found expecting comma sep list")
- return i
- if argstr[i] == ".":
- return j # empty list is OK
- i = what(argstr, i, res)
- if i < 0:
- return -1
- while 1:
- j = self.skipSpace(argstr, i)
- if j < 0:
- return j # eof
- ch = argstr[j:j + 1]
- if ch != ",":
- if ch != ".":
- return -1
- return j # Found but not swallowed "."
- i = what(argstr, j + 1, res)
- if i < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "bad list content")
- return i
- def objectList(self, argstr, i, res):
- i = self.object(argstr, i, res)
- if i < 0:
- return -1
- while 1:
- j = self.skipSpace(argstr, i)
- if j < 0:
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "EOF found after object")
- return j # eof
- if argstr[j:j + 1] != ",":
- return j # Found something else!
- i = self.object(argstr, j + 1, res)
- if i < 0:
- return i
- def checkDot(self, argstr, i):
- j = self.skipSpace(argstr, i)
- if j < 0:
- return j # eof
- if argstr[j:j + 1] == ".":
- return j + 1 # skip
- if argstr[j:j + 1] == "}":
- return j # don't skip it
- if argstr[j:j + 1] == "]":
- return j
- raise BadSyntax(self._thisDoc, self.lines,
- argstr, j, "expected '.' or '}' or ']' at end of statement")
- return i
- def uri_ref2(self, argstr, i, res):
- """Generate uri from n3 representation.
- Note that the RDF convention of directly concatenating
- NS and local name is now used though I prefer inserting a '#'
- to make the namesapces look more like what XML folks expect.
- """
- qn = []
- j = self.qname(argstr, i, qn)
- if j >= 0:
- pfx, ln = qn[0]
- if pfx is None:
- assert 0, "not used?"
- ns = self._baseURI + ADDED_HASH
- else:
- try:
- ns = self._bindings[pfx]
- except KeyError:
- if pfx == "_": # Magic prefix 2001/05/30, can be overridden
- res.append(self.anonymousNode(ln))
- return j
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "Prefix \"%s:\" not bound" % (pfx))
- symb = self._store.newSymbol(ns + ln)
- if symb in self._variables:
- res.append(self._variables[symb])
- else:
- res.append(symb) # @@@ "#" CONVENTION
- if not ns.find("#"):
- progress("Warning: no # on namespace %s," % ns)
- return j
- i = self.skipSpace(argstr, i)
- if i < 0:
- return -1
- if argstr[i] == "?":
- v = []
- j = self.variable(argstr, i, v)
- if j > 0: # Forget varibles as a class, only in context.
- res.append(v[0])
- return j
- return -1
- elif argstr[i] == "<":
- i = i + 1
- st = i
- while i < len(argstr):
- if argstr[i] == ">":
- uref = argstr[st:i] # the join should dealt with "":
- if self._baseURI:
- uref = join(self._baseURI, uref) # was: uripath.join
- else:
- assert ":" in uref, \
- "With no base URI, cannot deal with relative URIs"
- if argstr[i - 1:i] == "#" and not uref[-1:] == "#":
- uref = uref + "#" # She meant it! Weirdness in urlparse?
- symb = self._store.newSymbol(uref)
- if symb in self._variables:
- res.append(self._variables[symb])
- else:
- res.append(symb)
- return i + 1
- i = i + 1
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "unterminated URI reference")
- elif self.keywordsSet:
- v = []
- j = self.bareWord(argstr, i, v)
- if j < 0:
- return -1 # Forget varibles as a class, only in context.
- if v[0] in self.keywords:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- 'Keyword "%s" not allowed here.' % v[0])
- res.append(self._store.newSymbol(self._bindings[""] + v[0]))
- return j
- else:
- return -1
- def skipSpace(self, argstr, i):
- """Skip white space, newlines and comments.
- return -1 if EOF, else position of first non-ws character"""
- while 1:
- m = eol.match(argstr, i)
- if m == None:
- break
- self.lines = self.lines + 1
- i = m.end() # Point to first character unmatched
- self.startOfLine = i
- m = ws.match(argstr, i)
- if m != None:
- i = m.end()
- m = eof.match(argstr, i)
- if m != None:
- return -1
- return i
- def variable(self, argstr, i, res):
- """ ?abc -> variable(:abc)
- """
- j = self.skipSpace(argstr, i)
- if j < 0:
- return -1
- if argstr[j:j + 1] != "?":
- return -1
- j = j + 1
- i = j
- if argstr[j] in "0123456789-":
- raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- "Varible name can't start with '%s'" % argstr[j])
- return -1
- while i < len(argstr) and argstr[i] not in _notNameChars:
- i = i + 1
- if self._parentContext == None:
- varURI = self._store.newSymbol(self._baseURI + "#" + argstr[j:i])
- if varURI not in self._variables:
- self._variables[varURI] = self._context.newUniversal(
- varURI, why=self._reason2)
- res.append(self._variables[varURI])
- return i
- # @@ was:
- # raise BadSyntax(self._thisDoc, self.lines, argstr, j,
- # "Can't use ?xxx syntax for variable in outermost level: %s"
- # % argstr[j-1:i])
- varURI = self._store.newSymbol(self._baseURI + "#" + argstr[j:i])
- if varURI not in self._parentVariables:
- self._parentVariables[varURI] = self._parentContext.newUniversal(
- varURI, why=self._reason2)
- res.append(self._parentVariables[varURI])
- return i
- def bareWord(self, argstr, i, res):
- """ abc -> :abc
- """
- j = self.skipSpace(argstr, i)
- if j < 0:
- return -1
- if argstr[j] in "0123456789-" or argstr[j] in _notNameChars:
- return -1
- i = j
- while i < len(argstr) and argstr[i] not in _notNameChars:
- i = i + 1
- res.append(argstr[j:i])
- return i
- def qname(self, argstr, i, res):
- """
- xyz:def -> ('xyz', 'def')
- If not in keywords and keywordsSet: def -> ('', 'def')
- :def -> ('', 'def')
- """
- i = self.skipSpace(argstr, i)
- if i < 0:
- return -1
- c = argstr[i]
- if c in "0123456789-+.":
- return -1
- if c not in _notNameChars:
- ln = c
- i = i + 1
- while i < len(argstr):
- c = argstr[i]
- if c == "." or c not in _notNameChars:
- ln = ln + c
- i = i + 1
- else:
- break
- if argstr[i - 1] == ".": # qname cannot end with "."
- return -1
- else: # First character is non-alpha
- ln = '' # Was: None - TBL (why? useful?)
- if i < len(argstr) and argstr[i] == ':':
- pfx = ln
- i = i + 1
- ln = ''
- while i < len(argstr):
- c = argstr[i]
- if c not in _notNameChars:
- ln = ln + c
- i = i + 1
- else:
- break
- res.append((pfx, ln))
- return i
- else: # delimiter was not ":"
- if ln and self.keywordsSet and ln not in self.keywords:
- res.append(('', ln))
- return i
- return -1
- def object(self, argstr, i, res):
- j = self.subject(argstr, i, res)
- if j >= 0:
- return j
- else:
- j = self.skipSpace(argstr, i)
- if j < 0:
- return -1
- else:
- i = j
- if argstr[i] == '"':
- if argstr[i:i + 3] == '"""':
- delim = '"""'
- else:
- delim = '"'
- i = i + len(delim)
- j, s = self.strconst(argstr, i, delim)
- res.append(self._store.newLiteral(s))
- progress("New string const ", s, j)
- return j
- else:
- return -1
- def nodeOrLiteral(self, argstr, i, res):
- j = self.node(argstr, i, res)
- startline = self.lines # Remember where for error messages
- if j >= 0:
- return j
- else:
- j = self.skipSpace(argstr, i)
- if j < 0:
- return -1
- else:
- i = j
- ch = argstr[i]
- if ch in "-+0987654321":
- m = number_syntax.match(argstr, i)
- if m == None:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "Bad number syntax")
- j = m.end()
- if m.group('exponent') != None: # includes decimal exponent
- res.append(float(argstr[i:j]))
- # res.append(self._store.newLiteral(argstr[i:j],
- # self._store.newSymbol(FLOAT_DATATYPE)))
- elif m.group('decimal') != None:
- res.append(Decimal(argstr[i:j]))
- else:
- res.append(long(argstr[i:j]))
- # res.append(self._store.newLiteral(argstr[i:j],
- # self._store.newSymbol(INTEGER_DATATYPE)))
- return j
- if argstr[i] == '"':
- if argstr[i:i + 3] == '"""':
- delim = '"""'
- else:
- delim = '"'
- i = i + len(delim)
- dt = None
- j, s = self.strconst(argstr, i, delim)
- lang = None
- if argstr[j:j + 1] == "@": # Language?
- m = langcode.match(argstr, j + 1)
- if m == None:
- raise BadSyntax(self._thisDoc, startline, argstr, i,
- "Bad language code syntax on string literal, after @")
- i = m.end()
- lang = argstr[j + 1:i]
- j = i
- if argstr[j:j + 2] == "^^":
- res2 = []
- j = self.uri_ref2(argstr, j + 2, res2) # Read datatype URI
- dt = res2[0]
- # if dt.uriref() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral":
- if dt == "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral":
- try:
- dom = XMLtoDOM('<rdf:envelope xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns">'
- + s
- + '</rdf:envelope>').firstChild
- except:
- raise ValueError('s="%s"' % s)
- res.append(self._store.newXMLLiteral(dom))
- return j
- res.append(self._store.newLiteral(s, dt, lang))
- return j
- else:
- return -1
- def uriOf(self, sym):
- if isinstance(sym, types.TupleType):
- return sym[1] # old system for --pipe
- # return sym.uriref() # cwm api
- return sym
- def strconst(self, argstr, i, delim):
- """parse an N3 string constant delimited by delim.
- return index, val
- """
- j = i
- ustr = u"" # Empty unicode string
- startline = self.lines # Remember where for error messages
- while j < len(argstr):
- if argstr[j] == '"':
- if delim == '"': # done when delim is "
- i = j + 1
- return i, ustr
- if delim == '"""': # done when delim is """ and ...
- if argstr[j:j + 5] == '"""""': # ... we have "" before
- i = j + 5
- ustr = ustr + '""'
- return i, ustr
- if argstr[j:j + 4] == '""""': # ... we have " before
- i = j + 4
- ustr = ustr + '"'
- return i, ustr
- if argstr[j:j + 3] == '"""': # ... current " is part of delim
- i = j + 3
- return i, ustr
- # we are inside of the string and current char is "
- j = j + 1
- ustr = ustr + '"'
- continue
- m = interesting.search(argstr, j) # was argstr[j:].
- # Note for pos param to work, MUST be compiled ... re bug?
- assert m, "Quote expected in string at ^ in %s^%s" % (
- argstr[j - 20:j], argstr[j:j + 20]) # we at least have to find a quote
- i = m.start()
- try:
- ustr = ustr + argstr[j:i]
- except UnicodeError:
- err = ""
- for c in argstr[j:i]:
- err = err + (" %02x" % ord(c))
- streason = sys.exc_info()[1].__str__()
- raise BadSyntax(self._thisDoc, startline, argstr, j,
- "Unicode error appending characters %s to string, because\n\t%s"
- % (err, streason))
- # print "@@@ i = ",i, " j=",j, "m.end=", m.end()
- ch = argstr[i]
- if ch == '"':
- j = i
- continue
- elif ch == "\r": # Strip carriage returns
- j = i + 1
- continue
- elif ch == "\n":
- if delim == '"':
- raise BadSyntax(self._thisDoc, startline, argstr, i,
- "newline found in string literal")
- self.lines = self.lines + 1
- ustr = ustr + ch
- j = i + 1
- self.startOfLine = j
- elif ch == "\\":
- j = i + 1
- ch = argstr[j:j + 1] # Will be empty if string ends
- if not ch:
- raise BadSyntax(self._thisDoc, startline, argstr, i,
- "unterminated string literal (2)")
- k = 'abfrtvn\\"'.find(ch)
- if k >= 0:
- uch = '\a\b\f\r\t\v\n\\"'[k]
- ustr = ustr + uch
- j = j + 1
- elif ch == "u":
- j, ch = self.uEscape(argstr, j + 1, startline)
- ustr = ustr + ch
- elif ch == "U":
- j, ch = self.UEscape(argstr, j + 1, startline)
- ustr = ustr + ch
- else:
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "bad escape")
- raise BadSyntax(self._thisDoc, self.lines, argstr, i,
- "unterminated string literal")
- def uEscape(self, argstr, i, startline):
- j = i
- count = 0
- value = 0
- while count < 4: # Get 4 more characters
- ch = argstr[j:j + 1].lower()
- # sbp http://ilrt.org/discovery/chatlogs/rdfig/2002-07-05
- j = j + 1
- if ch == "":
- raise BadSyntax(self._thisDoc, startline, argstr, i,
- "unterminated string literal(3)")
- k = "0123456789abcdef".find(ch)
- if k < 0:
- raise BadSyntax(self._thisDoc, startline, argstr, i,
- "bad string literal hex escape")
- value = value * 16 + k
- count = count + 1
- uch = unichr(value)
- return j, uch
- def UEscape(self, argstr, i, startline):
- stringType = type('')
- j = i
- count = 0
- value = '\\U'
- while count < 8: # Get 8 more characters
- ch = argstr[j:j + 1].lower()
- # sbp http://ilrt.org/discovery/chatlogs/rdfig/2002-07-05
- j = j + 1
- if ch == "":
- raise BadSyntax(self._thisDoc, startline, argstr, i,
- "unterminated string literal(3)")
- k = "0123456789abcdef".find(ch)
- if k < 0:
- raise BadSyntax(self._thisDoc, startline, argstr, i,
- "bad string literal hex escape")
- value = value + ch
- count = count + 1
- uch = stringType(value).decode('unicode-escape')
- return j, uch
- wide_build = True
- try:
- unichr(0x10000)
- except ValueError:
- wide_build = False
- # If we are going to do operators then they should generate
- # [ is operator:plus of ( \1 \2 ) ]
- class BadSyntax(SyntaxError):
- def __init__(self, uri, lines, argstr, i, why):
- self._str = argstr.encode('utf-8') # Better go back to strings for errors
- self._i = i
- self._why = why
- self.lines = lines
- self._uri = uri
- def __str__(self):
- argstr = self._str
- i = self._i
- st = 0
- if i > 60:
- pre = "..."
- st = i - 60
- else:
- pre = ""
- if len(argstr) - i > 60:
- post = "..."
- else:
- post = ""
- return 'at line %i of <%s>:\nBad syntax (%s) at ^ in:\n"%s%s^%s%s"' \
- % (self.lines + 1, self._uri, self._why, pre,
- argstr[st:i], argstr[i:i + 60], post)
- def stripCR(argstr):
- res = ""
- for ch in argstr:
- if ch != "\r":
- res = res + ch
- return res
- def dummyWrite(x):
- pass
- ################################################################################
- def toBool(s):
- if s == 'true' or s == 'True' or s == '1':
- return True
- if s == 'false' or s == 'False' or s == '0':
- return False
- raise ValueError(s)
- class Formula(object):
- number = 0
- def __init__(self, parent):
- self.counter = 0
- Formula.number += 1
- self.number = Formula.number
- self.existentials = {}
- self.universals = {}
- self.quotedgraph = QuotedGraph(
- store=parent.store, identifier=self.id())
- def __str__(self):
- return '_:Formula%s' % self.number
- def id(self):
- return BNode('_:Formula%s' % self.number)
- def newBlankNode(self, uri=None, why=None):
- if uri is None:
- self.counter += 1
- bn = BNode('f%sb%s' % (id(self), self.counter))
- else:
- bn = BNode(uri.split('#').pop().replace('_', 'b'))
- return bn
- def newUniversal(self, uri, why=None):
- return Variable(uri.split('#').pop())
- def declareExistential(self, x):
- self.existentials[x] = self.newBlankNode()
- def close(self):
- return self.quotedgraph
- r_hibyte = re.compile(r'([\x80-\xff])')
- def iri(uri):
- return uri.decode('utf-8')
- # return unicode(r_hibyte.sub(lambda m: '%%%02X' % ord(m.group(1)), uri))
- class RDFSink(object):
- def __init__(self, graph):
- self.rootFormula = None
- self.counter = 0
- self.graph = graph
- def newFormula(self):
- assert self.graph.store.formula_aware
- f = Formula(self.graph)
- return f
- def newSymbol(self, *args):
- uri = args[0].encode('utf-8')
- return URIRef(iri(uri))
- def newBlankNode(self, arg=None, **kargs):
- if isinstance(arg, Formula):
- return arg.newBlankNode()
- elif arg is None:
- self.counter += 1
- bn = BNode('n' + str(self.counter))
- else:
- bn = BNode(str(arg[0]).split('#').pop().replace('_', 'b'))
- return bn
- def newLiteral(self, s, dt, lang):
- if dt:
- return Literal(s, datatype=dt)
- else:
- return Literal(s, lang=lang)
- def newList(self, n, f):
- if not n:
- return self.newSymbol(
- 'http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'
- )
- a = self.newBlankNode(f)
- first = self.newSymbol(
- 'http://www.w3.org/1999/02/22-rdf-syntax-ns#first'
- )
- rest = self.newSymbol('http://www.w3.org/1999/02/22-rdf-syntax-ns#rest')
- self.makeStatement((f, first, a, n[0]))
- self.makeStatement((f, rest, a, self.newList(n[1:], f)))
- return a
- def newSet(self, *args):
- return set(args)
- def setDefaultNamespace(self, *args):
- return ':'.join(repr(n) for n in args)
- def makeStatement(self, quadruple, why=None):
- f, p, s, o = quadruple
- if hasattr(p, 'formula'):
- raise Exception("Formula used as predicate")
- s = self.normalise(f, s)
- p = self.normalise(f, p)
- o = self.normalise(f, o)
- if f == self.rootFormula:
- # print s, p, o, '.'
- self.graph.add((s, p, o))
- else:
- f.quotedgraph.add((s, p, o))
- # return str(quadruple)
- def normalise(self, f, n):
- if isinstance(n, tuple):
- return URIRef(unicode(n[1]))
- # if isinstance(n, list):
- # rdflist, f = n
- # name = self.newBlankNode()
- # if f == self.rootFormula:
- # sublist = name
- # for i in xrange(0, len(rdflist) - 1):
- # print sublist, 'first', rdflist[i]
- # rest = self.newBlankNode()
- # print sublist, 'rest', rest
- # sublist = rest
- # print sublist, 'first', rdflist[-1]
- # print sublist, 'rest', 'nil'
- # return name
- if isinstance(n, bool):
- s = Literal(str(n).lower(), datatype=BOOLEAN_DATATYPE)
- return s
- if isinstance(n, int) or isinstance(n, long):
- s = Literal(unicode(n), datatype=INTEGER_DATATYPE)
- return s
- if isinstance(n, Decimal):
- value = str(n.normalize())
- if value == '-0':
- value = '0'
- s = Literal(value, datatype=DECIMAL_DATATYPE)
- return s
- if isinstance(n, float):
- s = Literal(str(n), datatype=DOUBLE_DATATYPE)
- return s
- if n in f.existentials:
- return f.existentials[n]
- # if isinstance(n, Var):
- # if f.universals.has_key(n):
- # return f.universals[n]
- # f.universals[n] = f.newBlankNode()
- # return f.universals[n]
- return n
- def intern(self, something):
- return something
- def bind(self, pfx, uri):
- pass # print pfx, ':', uri
- def startDoc(self, formula):
- self.rootFormula = formula
- def endDoc(self, formula):
- pass
- ###################################################
- #
- # Utilities
- #
- Escapes = {'a': '\a',
- 'b': '\b',
- 'f': '\f',
- 'r': '\r',
- 't': '\t',
- 'v': '\v',
- 'n': '\n',
- '\\': '\\',
- '"': '"'}
- forbidden1 = re.compile(ur'[\\\"\a\b\f\r\v\u0080-\U0000ffff]')
- forbidden2 = re.compile(ur'[\\\"\a\b\f\r\v\t\n\u0080-\U0000ffff]')
- def stringToN3(argstr, singleLine=0, flags=""):
- res = ''
- if (len(argstr) > 20 and argstr[-1] != '"' \
- and not singleLine and (argstr.find("\n") >= 0 \
- or argstr.find('"') >= 0)):
- delim = '"""'
- forbidden = forbidden1 # (allow tabs too now)
- else:
- delim = '"'
- forbidden = forbidden2
- i = 0
- while i < len(argstr):
- m = forbidden.search(argstr, i)
- if not m:
- break
- j = m.start()
- res = res + argstr[i:j]
- ch = m.group(0)
- if ch == '"' and delim == '"""' and argstr[j:j + 3] != '"""': # "
- res = res + ch
- else:
- k = '\a\b\f\r\t\v\n\\"'.find(ch)
- if k >= 0:
- res = res + "\\" + 'abfrtvn\\"'[k]
- else:
- if 'e' in flags:
- # res = res + ('\\u%04x' % ord(ch))
- res = res + ('\\u%04X' % ord(ch))
- # http://www.w3.org/TR/rdf-testcases/#ntriples
- else:
- res = res + ch
- i = j + 1
- # The following code fixes things for really high range Unicode
- newstr = ""
- for ch in res + argstr[i:]:
- if ord(ch) > 65535:
- newstr = newstr + ('\\U%08X' % ord(ch))
- # http://www.w3.org/TR/rdf-testcases/#ntriples
- else:
- newstr = newstr + ch
- return delim + newstr + delim
- def backslashUify(ustr):
- """Use URL encoding to return an ASCII string corresponding
- to the given unicode"""
- # progress("String is "+`ustr`)
- # s1=ustr.encode('utf-8')
- s = ""
- for ch in ustr: # .encode('utf-8'):
- if ord(ch) > 65535:
- ch = "\\U%08X" % ord(ch)
- elif ord(ch) > 126:
- ch = "\\u%04X" % ord(ch)
- else:
- ch = "%c" % ord(ch)
- s = s + ch
- return b(s)
- @py3compat.format_doctest_out
- def hexify(ustr):
- """Use URL encoding to return an ASCII string
- corresponding to the given UTF8 string
- >>> hexify("http://example/a b")
- %(b)s'http://example/a%%20b'
- """
- # progress("String is "+`ustr`)
- # s1=ustr.encode('utf-8')
- s = ""
- for ch in ustr: # .encode('utf-8'):
- if ord(ch) > 126 or ord(ch) < 33:
- ch = "%%%02X" % ord(ch)
- else:
- ch = "%c" % ord(ch)
- s = s + ch
- return b(s)
- # # Unused, dysfunctional.
- # def dummy():
- # res = ""
- # if len(argstr) > 20 and (argstr.find("\n") >=0 or argstr.find('"') >=0):
- # delim= '"""'
- # forbidden = "\\\"\a\b\f\r\v" # (allow tabs too now)
- # else:
- # delim = '"'
- # forbidden = "\\\"\a\b\f\r\v\t\n"
- # for i in range(len(argstr)):
- # ch = argstr[i]
- # j = forbidden.find(ch)
- # if ch == '"' and delim == '"""' \
- # and i+1 < len(argstr) and argstr[i+1] != '"':
- # j=-1 # Single quotes don't need escaping in long format
- # if j >= 0:
- # ch = "\\" + '\\"abfrvtn'[j]
- # elif ch not in "\n\t" and (ch < " " or ch > "}"):
- # ch = "[[" + `ch` + "]]" # [2:-1] # Use python
- # res = res + ch
- # return delim + res + delim
- class TurtleParser(Parser):
- def __init__(self):
- pass
- def parse(self, source, graph, encoding="utf-8"):
- if encoding not in [None, "utf-8"]:
- raise Exception("N3/Turtle files are always utf-8 encoded, I was passed: %s" % encoding)
- sink = RDFSink(graph)
- baseURI = graph.absolutize(source.getPublicId() or source.getSystemId() or "")
- p = SinkParser(sink, baseURI=baseURI)
- p.loadStream(source.getByteStream())
- for prefix, namespace in p._bindings.items():
- graph.bind(prefix, namespace)
- class N3Parser(TurtleParser):
- def __init__(self):
- pass
- def parse(self, source, graph, encoding="utf-8"):
- # we're currently being handed a Graph, not a ConjunctiveGraph
- assert graph.store.context_aware # is this implied by formula_aware
- assert graph.store.formula_aware
- conj_graph = ConjunctiveGraph(store=graph.store)
- conj_graph.default_context = graph # TODO: CG __init__ should have a default_context arg
- # TODO: update N3Processor so that it can use conj_graph as the sink
- conj_graph.namespace_manager = graph.namespace_manager
-
- TurtleParser.parse(self,source,conj_graph,encoding)
- def _test():
- import doctest
- doctest.testmod()
- # if __name__ == '__main__':
- # _test()
- def main():
- g = ConjunctiveGraph()
- sink = RDFSink(g)
- base_uri = 'file://' + os.path.join(os.getcwd(), sys.argv[1])
- p = SinkParser(sink, baseURI=base_uri)
- p._bindings[''] = p._baseURI + '#'
- p.startDoc()
- f = open(sys.argv[1], 'rb')
- rdbytes = f.read()
- f.close()
- p.feed(rdbytes)
- p.endDoc()
- for t in g.quads((None, None, None)):
- print t
- if __name__ == '__main__':
- main()
- #ends