/ntriples.py
https://github.com/strategist922/mrlin · Python · 240 lines · 174 code · 39 blank · 27 comment · 42 complexity · 17a335771416d10d544a8d0b6f9be515 MD5 · raw file
- #!/usr/bin/env python
- """
- N-Triples Parser
- Copyright 2004, Sean B. Palmer, inamidst.com
- Licensed under GPL 2, W3C, BSD, MIT, or EFL 2
- Documentation:
- http://inamidst.com/proj/rdf/ntriples-doc
- Command line usage:
- ./ntriples.py <URI> - parses URI as N-Triples
- ./ntriples.py --help - prints out this help message
- # @@ fully empty document?
- """
- import re
# --- Pattern fragments (plain strings, combined into the regexes below) ---

# A URI reference in angle brackets; the group captures the text between them.
uriref = r'<([^:]+:[^\s"<>]+)>'
# A double-quoted literal; the group captures the still-escaped content.
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
# Optional literal suffix: '@lang' (first group) or '^^<datatype>' (second).
litinfo = r'(?:@([a-z]+(?:-[a-z0-9]+)*)|\^\^%s)?' % uriref

# --- Compiled token patterns ---

# One line of input plus its terminator (CRLF, CR or LF); group 1 is the line.
r_line = re.compile(r'([^\r\n]*)(?:\r\n|\r|\n)')
# Optional and mandatory runs of spaces/tabs.
r_wspace = re.compile(r'[ \t]*')
r_wspaces = re.compile(r'[ \t]+')
# The '.' that ends a triple, with optional surrounding spaces/tabs.
r_tail = re.compile(r'[ \t]*\.[ \t]*')
r_uriref = re.compile(uriref)
# A blank node label '_:name'; group 1 is the name.
r_nodeid = re.compile(r'_:([A-Za-z][A-Za-z0-9]*)')
# A literal with its optional suffix; groups are (content, lang, datatype).
r_literal = re.compile(''.join((literal, litinfo)))

# Number of characters requested from the input file per read.
bufsiz = 2048
# When false, unquote() and uriquote() are replaced by non-validating versions.
validate = False
class Node(unicode):
    """Common base for parsed RDF terms; each term is a unicode string."""


class URI(Node):
    """A URI reference term; the string value is the URI text."""


class bNode(Node):
    """A blank node term; the string value is the node's label."""
class Literal(Node):
    """A literal term stored as the single string '<lang> <dtype> <text>'.

    The language tag and datatype are folded into the string value via
    ``str()``, so an absent one shows up as the text ``None``; they are not
    kept as separate attributes.
    """

    def __new__(cls, lit, lang=None, dtype=None):
        fields = [str(lang), str(dtype), lit]
        return unicode.__new__(cls, ' '.join(fields))
class Sink(object):
    """Default triple consumer: counts the triples it is given and prints them."""

    def __init__(self):
        # Number of triples received so far.
        self.length = 0

    def triple(self, s, p, o):
        """Count one (subject, predicate, object) triple and print it as a tuple."""
        statement = (s, p, o)
        self.length = self.length + 1
        print(statement)
class ParseError(Exception):
    """Raised when input cannot be parsed as N-Triples."""
# Single-character escapes, keyed by the character that follows the backslash.
quot = {
    't': '\t',
    'n': '\n',
    'r': '\r',
    '"': '"',
    '\\': '\\',
}

# A run of printable ASCII characters other than '"' and '\'.
r_safe = re.compile(r'([\x20\x21\x23-\x5B\x5D-\x7E]+)')
# A backslash followed by one of the keys of ``quot``.
r_quot = re.compile(r'\\(t|n|r|"|\\)')
# \uXXXX or \UXXXXXXXX with upper-case hex digits; one group per form.
r_uniquot = re.compile(r'\\u([0-9A-F]{4})|\\U([0-9A-F]{8})')
def unquote(s):
    """Decode the escapes in an N-Triples string, validating as it goes.

    The input is consumed from the left as a sequence of runs of safe
    characters, single-character escapes (looked up in ``quot``) and
    ``\\uXXXX`` / ``\\UXXXXXXXX`` escapes.

    Returns the decoded text as a unicode string.
    Raises ParseError for a code point above 0x10FFFF, for a backslash that
    starts no recognised escape, and for any other disallowed character.
    """
    decoded = []
    rest = s
    while rest:
        plain = r_safe.match(rest)
        if plain is not None:
            decoded.append(plain.group(1))
            rest = rest[plain.end():]
            continue

        simple = r_quot.match(rest)
        if simple is not None:
            decoded.append(quot[simple.group(1)])
            # A simple escape is always a backslash plus one character.
            rest = rest[2:]
            continue

        escaped = r_uniquot.match(rest)
        if escaped is None:
            if rest.startswith('\\'):
                raise ParseError("Illegal escape at: %s..." % rest[:10])
            raise ParseError("Illegal literal character: %r" % rest[0])

        # Exactly one of the two groups is set, depending on \u versus \U.
        short_hex, long_hex = escaped.groups()
        codepoint = int(short_hex or long_hex, 16)
        if codepoint > 0x10FFFF:
            raise ParseError("Disallowed codepoint: %08X" % codepoint)
        decoded.append(unichr(codepoint))
        rest = rest[escaped.end():]
    return unicode(''.join(decoded))
if not validate:
    def unquote(s):
        """Non-validating replacement: decode s with the 'unicode-escape' codec."""
        decoded = s.decode('unicode-escape')
        return decoded
# A single character in the range 0x80-0xFF.
r_hibyte = re.compile(r'([\x80-\xFF])')


def uriquote(uri):
    """Return uri with each character in the range 0x80-0xFF percent-encoded."""
    def percent_escape(match):
        # '%' followed by the character's ordinal as two upper-case hex digits.
        return '%%%02X' % ord(match.group(1))

    return r_hibyte.sub(percent_escape, uri)
if not validate:
    def uriquote(uri):
        """Non-validating replacement: hand the URI back unchanged."""
        return uri
class NTriplesParser(object):
    """An N-Triples parser that hands each parsed triple to a sink.

    Usage:
        p = NTriplesParser(sink=MySink())
        sink = p.parse(f)            # f is a file-like object
        sink = p.parsestring(text)   # the same, for a string

    The sink must provide a ``triple(s, p, o)`` method. When no sink is
    given, a ``Sink`` instance is used.
    """

    def __init__(self, sink=None):
        if sink is None:
            sink = Sink()
        self.sink = sink

    def parse(self, f):
        """Parse f as an N-Triples file and return the sink.

        Raises ParseError if f has no ``read`` attribute or if a line cannot
        be parsed.
        """
        if not hasattr(f, 'read'):
            raise ParseError("Item to parse must be a file-like object.")
        self.file = f
        self.buffer = ''
        while True:
            self.line = self.readline()
            if self.line is None:
                break
            try:
                self.parseline()
            except ParseError:
                # self.line now holds the part of the line not yet consumed,
                # i.e. the text starting at the point of failure.
                raise ParseError("Invalid line: %r" % self.line)
        return self.sink

    def parsestring(self, s):
        """Parse s as an N-Triples string and return the sink.

        Raises ParseError if s is not a string or if a line cannot be parsed.
        """
        if not isinstance(s, basestring):
            raise ParseError("Item to parse must be a string instance.")
        from cStringIO import StringIO
        f = StringIO()
        f.write(s)
        f.seek(0)
        # Return the sink, as parse() does, instead of discarding it.
        return self.parse(f)

    def readline(self):
        """Read one N-Triples line from buffered input.

        Returns the line without its terminator, or None once the input is
        exhausted. A final line with no terminator is returned as-is, unless
        the module-level ``validate`` flag is set, in which case ParseError
        is raised.
        """
        # N-Triples lines end in either CRLF, CR, or LF
        # Therefore, we can't just use f.readline()
        if not self.buffer:
            buffer = self.file.read(bufsiz)
            if not buffer:
                return None
            self.buffer = buffer

        while True:
            m = r_line.match(self.buffer)
            if m:  # the more likely prospect
                self.buffer = self.buffer[m.end():]
                return m.group(1)

            chunk = self.file.read(bufsiz)
            if chunk:
                self.buffer += chunk
                continue

            # End of input with an unterminated final line in the buffer.
            if validate:
                raise ParseError("EOF in line")
            line, self.buffer = self.buffer, ''
            return line

    def parseline(self):
        """Parse self.line and pass the resulting triple, if any, to the sink.

        Empty lines and lines starting with '#' (after leading whitespace)
        are ignored. Raises ParseError when the line is not a valid triple.
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith('#'):
            return  # The line is empty or a comment

        subject = self.subject()
        self.eat(r_wspaces)
        predicate = self.predicate()
        self.eat(r_wspaces)
        obj = self.object()
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")
        self.sink.triple(subject, predicate, obj)

    def peek(self, token):
        """Return whether the unconsumed part of the line starts with token."""
        return self.line.startswith(token)

    def eat(self, pattern):
        """Consume pattern from the start of the line and return the match.

        Raises ParseError, naming the pattern's source text, when the
        pattern does not match at the start of the line.
        """
        m = pattern.match(self.line)
        if not m:
            # Compiled patterns expose their source as ``.pattern``.
            raise ParseError("Failed to eat %s" % pattern.pattern)
        self.line = self.line[m.end():]
        return m

    def subject(self):
        """Consume and return the subject: a URI reference or a blank node."""
        # @@ Consider using dictionary cases
        subj = self.uriref() or self.nodeid()
        if not subj:
            raise ParseError("Subject must be uriref or nodeID")
        return subj

    def predicate(self):
        """Consume and return the predicate, which must be a URI reference."""
        pred = self.uriref()
        if not pred:
            raise ParseError("Predicate must be uriref")
        return pred

    def object(self):
        """Consume and return the object: a URI reference, blank node or literal."""
        objt = self.uriref() or self.nodeid() or self.literal()
        if not objt:
            raise ParseError("Unrecognised object type")
        return objt

    def uriref(self):
        """Consume a URI reference and return it as a URI, or return False
        when the line does not start with '<'."""
        if self.peek('<'):
            uri = self.eat(r_uriref).group(1)
            uri = unquote(uri)
            uri = uriquote(uri)
            return URI(uri)
        return False

    def nodeid(self):
        """Consume a blank node label and return it as a bNode, or return
        False when the line does not start with '_'."""
        if self.peek('_'):
            return bNode(self.eat(r_nodeid).group(1))
        return False

    def literal(self):
        """Consume a literal and return it as a Literal, or return False
        when the line does not start with a double quote."""
        if self.peek('"'):
            lit, lang, dtype = self.eat(r_literal).groups()
            if lang and dtype:
                raise ParseError("Can't have both a language and a datatype")
            lit = unquote(lit)
            return Literal(lit, lang, dtype)
        return False
def parseURI(uri):
    """Fetch uri, parse it as N-Triples and print the number of triples read.

    Triples go to the parser's default sink. The connection opened for uri
    is closed even when parsing raises.
    """
    import urllib
    parser = NTriplesParser()
    u = urllib.urlopen(uri)
    try:
        sink = parser.parse(u)
    finally:
        # Release the connection whether or not parsing succeeded.
        u.close()
    print('Length of input: %s' % sink.length)
def main():
    """Command-line entry point.

    With exactly one argument other than ``--help``, parse that argument as
    a URI. With ``--help``, or any other number of arguments, print the
    module documentation.
    """
    import sys
    args = sys.argv[1:]
    # '--help' is a documented flag and must not be treated as a URI.
    if len(args) == 1 and args[0] != '--help':
        parseURI(args[0])
    else:
        print(__doc__)


if __name__ == "__main__":
    main()