PageRenderTime 51ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/ntriples.py

https://github.com/strategist922/mrlin
Python | 240 lines | 193 code | 30 blank | 17 comment | 31 complexity | 17a335771416d10d544a8d0b6f9be515 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. N-Triples Parser
  4. Copyright 2004, Sean B. Palmer, inamidst.com
  5. Licensed under GPL 2, W3C, BSD, MIT, or EFL 2
  6. Documentation:
  7. http://inamidst.com/proj/rdf/ntriples-doc
  8. Command line usage:
  9. ./ntriples.py <URI> - parses URI as N-Triples
  10. ./ntriples.py --help - prints out this help message
  11. # @@ fully empty document?
  12. """
  13. import re
  14. uriref = r'<([^:]+:[^\s"<>]+)>'
  15. literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
  16. litinfo = r'(?:@([a-z]+(?:-[a-z0-9]+)*)|\^\^' + uriref + r')?'
  17. r_line = re.compile(r'([^\r\n]*)(?:\r\n|\r|\n)')
  18. r_wspace = re.compile(r'[ \t]*')
  19. r_wspaces = re.compile(r'[ \t]+')
  20. r_tail = re.compile(r'[ \t]*\.[ \t]*')
  21. r_uriref = re.compile(uriref)
  22. r_nodeid = re.compile(r'_:([A-Za-z][A-Za-z0-9]*)')
  23. r_literal = re.compile(literal + litinfo)
  24. bufsiz = 2048
  25. validate = False
  26. class Node(unicode): pass
  27. class URI(Node): pass
  28. class bNode(Node): pass
  29. class Literal(Node):
  30. def __new__(cls, lit, lang=None, dtype=None):
  31. n = str(lang) + ' ' + str(dtype) + ' ' + lit
  32. return unicode.__new__(cls, n)
  33. class Sink(object):
  34. def __init__(self):
  35. self.length = 0
  36. def triple(self, s, p, o):
  37. self.length += 1
  38. print (s, p, o)
  39. class ParseError(Exception): pass
  40. quot = {'t': '\t', 'n': '\n', 'r': '\r', '"': '"', '\\': '\\'}
  41. r_safe = re.compile(r'([\x20\x21\x23-\x5B\x5D-\x7E]+)')
  42. r_quot = re.compile(r'\\(t|n|r|"|\\)')
  43. r_uniquot = re.compile(r'\\u([0-9A-F]{4})|\\U([0-9A-F]{8})')
  44. def unquote(s):
  45. """Unquote an N-Triples string."""
  46. result = []
  47. while s:
  48. m = r_safe.match(s)
  49. if m:
  50. s = s[m.end():]
  51. result.append(m.group(1))
  52. continue
  53. m = r_quot.match(s)
  54. if m:
  55. s = s[2:]
  56. result.append(quot[m.group(1)])
  57. continue
  58. m = r_uniquot.match(s)
  59. if m:
  60. s = s[m.end():]
  61. u, U = m.groups()
  62. codepoint = int(u or U, 16)
  63. if codepoint > 0x10FFFF:
  64. raise ParseError("Disallowed codepoint: %08X" % codepoint)
  65. result.append(unichr(codepoint))
  66. elif s.startswith('\\'):
  67. raise ParseError("Illegal escape at: %s..." % s[:10])
  68. else: raise ParseError("Illegal literal character: %r" % s[0])
  69. return unicode(''.join(result))
  70. if not validate:
  71. def unquote(s):
  72. return s.decode('unicode-escape')
  73. r_hibyte = re.compile(r'([\x80-\xFF])')
  74. def uriquote(uri):
  75. return r_hibyte.sub(lambda m: '%%%02X' % ord(m.group(1)), uri)
  76. if not validate:
  77. def uriquote(uri):
  78. return uri
  79. class NTriplesParser(object):
  80. """An N-Triples Parser.
  81. Usage:
  82. p = NTriplesParser(sink=MySink())
  83. sink = p.parse(f) # file; use parsestring for a string
  84. """
  85. def __init__(self, sink=None):
  86. if sink is not None:
  87. self.sink = sink
  88. else: self.sink = Sink()
  89. def parse(self, f):
  90. """Parse f as an N-Triples file."""
  91. if not hasattr(f, 'read'):
  92. raise ParseError("Item to parse must be a file-like object.")
  93. self.file = f
  94. self.buffer = ''
  95. while True:
  96. self.line = self.readline()
  97. if self.line is None: break
  98. try: self.parseline()
  99. except ParseError:
  100. raise ParseError("Invalid line: %r" % self.line)
  101. return self.sink
  102. def parsestring(self, s):
  103. """Parse s as an N-Triples string."""
  104. if not isinstance(s, basestring):
  105. raise ParseError("Item to parse must be a string instance.")
  106. from cStringIO import StringIO
  107. f = StringIO()
  108. f.write(s)
  109. f.seek(0)
  110. self.parse(f)
  111. def readline(self):
  112. """Read an N-Triples line from buffered input."""
  113. # N-Triples lines end in either CRLF, CR, or LF
  114. # Therefore, we can't just use f.readline()
  115. if not self.buffer:
  116. buffer = self.file.read(bufsiz)
  117. if not buffer: return None
  118. self.buffer = buffer
  119. while True:
  120. m = r_line.match(self.buffer)
  121. if m: # the more likely prospect
  122. self.buffer = self.buffer[m.end():]
  123. return m.group(1)
  124. else:
  125. buffer = self.file.read(bufsiz)
  126. if not buffer:
  127. raise ParseError("EOF in line")
  128. self.buffer += buffer
  129. def parseline(self):
  130. self.eat(r_wspace)
  131. if (not self.line) or self.line.startswith('#'):
  132. return # The line is empty or a comment
  133. subject = self.subject()
  134. self.eat(r_wspaces)
  135. predicate = self.predicate()
  136. self.eat(r_wspaces)
  137. object = self.object()
  138. self.eat(r_tail)
  139. if self.line:
  140. raise ParseError("Trailing garbage")
  141. self.sink.triple(subject, predicate, object)
  142. def peek(self, token):
  143. return self.line.startswith(token)
  144. def eat(self, pattern):
  145. m = pattern.match(self.line)
  146. if not m: # @@ Why can't we get the original pattern?
  147. raise ParseError("Failed to eat %s" % pattern)
  148. self.line = self.line[m.end():]
  149. return m
  150. def subject(self):
  151. # @@ Consider using dictionary cases
  152. subj = self.uriref() or self.nodeid()
  153. if not subj:
  154. raise ParseError("Subject must be uriref or nodeID")
  155. return subj
  156. def predicate(self):
  157. pred = self.uriref()
  158. if not pred:
  159. raise ParseError("Predicate must be uriref")
  160. return pred
  161. def object(self):
  162. objt = self.uriref() or self.nodeid() or self.literal()
  163. if not objt:
  164. raise ParseError("Unrecognised object type")
  165. return objt
  166. def uriref(self):
  167. if self.peek('<'):
  168. uri = self.eat(r_uriref).group(1)
  169. uri = unquote(uri)
  170. uri = uriquote(uri)
  171. return URI(uri)
  172. return False
  173. def nodeid(self):
  174. if self.peek('_'):
  175. return bNode(self.eat(r_nodeid).group(1))
  176. return False
  177. def literal(self):
  178. if self.peek('"'):
  179. lit, lang, dtype = self.eat(r_literal).groups()
  180. if lang and dtype:
  181. raise ParseError("Can't have both a language and a datatype")
  182. lit = unquote(lit)
  183. return Literal(lit, lang, dtype)
  184. return False
  185. def parseURI(uri):
  186. import urllib
  187. parser = NTriplesParser()
  188. u = urllib.urlopen(uri)
  189. sink = parser.parse(u)
  190. u.close()
  191. # for triple in sink:
  192. # print triple
  193. print 'Length of input:', sink.length
  194. def main():
  195. import sys
  196. if len(sys.argv) == 2:
  197. parseURI(sys.argv[1])
  198. else: print __doc__
  199. if __name__=="__main__":
  200. main()