PageRenderTime 46ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/lib-python/2.7/sgmllib.py

https://bitbucket.org/prestontimmons/pypy
Python | 553 lines | 502 code | 29 blank | 22 comment | 14 complexity | 38f449092dd9f01486978c92ace937b6 MD5 | raw file
  1. """A parser for SGML, using the derived class as a static DTD."""
  2. # XXX This only supports those SGML features used by HTML.
  3. # XXX There should be a way to distinguish between PCDATA (parsed
  4. # character data -- the normal case), RCDATA (replaceable character
  5. # data -- only char and entity references and end tags are special)
  6. # and CDATA (character data -- only end tags are special). RCDATA is
  7. # not supported at all.
  8. from warnings import warnpy3k
  9. warnpy3k("the sgmllib module has been removed in Python 3.0",
  10. stacklevel=2)
  11. del warnpy3k
  12. import markupbase
  13. import re
  14. __all__ = ["SGMLParser", "SGMLParseError"]
  15. # Regular expressions used for parsing
  16. interesting = re.compile('[&<]')
  17. incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
  18. '<([a-zA-Z][^<>]*|'
  19. '/([a-zA-Z][^<>]*)?|'
  20. '![^<>]*)?')
  21. entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  22. charref = re.compile('&#([0-9]+)[^0-9]')
  23. starttagopen = re.compile('<[>a-zA-Z]')
  24. shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
  25. shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
  26. piclose = re.compile('>')
  27. endbracket = re.compile('[<>]')
  28. tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
  29. attrfind = re.compile(
  30. r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
  31. r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
  32. class SGMLParseError(RuntimeError):
  33. """Exception raised for all parse errors."""
  34. pass
  35. # SGML parser base class -- find tags and call handler functions.
  36. # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  37. # The dtd is defined by deriving a class which defines methods
  38. # with special names to handle tags: start_foo and end_foo to handle
  39. # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  40. # (Tags are converted to lower case for this purpose.) The data
  41. # between tags is passed to the parser by calling self.handle_data()
  42. # with some data as argument (the data may be split up in arbitrary
  43. # chunks). Entity references are passed by calling
  44. # self.handle_entityref() with the entity reference as argument.
  45. class SGMLParser(markupbase.ParserBase):
  46. # Definition of entities -- derived classes may override
  47. entity_or_charref = re.compile('&(?:'
  48. '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
  49. ')(;?)')
  50. def __init__(self, verbose=0):
  51. """Initialize and reset this instance."""
  52. self.verbose = verbose
  53. self.reset()
  54. def reset(self):
  55. """Reset this instance. Loses all unprocessed data."""
  56. self.__starttag_text = None
  57. self.rawdata = ''
  58. self.stack = []
  59. self.lasttag = '???'
  60. self.nomoretags = 0
  61. self.literal = 0
  62. markupbase.ParserBase.reset(self)
  63. def setnomoretags(self):
  64. """Enter literal mode (CDATA) till EOF.
  65. Intended for derived classes only.
  66. """
  67. self.nomoretags = self.literal = 1
  68. def setliteral(self, *args):
  69. """Enter literal mode (CDATA).
  70. Intended for derived classes only.
  71. """
  72. self.literal = 1
  73. def feed(self, data):
  74. """Feed some data to the parser.
  75. Call this as often as you want, with as little or as much text
  76. as you want (may include '\n'). (This just saves the text,
  77. all the processing is done by goahead().)
  78. """
  79. self.rawdata = self.rawdata + data
  80. self.goahead(0)
  81. def close(self):
  82. """Handle the remaining data."""
  83. self.goahead(1)
  84. def error(self, message):
  85. raise SGMLParseError(message)
  86. # Internal -- handle data as far as reasonable. May leave state
  87. # and data to be processed by a subsequent call. If 'end' is
  88. # true, force handling all data as if followed by EOF marker.
  89. def goahead(self, end):
  90. rawdata = self.rawdata
  91. i = 0
  92. n = len(rawdata)
  93. while i < n:
  94. if self.nomoretags:
  95. self.handle_data(rawdata[i:n])
  96. i = n
  97. break
  98. match = interesting.search(rawdata, i)
  99. if match: j = match.start()
  100. else: j = n
  101. if i < j:
  102. self.handle_data(rawdata[i:j])
  103. i = j
  104. if i == n: break
  105. if rawdata[i] == '<':
  106. if starttagopen.match(rawdata, i):
  107. if self.literal:
  108. self.handle_data(rawdata[i])
  109. i = i+1
  110. continue
  111. k = self.parse_starttag(i)
  112. if k < 0: break
  113. i = k
  114. continue
  115. if rawdata.startswith("</", i):
  116. k = self.parse_endtag(i)
  117. if k < 0: break
  118. i = k
  119. self.literal = 0
  120. continue
  121. if self.literal:
  122. if n > (i + 1):
  123. self.handle_data("<")
  124. i = i+1
  125. else:
  126. # incomplete
  127. break
  128. continue
  129. if rawdata.startswith("<!--", i):
  130. # Strictly speaking, a comment is --.*--
  131. # within a declaration tag <!...>.
  132. # This should be removed,
  133. # and comments handled only in parse_declaration.
  134. k = self.parse_comment(i)
  135. if k < 0: break
  136. i = k
  137. continue
  138. if rawdata.startswith("<?", i):
  139. k = self.parse_pi(i)
  140. if k < 0: break
  141. i = i+k
  142. continue
  143. if rawdata.startswith("<!", i):
  144. # This is some sort of declaration; in "HTML as
  145. # deployed," this should only be the document type
  146. # declaration ("<!DOCTYPE html...>").
  147. k = self.parse_declaration(i)
  148. if k < 0: break
  149. i = k
  150. continue
  151. elif rawdata[i] == '&':
  152. if self.literal:
  153. self.handle_data(rawdata[i])
  154. i = i+1
  155. continue
  156. match = charref.match(rawdata, i)
  157. if match:
  158. name = match.group(1)
  159. self.handle_charref(name)
  160. i = match.end(0)
  161. if rawdata[i-1] != ';': i = i-1
  162. continue
  163. match = entityref.match(rawdata, i)
  164. if match:
  165. name = match.group(1)
  166. self.handle_entityref(name)
  167. i = match.end(0)
  168. if rawdata[i-1] != ';': i = i-1
  169. continue
  170. else:
  171. self.error('neither < nor & ??')
  172. # We get here only if incomplete matches but
  173. # nothing else
  174. match = incomplete.match(rawdata, i)
  175. if not match:
  176. self.handle_data(rawdata[i])
  177. i = i+1
  178. continue
  179. j = match.end(0)
  180. if j == n:
  181. break # Really incomplete
  182. self.handle_data(rawdata[i:j])
  183. i = j
  184. # end while
  185. if end and i < n:
  186. self.handle_data(rawdata[i:n])
  187. i = n
  188. self.rawdata = rawdata[i:]
  189. # XXX if end: check for empty stack
  190. # Extensions for the DOCTYPE scanner:
  191. _decl_otherchars = '='
  192. # Internal -- parse processing instr, return length or -1 if not terminated
  193. def parse_pi(self, i):
  194. rawdata = self.rawdata
  195. if rawdata[i:i+2] != '<?':
  196. self.error('unexpected call to parse_pi()')
  197. match = piclose.search(rawdata, i+2)
  198. if not match:
  199. return -1
  200. j = match.start(0)
  201. self.handle_pi(rawdata[i+2: j])
  202. j = match.end(0)
  203. return j-i
  204. def get_starttag_text(self):
  205. return self.__starttag_text
  206. # Internal -- handle starttag, return length or -1 if not terminated
  207. def parse_starttag(self, i):
  208. self.__starttag_text = None
  209. start_pos = i
  210. rawdata = self.rawdata
  211. if shorttagopen.match(rawdata, i):
  212. # SGML shorthand: <tag/data/ == <tag>data</tag>
  213. # XXX Can data contain &... (entity or char refs)?
  214. # XXX Can data contain < or > (tag characters)?
  215. # XXX Can there be whitespace before the first /?
  216. match = shorttag.match(rawdata, i)
  217. if not match:
  218. return -1
  219. tag, data = match.group(1, 2)
  220. self.__starttag_text = '<%s/' % tag
  221. tag = tag.lower()
  222. k = match.end(0)
  223. self.finish_shorttag(tag, data)
  224. self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
  225. return k
  226. # XXX The following should skip matching quotes (' or ")
  227. # As a shortcut way to exit, this isn't so bad, but shouldn't
  228. # be used to locate the actual end of the start tag since the
  229. # < or > characters may be embedded in an attribute value.
  230. match = endbracket.search(rawdata, i+1)
  231. if not match:
  232. return -1
  233. j = match.start(0)
  234. # Now parse the data between i+1 and j into a tag and attrs
  235. attrs = []
  236. if rawdata[i:i+2] == '<>':
  237. # SGML shorthand: <> == <last open tag seen>
  238. k = j
  239. tag = self.lasttag
  240. else:
  241. match = tagfind.match(rawdata, i+1)
  242. if not match:
  243. self.error('unexpected call to parse_starttag')
  244. k = match.end(0)
  245. tag = rawdata[i+1:k].lower()
  246. self.lasttag = tag
  247. while k < j:
  248. match = attrfind.match(rawdata, k)
  249. if not match: break
  250. attrname, rest, attrvalue = match.group(1, 2, 3)
  251. if not rest:
  252. attrvalue = attrname
  253. else:
  254. if (attrvalue[:1] == "'" == attrvalue[-1:] or
  255. attrvalue[:1] == '"' == attrvalue[-1:]):
  256. # strip quotes
  257. attrvalue = attrvalue[1:-1]
  258. attrvalue = self.entity_or_charref.sub(
  259. self._convert_ref, attrvalue)
  260. attrs.append((attrname.lower(), attrvalue))
  261. k = match.end(0)
  262. if rawdata[j] == '>':
  263. j = j+1
  264. self.__starttag_text = rawdata[start_pos:j]
  265. self.finish_starttag(tag, attrs)
  266. return j
  267. # Internal -- convert entity or character reference
  268. def _convert_ref(self, match):
  269. if match.group(2):
  270. return self.convert_charref(match.group(2)) or \
  271. '&#%s%s' % match.groups()[1:]
  272. elif match.group(3):
  273. return self.convert_entityref(match.group(1)) or \
  274. '&%s;' % match.group(1)
  275. else:
  276. return '&%s' % match.group(1)
  277. # Internal -- parse endtag
  278. def parse_endtag(self, i):
  279. rawdata = self.rawdata
  280. match = endbracket.search(rawdata, i+1)
  281. if not match:
  282. return -1
  283. j = match.start(0)
  284. tag = rawdata[i+2:j].strip().lower()
  285. if rawdata[j] == '>':
  286. j = j+1
  287. self.finish_endtag(tag)
  288. return j
  289. # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
  290. def finish_shorttag(self, tag, data):
  291. self.finish_starttag(tag, [])
  292. self.handle_data(data)
  293. self.finish_endtag(tag)
  294. # Internal -- finish processing of start tag
  295. # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
  296. def finish_starttag(self, tag, attrs):
  297. try:
  298. method = getattr(self, 'start_' + tag)
  299. except AttributeError:
  300. try:
  301. method = getattr(self, 'do_' + tag)
  302. except AttributeError:
  303. self.unknown_starttag(tag, attrs)
  304. return -1
  305. else:
  306. self.handle_starttag(tag, method, attrs)
  307. return 0
  308. else:
  309. self.stack.append(tag)
  310. self.handle_starttag(tag, method, attrs)
  311. return 1
  312. # Internal -- finish processing of end tag
  313. def finish_endtag(self, tag):
  314. if not tag:
  315. found = len(self.stack) - 1
  316. if found < 0:
  317. self.unknown_endtag(tag)
  318. return
  319. else:
  320. if tag not in self.stack:
  321. try:
  322. method = getattr(self, 'end_' + tag)
  323. except AttributeError:
  324. self.unknown_endtag(tag)
  325. else:
  326. self.report_unbalanced(tag)
  327. return
  328. found = len(self.stack)
  329. for i in range(found):
  330. if self.stack[i] == tag: found = i
  331. while len(self.stack) > found:
  332. tag = self.stack[-1]
  333. try:
  334. method = getattr(self, 'end_' + tag)
  335. except AttributeError:
  336. method = None
  337. if method:
  338. self.handle_endtag(tag, method)
  339. else:
  340. self.unknown_endtag(tag)
  341. del self.stack[-1]
  342. # Overridable -- handle start tag
  343. def handle_starttag(self, tag, method, attrs):
  344. method(attrs)
  345. # Overridable -- handle end tag
  346. def handle_endtag(self, tag, method):
  347. method()
  348. # Example -- report an unbalanced </...> tag.
  349. def report_unbalanced(self, tag):
  350. if self.verbose:
  351. print '*** Unbalanced </' + tag + '>'
  352. print '*** Stack:', self.stack
  353. def convert_charref(self, name):
  354. """Convert character reference, may be overridden."""
  355. try:
  356. n = int(name)
  357. except ValueError:
  358. return
  359. if not 0 <= n <= 127:
  360. return
  361. return self.convert_codepoint(n)
  362. def convert_codepoint(self, codepoint):
  363. return chr(codepoint)
  364. def handle_charref(self, name):
  365. """Handle character reference, no need to override."""
  366. replacement = self.convert_charref(name)
  367. if replacement is None:
  368. self.unknown_charref(name)
  369. else:
  370. self.handle_data(replacement)
  371. # Definition of entities -- derived classes may override
  372. entitydefs = \
  373. {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
  374. def convert_entityref(self, name):
  375. """Convert entity references.
  376. As an alternative to overriding this method; one can tailor the
  377. results by setting up the self.entitydefs mapping appropriately.
  378. """
  379. table = self.entitydefs
  380. if name in table:
  381. return table[name]
  382. else:
  383. return
  384. def handle_entityref(self, name):
  385. """Handle entity references, no need to override."""
  386. replacement = self.convert_entityref(name)
  387. if replacement is None:
  388. self.unknown_entityref(name)
  389. else:
  390. self.handle_data(replacement)
  391. # Example -- handle data, should be overridden
  392. def handle_data(self, data):
  393. pass
  394. # Example -- handle comment, could be overridden
  395. def handle_comment(self, data):
  396. pass
  397. # Example -- handle declaration, could be overridden
  398. def handle_decl(self, decl):
  399. pass
  400. # Example -- handle processing instruction, could be overridden
  401. def handle_pi(self, data):
  402. pass
  403. # To be overridden -- handlers for unknown objects
  404. def unknown_starttag(self, tag, attrs): pass
  405. def unknown_endtag(self, tag): pass
  406. def unknown_charref(self, ref): pass
  407. def unknown_entityref(self, ref): pass
  408. class TestSGMLParser(SGMLParser):
  409. def __init__(self, verbose=0):
  410. self.testdata = ""
  411. SGMLParser.__init__(self, verbose)
  412. def handle_data(self, data):
  413. self.testdata = self.testdata + data
  414. if len(repr(self.testdata)) >= 70:
  415. self.flush()
  416. def flush(self):
  417. data = self.testdata
  418. if data:
  419. self.testdata = ""
  420. print 'data:', repr(data)
  421. def handle_comment(self, data):
  422. self.flush()
  423. r = repr(data)
  424. if len(r) > 68:
  425. r = r[:32] + '...' + r[-32:]
  426. print 'comment:', r
  427. def unknown_starttag(self, tag, attrs):
  428. self.flush()
  429. if not attrs:
  430. print 'start tag: <' + tag + '>'
  431. else:
  432. print 'start tag: <' + tag,
  433. for name, value in attrs:
  434. print name + '=' + '"' + value + '"',
  435. print '>'
  436. def unknown_endtag(self, tag):
  437. self.flush()
  438. print 'end tag: </' + tag + '>'
  439. def unknown_entityref(self, ref):
  440. self.flush()
  441. print '*** unknown entity ref: &' + ref + ';'
  442. def unknown_charref(self, ref):
  443. self.flush()
  444. print '*** unknown char ref: &#' + ref + ';'
  445. def unknown_decl(self, data):
  446. self.flush()
  447. print '*** unknown decl: [' + data + ']'
  448. def close(self):
  449. SGMLParser.close(self)
  450. self.flush()
  451. def test(args = None):
  452. import sys
  453. if args is None:
  454. args = sys.argv[1:]
  455. if args and args[0] == '-s':
  456. args = args[1:]
  457. klass = SGMLParser
  458. else:
  459. klass = TestSGMLParser
  460. if args:
  461. file = args[0]
  462. else:
  463. file = 'test.html'
  464. if file == '-':
  465. f = sys.stdin
  466. else:
  467. try:
  468. f = open(file, 'r')
  469. except IOError, msg:
  470. print file, ":", msg
  471. sys.exit(1)
  472. data = f.read()
  473. if f is not sys.stdin:
  474. f.close()
  475. x = klass()
  476. for c in data:
  477. x.feed(c)
  478. x.close()
  479. if __name__ == '__main__':
  480. test()