PageRenderTime 53ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/python/lib/Lib/sgmllib.py

http://github.com/JetBrains/intellij-community
Python | 548 lines | 498 code | 28 blank | 22 comment | 14 complexity | 35e0b2aa9c1d83dee3920034e47f87c1 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0, MPL-2.0-no-copyleft-exception, MIT, EPL-1.0, AGPL-1.0
  1. """A parser for SGML, using the derived class as a static DTD."""
  2. # XXX This only supports those SGML features used by HTML.
  3. # XXX There should be a way to distinguish between PCDATA (parsed
  4. # character data -- the normal case), RCDATA (replaceable character
  5. # data -- only char and entity references and end tags are special)
  6. # and CDATA (character data -- only end tags are special). RCDATA is
  7. # not supported at all.
  8. import markupbase
  9. import re
  10. __all__ = ["SGMLParser", "SGMLParseError"]
  11. # Regular expressions used for parsing
  12. interesting = re.compile('[&<]')
  13. incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
  14. '<([a-zA-Z][^<>]*|'
  15. '/([a-zA-Z][^<>]*)?|'
  16. '![^<>]*)?')
  17. entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  18. charref = re.compile('&#([0-9]+)[^0-9]')
  19. starttagopen = re.compile('<[>a-zA-Z]')
  20. shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
  21. shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
  22. piclose = re.compile('>')
  23. endbracket = re.compile('[<>]')
  24. tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
  25. attrfind = re.compile(
  26. r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
  27. r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
  28. class SGMLParseError(RuntimeError):
  29. """Exception raised for all parse errors."""
  30. pass
  31. # SGML parser base class -- find tags and call handler functions.
  32. # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  33. # The dtd is defined by deriving a class which defines methods
  34. # with special names to handle tags: start_foo and end_foo to handle
  35. # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  36. # (Tags are converted to lower case for this purpose.) The data
  37. # between tags is passed to the parser by calling self.handle_data()
  38. # with some data as argument (the data may be split up in arbitrary
  39. # chunks). Entity references are passed by calling
  40. # self.handle_entityref() with the entity reference as argument.
  41. class SGMLParser(markupbase.ParserBase):
  42. # Definition of entities -- derived classes may override
  43. entity_or_charref = re.compile('&(?:'
  44. '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
  45. ')(;?)')
  46. def __init__(self, verbose=0):
  47. """Initialize and reset this instance."""
  48. self.verbose = verbose
  49. self.reset()
  50. def reset(self):
  51. """Reset this instance. Loses all unprocessed data."""
  52. self.__starttag_text = None
  53. self.rawdata = ''
  54. self.stack = []
  55. self.lasttag = '???'
  56. self.nomoretags = 0
  57. self.literal = 0
  58. markupbase.ParserBase.reset(self)
  59. def setnomoretags(self):
  60. """Enter literal mode (CDATA) till EOF.
  61. Intended for derived classes only.
  62. """
  63. self.nomoretags = self.literal = 1
  64. def setliteral(self, *args):
  65. """Enter literal mode (CDATA).
  66. Intended for derived classes only.
  67. """
  68. self.literal = 1
  69. def feed(self, data):
  70. """Feed some data to the parser.
  71. Call this as often as you want, with as little or as much text
  72. as you want (may include '\n'). (This just saves the text,
  73. all the processing is done by goahead().)
  74. """
  75. self.rawdata = self.rawdata + data
  76. self.goahead(0)
  77. def close(self):
  78. """Handle the remaining data."""
  79. self.goahead(1)
  80. def error(self, message):
  81. raise SGMLParseError(message)
  82. # Internal -- handle data as far as reasonable. May leave state
  83. # and data to be processed by a subsequent call. If 'end' is
  84. # true, force handling all data as if followed by EOF marker.
  85. def goahead(self, end):
  86. rawdata = self.rawdata
  87. i = 0
  88. n = len(rawdata)
  89. while i < n:
  90. if self.nomoretags:
  91. self.handle_data(rawdata[i:n])
  92. i = n
  93. break
  94. match = interesting.search(rawdata, i)
  95. if match: j = match.start()
  96. else: j = n
  97. if i < j:
  98. self.handle_data(rawdata[i:j])
  99. i = j
  100. if i == n: break
  101. if rawdata[i] == '<':
  102. if starttagopen.match(rawdata, i):
  103. if self.literal:
  104. self.handle_data(rawdata[i])
  105. i = i+1
  106. continue
  107. k = self.parse_starttag(i)
  108. if k < 0: break
  109. i = k
  110. continue
  111. if rawdata.startswith("</", i):
  112. k = self.parse_endtag(i)
  113. if k < 0: break
  114. i = k
  115. self.literal = 0
  116. continue
  117. if self.literal:
  118. if n > (i + 1):
  119. self.handle_data("<")
  120. i = i+1
  121. else:
  122. # incomplete
  123. break
  124. continue
  125. if rawdata.startswith("<!--", i):
  126. # Strictly speaking, a comment is --.*--
  127. # within a declaration tag <!...>.
  128. # This should be removed,
  129. # and comments handled only in parse_declaration.
  130. k = self.parse_comment(i)
  131. if k < 0: break
  132. i = k
  133. continue
  134. if rawdata.startswith("<?", i):
  135. k = self.parse_pi(i)
  136. if k < 0: break
  137. i = i+k
  138. continue
  139. if rawdata.startswith("<!", i):
  140. # This is some sort of declaration; in "HTML as
  141. # deployed," this should only be the document type
  142. # declaration ("<!DOCTYPE html...>").
  143. k = self.parse_declaration(i)
  144. if k < 0: break
  145. i = k
  146. continue
  147. elif rawdata[i] == '&':
  148. if self.literal:
  149. self.handle_data(rawdata[i])
  150. i = i+1
  151. continue
  152. match = charref.match(rawdata, i)
  153. if match:
  154. name = match.group(1)
  155. self.handle_charref(name)
  156. i = match.end(0)
  157. if rawdata[i-1] != ';': i = i-1
  158. continue
  159. match = entityref.match(rawdata, i)
  160. if match:
  161. name = match.group(1)
  162. self.handle_entityref(name)
  163. i = match.end(0)
  164. if rawdata[i-1] != ';': i = i-1
  165. continue
  166. else:
  167. self.error('neither < nor & ??')
  168. # We get here only if incomplete matches but
  169. # nothing else
  170. match = incomplete.match(rawdata, i)
  171. if not match:
  172. self.handle_data(rawdata[i])
  173. i = i+1
  174. continue
  175. j = match.end(0)
  176. if j == n:
  177. break # Really incomplete
  178. self.handle_data(rawdata[i:j])
  179. i = j
  180. # end while
  181. if end and i < n:
  182. self.handle_data(rawdata[i:n])
  183. i = n
  184. self.rawdata = rawdata[i:]
  185. # XXX if end: check for empty stack
  186. # Extensions for the DOCTYPE scanner:
  187. _decl_otherchars = '='
  188. # Internal -- parse processing instr, return length or -1 if not terminated
  189. def parse_pi(self, i):
  190. rawdata = self.rawdata
  191. if rawdata[i:i+2] != '<?':
  192. self.error('unexpected call to parse_pi()')
  193. match = piclose.search(rawdata, i+2)
  194. if not match:
  195. return -1
  196. j = match.start(0)
  197. self.handle_pi(rawdata[i+2: j])
  198. j = match.end(0)
  199. return j-i
  200. def get_starttag_text(self):
  201. return self.__starttag_text
  202. # Internal -- handle starttag, return length or -1 if not terminated
  203. def parse_starttag(self, i):
  204. self.__starttag_text = None
  205. start_pos = i
  206. rawdata = self.rawdata
  207. if shorttagopen.match(rawdata, i):
  208. # SGML shorthand: <tag/data/ == <tag>data</tag>
  209. # XXX Can data contain &... (entity or char refs)?
  210. # XXX Can data contain < or > (tag characters)?
  211. # XXX Can there be whitespace before the first /?
  212. match = shorttag.match(rawdata, i)
  213. if not match:
  214. return -1
  215. tag, data = match.group(1, 2)
  216. self.__starttag_text = '<%s/' % tag
  217. tag = tag.lower()
  218. k = match.end(0)
  219. self.finish_shorttag(tag, data)
  220. self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
  221. return k
  222. # XXX The following should skip matching quotes (' or ")
  223. # As a shortcut way to exit, this isn't so bad, but shouldn't
  224. # be used to locate the actual end of the start tag since the
  225. # < or > characters may be embedded in an attribute value.
  226. match = endbracket.search(rawdata, i+1)
  227. if not match:
  228. return -1
  229. j = match.start(0)
  230. # Now parse the data between i+1 and j into a tag and attrs
  231. attrs = []
  232. if rawdata[i:i+2] == '<>':
  233. # SGML shorthand: <> == <last open tag seen>
  234. k = j
  235. tag = self.lasttag
  236. else:
  237. match = tagfind.match(rawdata, i+1)
  238. if not match:
  239. self.error('unexpected call to parse_starttag')
  240. k = match.end(0)
  241. tag = rawdata[i+1:k].lower()
  242. self.lasttag = tag
  243. while k < j:
  244. match = attrfind.match(rawdata, k)
  245. if not match: break
  246. attrname, rest, attrvalue = match.group(1, 2, 3)
  247. if not rest:
  248. attrvalue = attrname
  249. else:
  250. if (attrvalue[:1] == "'" == attrvalue[-1:] or
  251. attrvalue[:1] == '"' == attrvalue[-1:]):
  252. # strip quotes
  253. attrvalue = attrvalue[1:-1]
  254. attrvalue = self.entity_or_charref.sub(
  255. self._convert_ref, attrvalue)
  256. attrs.append((attrname.lower(), attrvalue))
  257. k = match.end(0)
  258. if rawdata[j] == '>':
  259. j = j+1
  260. self.__starttag_text = rawdata[start_pos:j]
  261. self.finish_starttag(tag, attrs)
  262. return j
  263. # Internal -- convert entity or character reference
  264. def _convert_ref(self, match):
  265. if match.group(2):
  266. return self.convert_charref(match.group(2)) or \
  267. '&#%s%s' % match.groups()[1:]
  268. elif match.group(3):
  269. return self.convert_entityref(match.group(1)) or \
  270. '&%s;' % match.group(1)
  271. else:
  272. return '&%s' % match.group(1)
  273. # Internal -- parse endtag
  274. def parse_endtag(self, i):
  275. rawdata = self.rawdata
  276. match = endbracket.search(rawdata, i+1)
  277. if not match:
  278. return -1
  279. j = match.start(0)
  280. tag = rawdata[i+2:j].strip().lower()
  281. if rawdata[j] == '>':
  282. j = j+1
  283. self.finish_endtag(tag)
  284. return j
  285. # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
  286. def finish_shorttag(self, tag, data):
  287. self.finish_starttag(tag, [])
  288. self.handle_data(data)
  289. self.finish_endtag(tag)
  290. # Internal -- finish processing of start tag
  291. # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
  292. def finish_starttag(self, tag, attrs):
  293. try:
  294. method = getattr(self, 'start_' + tag)
  295. except AttributeError:
  296. try:
  297. method = getattr(self, 'do_' + tag)
  298. except AttributeError:
  299. self.unknown_starttag(tag, attrs)
  300. return -1
  301. else:
  302. self.handle_starttag(tag, method, attrs)
  303. return 0
  304. else:
  305. self.stack.append(tag)
  306. self.handle_starttag(tag, method, attrs)
  307. return 1
  308. # Internal -- finish processing of end tag
  309. def finish_endtag(self, tag):
  310. if not tag:
  311. found = len(self.stack) - 1
  312. if found < 0:
  313. self.unknown_endtag(tag)
  314. return
  315. else:
  316. if tag not in self.stack:
  317. try:
  318. method = getattr(self, 'end_' + tag)
  319. except AttributeError:
  320. self.unknown_endtag(tag)
  321. else:
  322. self.report_unbalanced(tag)
  323. return
  324. found = len(self.stack)
  325. for i in range(found):
  326. if self.stack[i] == tag: found = i
  327. while len(self.stack) > found:
  328. tag = self.stack[-1]
  329. try:
  330. method = getattr(self, 'end_' + tag)
  331. except AttributeError:
  332. method = None
  333. if method:
  334. self.handle_endtag(tag, method)
  335. else:
  336. self.unknown_endtag(tag)
  337. del self.stack[-1]
  338. # Overridable -- handle start tag
  339. def handle_starttag(self, tag, method, attrs):
  340. method(attrs)
  341. # Overridable -- handle end tag
  342. def handle_endtag(self, tag, method):
  343. method()
  344. # Example -- report an unbalanced </...> tag.
  345. def report_unbalanced(self, tag):
  346. if self.verbose:
  347. print '*** Unbalanced </' + tag + '>'
  348. print '*** Stack:', self.stack
  349. def convert_charref(self, name):
  350. """Convert character reference, may be overridden."""
  351. try:
  352. n = int(name)
  353. except ValueError:
  354. return
  355. if not 0 <= n <= 255:
  356. return
  357. return self.convert_codepoint(n)
  358. def convert_codepoint(self, codepoint):
  359. return chr(codepoint)
  360. def handle_charref(self, name):
  361. """Handle character reference, no need to override."""
  362. replacement = self.convert_charref(name)
  363. if replacement is None:
  364. self.unknown_charref(name)
  365. else:
  366. self.handle_data(replacement)
  367. # Definition of entities -- derived classes may override
  368. entitydefs = \
  369. {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
  370. def convert_entityref(self, name):
  371. """Convert entity references.
  372. As an alternative to overriding this method; one can tailor the
  373. results by setting up the self.entitydefs mapping appropriately.
  374. """
  375. table = self.entitydefs
  376. if name in table:
  377. return table[name]
  378. else:
  379. return
  380. def handle_entityref(self, name):
  381. """Handle entity references, no need to override."""
  382. replacement = self.convert_entityref(name)
  383. if replacement is None:
  384. self.unknown_entityref(name)
  385. else:
  386. self.handle_data(self.convert_entityref(name))
  387. # Example -- handle data, should be overridden
  388. def handle_data(self, data):
  389. pass
  390. # Example -- handle comment, could be overridden
  391. def handle_comment(self, data):
  392. pass
  393. # Example -- handle declaration, could be overridden
  394. def handle_decl(self, decl):
  395. pass
  396. # Example -- handle processing instruction, could be overridden
  397. def handle_pi(self, data):
  398. pass
  399. # To be overridden -- handlers for unknown objects
  400. def unknown_starttag(self, tag, attrs): pass
  401. def unknown_endtag(self, tag): pass
  402. def unknown_charref(self, ref): pass
  403. def unknown_entityref(self, ref): pass
  404. class TestSGMLParser(SGMLParser):
  405. def __init__(self, verbose=0):
  406. self.testdata = ""
  407. SGMLParser.__init__(self, verbose)
  408. def handle_data(self, data):
  409. self.testdata = self.testdata + data
  410. if len(repr(self.testdata)) >= 70:
  411. self.flush()
  412. def flush(self):
  413. data = self.testdata
  414. if data:
  415. self.testdata = ""
  416. print 'data:', repr(data)
  417. def handle_comment(self, data):
  418. self.flush()
  419. r = repr(data)
  420. if len(r) > 68:
  421. r = r[:32] + '...' + r[-32:]
  422. print 'comment:', r
  423. def unknown_starttag(self, tag, attrs):
  424. self.flush()
  425. if not attrs:
  426. print 'start tag: <' + tag + '>'
  427. else:
  428. print 'start tag: <' + tag,
  429. for name, value in attrs:
  430. print name + '=' + '"' + value + '"',
  431. print '>'
  432. def unknown_endtag(self, tag):
  433. self.flush()
  434. print 'end tag: </' + tag + '>'
  435. def unknown_entityref(self, ref):
  436. self.flush()
  437. print '*** unknown entity ref: &' + ref + ';'
  438. def unknown_charref(self, ref):
  439. self.flush()
  440. print '*** unknown char ref: &#' + ref + ';'
  441. def unknown_decl(self, data):
  442. self.flush()
  443. print '*** unknown decl: [' + data + ']'
  444. def close(self):
  445. SGMLParser.close(self)
  446. self.flush()
  447. def test(args = None):
  448. import sys
  449. if args is None:
  450. args = sys.argv[1:]
  451. if args and args[0] == '-s':
  452. args = args[1:]
  453. klass = SGMLParser
  454. else:
  455. klass = TestSGMLParser
  456. if args:
  457. file = args[0]
  458. else:
  459. file = 'test.html'
  460. if file == '-':
  461. f = sys.stdin
  462. else:
  463. try:
  464. f = open(file, 'r')
  465. except IOError, msg:
  466. print file, ":", msg
  467. sys.exit(1)
  468. data = f.read()
  469. if f is not sys.stdin:
  470. f.close()
  471. x = klass()
  472. for c in data:
  473. x.feed(c)
  474. x.close()
  475. if __name__ == '__main__':
  476. test()