
/src/sgml/SGMLLexer.py

http://github.com/lkcl/grailbrowser
  1. """A lexer for SGML, using derived classes as parser and DTD.
  2. This module provides a transparent interface allowing the use of
  3. alternate lexical analyzers without modifying higher levels of SGML
  4. or HTML support.
  5. """
  6. __version__ = "$Revision: 1.45 $"
  7. # These constants are not used in this module, but are provided to
  8. # allow other modules to know about the concrete syntax we support.
  9. COM = "--" # comment start or end
  10. CRO = "&#" # character reference open
  11. REFC = ";" # reference close
  12. DSO = "[" # declaration subset open
  13. DSC = "]" # declaration subset close
  14. ERO = "&" # entity reference open
  15. LIT = '"' # literal start or end
  16. LITA = "'" # literal start or end (alternative)
  17. MDO = "<!" # markup declaration open
  18. MDC = ">" # markup declaration close
  19. MSC = "]]" # marked section close
  20. NET = "/" # null end tag
  21. PIO = "<?" # processing instruciton open
  22. PIC = ">" # processing instruction close
  23. STAGO = "<" # start tag open
  24. ETAGO = "</" # end tag open
  25. TAGC = ">" # tag close
  26. VI = "=" # value indicator
  27. whitespace = '\\t\\n\x0b\x0c\\r '
#  XXX There should be a way to distinguish between PCDATA (parsed
#  character data -- the normal case), RCDATA (replaceable character
#  data -- only char and entity references and end tags are special)
#  and CDATA (character data -- only end tags are special).

import re
import string

try:
    class SGMLError(Exception):
        pass
except TypeError:
    class SGMLError:
        pass
#  SGML lexer base class -- find tags and call handler functions.
#  Usage: p = SGMLLexer(); p.feed(data); ...; p.close().
#  The data between tags is passed to the parser by calling
#  self.lex_data() with some data as argument (the data may be split up
#  in arbitrary chunks).  Entity references are passed by calling
#  self.lex_entityref() with the entity reference as argument.
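#
#  A minimal sketch of the intended use (hypothetical subclass and method
#  bodies; a runnable version appears at the bottom of this module):
#
#      class MyParser(SGMLLexer):
#          def lex_data(self, data):
#              print 'data:', data
#          def lex_starttag(self, tag, attrs):
#              print 'start:', tag, attrs
#
#      p = MyParser()
#      p.feed('<p class="x">hello</p>')
#      p.close()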
class SGMLLexerBase:
    #  This is a "dummy" base class which provides documentation on the
    #  lexer API; this can be used by tools which can extract missing
    #  method documentation from base classes.

    def feed(self, input_data):
        """Feed some data to the parser.

        input_data
            Input data to be fed to the scanner.  An empty string
            indicates end-of-input.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        pass

    def close(self):
        """Terminate the input stream.

        If any data remains unparsed or any events have not been
        dispatched, they must be forced to do so by this method before
        returning.
        """
        pass

    def line(self):
        """Return the current line number if known.
        """

    def normalize(self, norm):
        """Control normalization of name tokens.

        norm
            Boolean indicating the new setting of case normalization.

        If `norm' is true, name tokens will be converted to lower
        case before being passed to the `lex_*()' interfaces described
        below.  Otherwise, names will be reported in the case in which
        they are found in the input stream.  Tokens which are affected
        include tag names, attribute names, and named character
        references.  Note that general entity references are not
        affected.

        A boolean indicating the previous value is returned.
        """
        pass

    def reset(self):
        """Attempt to reset the lexical analyzer.
        """
        pass

    def restrict(self, strict):
        """Control recognition of particular constructs.
        """
        pass

    #  The rest of the methods of this class are intended to be overridden
    #  by parser subclasses interested in different events on the input
    #  stream.  They are called by the implementation of the lexer object.
    def lex_data(self, data_string):
        """Process data characters.
        """
        pass

    def lex_starttag(self, tagname, attributes):
        """Process a start tag and attributes.

        tagname
            General identifier of the start tag encountered.

        attributes
            Dictionary of the attribute/value pairs found in the document
            source.

        The general identifier and attribute names are normalized to
        lower case if and only if normalization is enabled; all attribute
        values are strings.  Attribute values coded as string literals
        using either LIT or LITA quoting will have the surrounding
        quotation marks removed.  Attributes with no value specified
        in the document source will have a value of `None' in the
        dictionary passed to this method.
        """
        pass
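    #  For example, assuming normalization is enabled, the input
    #  <A HREF="/index.html" COMPACT> is reported as
    #  lex_starttag('a', {'href': '/index.html', 'compact': None}).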
    def lex_endtag(self, tagname):
        """Process an end tag.

        tagname
            General identifier of the end tag found.
        """
        pass

    def lex_charref(self, ordinal, terminator):
        """Process a numeric character reference.
        """
        pass

    def lex_namedcharref(self, refname, terminator):
        """Process a named character reference.
        """
        pass

    def lex_entityref(self, refname, terminator):
        """Process a general entity reference.
        """
        pass
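    #  For example, '&#38;' produces lex_charref(38, ';') and '&amp;'
    #  produces lex_entityref('amp', ';').  The terminator argument is
    #  the reference-close character actually present in the source, or
    #  an empty string if the reference was terminated implicitly.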
    def lex_pi(self, pi_data):
        """Process a processing instruction.
        """
        pass

    def lex_comment(self, comment_string):
        """Process a comment string.

        If a markup declaration consists entirely of comments, each comment
        is passed to this method in sequence.  The parser has no way of
        knowing whether multiple comments received in sequence are part of
        a single markup declaration or originated in multiple declarations.
        Empty comments ('<!>') are ignored.  Comments embedded in other
        markup declarations are not handled via this method.
        """
        pass

    def lex_declaration(self, declaration_info):
        """Process a markup declaration other than a comment.

        declaration_info
            List of strings.  The first string will be the name of the
            declaration (doctype, etc.), followed by each additional
            name, nametoken, quoted literal, or comment in the
            declaration.

        Literals and comments will include the quotation marks or
        comment delimiters to allow the client to process each
        correctly.  Normalization of names and nametokens will be
        handled as for general identifiers.
        """
        pass
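    #  For example, with normalization enabled the declaration
    #  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN"> arrives as
    #  ['doctype', 'html', 'public', '"-//W3C//DTD HTML 4.0//EN"'].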
    def lex_error(self, error_string):
        """Process an error packet.

        error_string
            String which describes a lexical error in the input stream.

        Values passed to this method may be affected by the current
        scanning mode.  Further callbacks may show symptoms of the
        error described by `error_string'.
        """
        pass

    def lex_limitation(self, limit_string):
        """Process a limitation packet.

        limit_string
            String which describes a lexical limitation in the current
            scanning mode.

        Further callbacks may show symptoms determined by the limitation
        described by `limit_string'.
        """
        pass
class SGMLLexer(SGMLLexerBase):
    entitydefs = {}

    _in_parse = 0
    _finish_parse = 0

    def __init__(self):
        self.reset()

    def strict_p(self):
        return self._strict

    def cleanup(self):
        pass

    rawdata = ''

    def reset(self):
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        self._normfunc = lambda s: s
        self._strict = 0

    def close(self):
        if not self._in_parse:
            self.goahead(1)
            self.cleanup()
        else:
            self._finish_parse = 1

    def line(self):
        return None

    def feed(self, data):
        self.rawdata = self.rawdata + data
        if not self._in_parse:
            self._in_parse = 1
            self.goahead(0)
            self._in_parse = 0
            if self._finish_parse:
                self.cleanup()

    def normalize(self, norm):
        prev = ((self._normfunc is string.lower) and 1) or 0
        self._normfunc = (norm and string.lower) or (lambda s: s)
        return prev

    def restrict(self, constrain):
        prev = not self._strict
        self._strict = not ((constrain and 1) or 0)
        return prev
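    #  Note that "restricted" (non-strict) scanning is the default:
    #  reset() clears self._strict, and strict-only constructs such as
    #  short tags (<tag/data/), processing instructions and full markup
    #  declarations are only recognized after calling restrict(0).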
    def setliteral(self, tag):
        self.literal = 1
        # note: do not shadow the 're' module with the pattern string
        pattern = "%s%s[%s]*%s" % (ETAGO, tag, whitespace, TAGC)
        if self._normfunc is string.lower:
            self._lit_etag_re = re.compile(pattern, re.IGNORECASE)
        else:
            self._lit_etag_re = re.compile(pattern)
    def setnomoretags(self):
        self.nomoretags = 1

    #  Internal -- handle data as far as reasonable.  May leave state
    #  and data to be processed by a subsequent call.  If 'end' is
    #  true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        i = 0
        n = len(self.rawdata)
        while i < n:
            rawdata = self.rawdata      # pick up any appended data
            n = len(rawdata)
            if self.nomoretags:
                self.lex_data(rawdata[i:n])
                i = n
                break
            if self.literal:
                match = self._lit_etag_re.search(rawdata, i)
                if match:
                    # found the end tag which terminates literal mode
                    pos = match.start()
                    self.lex_data(rawdata[i:pos])
                    i = pos + len(match.group(0))
                    self.literal = 0
                    continue
                else:
                    # emit everything up to a possible partial end tag
                    pos = string.rfind(rawdata, "<", i)
                    if pos >= 0:
                        self.lex_data(rawdata[i:pos])
                        i = pos
                    break
            # pick up self._finish_parse as soon as possible:
            end = end or self._finish_parse
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.lex_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.lex_data(rawdata[i])
                        i = i + 1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        self.lex_data(rawdata[i])
                        i = i + 1
                        continue
                    k = self.parse_comment(i, end)
                    if k < 0:
                        break
                    i = i + k
                    continue
                match = processinginstruction.match(rawdata, i)
                if match:
                    # Processing instruction:
                    if self._strict:
                        self.lex_pi(match.group(1))
                        i = match.end()
                    else:
                        self.lex_data(rawdata[i])
                        i = i + 1
                    continue
                match = special.match(rawdata, i)
                if match:
                    k = match.end()
                    if k - i == 3:
                        # empty declaration: '<!>'
                        self.lex_declaration([])
                        i = i + 3
                        continue
                    if self._strict:
                        if rawdata[i+2] in string.letters:
                            k = self.parse_declaration(i)
                            if k > -1:
                                i = i + k
                        else:
                            self.lex_data('<!')
                            i = i + 2
                    else:
                        # Pretend it's data:
                        if self.literal:
                            self.lex_data(rawdata[i])
                            i = i + 1
                        else:
                            i = match.end()
                    continue
            elif rawdata[i] == '&':
                charref = (self._strict and legalcharref) or simplecharref
                match = charref.match(rawdata, i)
                if match:
                    k = match.end()
                    if rawdata[k-1] not in ';\n':
                        k = k - 1
                        terminator = ''
                    else:
                        terminator = rawdata[k-1]
                    name = match.group(1)[:-1]
                    postchar = ''
                    if terminator == '\n' and not self._strict:
                        postchar = '\n'
                        terminator = ''
                    if name[0] in '0123456789':
                        # Character reference:
                        try:
                            self.lex_charref(string.atoi(name), terminator)
                        except ValueError:
                            self.lex_data("&#%s%s" % (name, terminator))
                    else:
                        # Named character reference:
                        self.lex_namedcharref(self._normfunc(name),
                                              terminator)
                    if postchar:
                        self.lex_data(postchar)
                    i = k
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    # General entity reference:
                    k = match.end()
                    if rawdata[k-1] not in ';\n':
                        k = k - 1
                        terminator = ''
                    else:
                        terminator = rawdata[k-1]
                    name = match.group(1)
                    self.lex_entityref(name, terminator)
                    i = k
                    continue
            else:
                raise RuntimeError, 'neither < nor & ??'
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.lex_data(rawdata[i])
                i = i + 1
                continue
            j = match.end()
            if j == n:
                break                   # Really incomplete
            self.lex_data(rawdata[i:j])
            i = j
        # end while
        if (end or self._finish_parse) and i < n:
            self.lex_data(self.rawdata[i:n])
            i = n
        self.rawdata = self.rawdata[i:]
    #  Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, end):
        rawdata = self.rawdata
        if rawdata[i:i+4] != (MDO + COM):
            raise RuntimeError, 'unexpected call to parse_comment'
        if self._strict:
            # stricter parsing; this requires legal SGML:
            pos = i + len(MDO)
            datalength = len(rawdata)
            comments = []
            while (pos < datalength) and rawdata[pos] != MDC:
                matchlength, comment = comment_match(rawdata, pos)
                if matchlength >= 0:
                    pos = pos + matchlength
                    comments.append(comment)
                elif end:
                    self.lex_error("unexpected end of data in comment")
                    comments.append(rawdata[pos+2:])
                    pos = datalength
                elif rawdata[pos] != "-":
                    self.lex_error("illegal character in"
                                   " markup declaration: "
                                   + repr(rawdata[pos]))
                    pos = pos + 1
                else:
                    return -1
            map(self.lex_comment, comments)
            return pos + len(MDC) - i
        # not strict
        match = commentclose.search(rawdata, i+4)
        if not match:
            if end:
                if MDC in rawdata[i:]:
                    j = string.find(rawdata, MDC, i)
                    self.lex_comment(rawdata[i+4: j])
                    return j + len(MDC) - i
                self.lex_comment(rawdata[i+4:])
                return len(rawdata) - i
            return -1
        self.lex_comment(rawdata[i+4: match.start()])
        return match.end() - i
    #  Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        rawdata = self.rawdata
        if self._strict and shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)? ... yes
            # XXX Can data contain < or > (tag characters)? ... > yes,
            #     < not as delimiter-in-context
            # XXX Can there be whitespace before the first /? ... no
            match = shorttag.match(rawdata, i)
            if not match:
                self.lex_data(rawdata[i])
                return i + 1
            k = match.end()
            tag, data = match.group(1, 2)
            tag = self._normfunc(tag)
            self.lex_starttag(tag, {})
            self.lex_data(data)         # should scan for entity refs
            self.lex_endtag(tag)
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        if rawdata[i:i+2] == '<>':
            # Semantics of the empty tag are handled by lex_starttag():
            if self._strict:
                self.lex_starttag('', {})
            else:
                self.lex_data('<>')
            return i + 2
        match = tagfind.match(rawdata, i+1)     # matches just the GI
        if not match:
            raise RuntimeError, 'unexpected call to parse_starttag'
        k = match.end(0)
        tag = self._normfunc(rawdata[i+1:k])
        # pull recognizable attributes
        attrs = {}
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            # Break out the name[/value] pair:
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                attrvalue = None        # was: = attrname
            elif attrvalue[:1] == LITA == attrvalue[-1:] or \
                 attrvalue[:1] == LIT == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
                if '&' in attrvalue:
                    from SGMLReplacer import replace
                    attrvalue = replace(attrvalue, self.entitydefs)
            attrs[self._normfunc(attrname)] = attrvalue
            k = match.end(0)
        # close the start-tag
        match = tagend.match(rawdata, k)
        if not match:
            # something vile; scan ahead for a usable delimiter
            endchars = self._strict and "<>/" or "<>"
            while 1:
                try:
                    while rawdata[k] in string.whitespace:
                        k = k + 1
                except IndexError:
                    return -1
                if rawdata[k] not in endchars:
                    self.lex_error("bad character in tag")
                    k = k + 1
                else:
                    break
            if not self._strict:
                if rawdata[k] == '<':
                    self.lex_limitation("unclosed start tag not supported")
                elif rawdata[k] == '/':
                    self.lex_limitation("NET-enabling start tags"
                                        " not supported")
        else:
            k = k + len(match.group(0)) - 1
        #
        # Vicious hack to allow XML-style empty tags, like "<hr />".
        # We don't require the space, but apparently it's significant
        # on Netscape Navigator.  Only in non-strict mode.
        #
        c = rawdata[k]
        if c == '/' and not self._strict:
            if rawdata[k:k+2] == "/>":
                # using XML empty-tag hack
                self.lex_starttag(tag, attrs)
                self.lex_endtag(tag)
                return k + 2
            else:
                self.lex_starttag(tag, attrs)
                return k + 1
        if c in '>/':
            k = k + 1
        self.lex_starttag(tag, attrs)
        return k
    #  Internal -- parse endtag
    def parse_endtag(self, i):
        rawdata = self.rawdata
        if rawdata[i+2] in '<>':
            if rawdata[i+2] == '<' and not self._strict:
                self.lex_limitation("unclosed end tags not supported")
                self.lex_data(ETAGO)
                return i + 2
            self.lex_endtag('')
            return i + 2 + (rawdata[i+2] == TAGC)
        match = endtag.match(rawdata, i)
        if not match:
            return -1
        j = match.end(0) - 1
        if rawdata[j] == TAGC:
            j = j + 1
        self.lex_endtag(self._normfunc(match.group(1)))
        return j
    def parse_declaration(self, start):
        # This only gets used in "strict" mode.
        rawdata = self.rawdata
        i = start
        # Markup declaration, possibly illegal:
        strs = []
        i = i + 2
        match = md_name.match(rawdata, i)
        k = match.end() - i
        strs.append(self._normfunc(match.group(1)))
        i = i + k
        end_target = '>'
        while k > 0:
            # Have to check the comment pattern first so we don't get
            # confused and think this is a name that starts with '--':
            if rawdata[i] == '[':
                self.lex_limitation("declaration subset not supported")
                end_target = ']>'
                break
            k, comment = comment_match(rawdata, i)
            if k > 0:
                strs.append(comment)
                i = i + k
                continue
            match = md_string.match(rawdata, i)
            if match:
                k = match.end() - i
                strs.append(match.group(1))
                i = i + k
                continue
            match = md_name.match(rawdata, i)
            if match:
                k = match.end() - i
                s = match.group(1)
                try:
                    strs.append(string.atoi(s))
                except ValueError:
                    strs.append(self._normfunc(s))
                i = i + k
                continue
            break
        k = string.find(rawdata, end_target, i)
        if end_target == ']>':
            if k < 0:
                k = string.find(rawdata, '>', i)
            else:
                k = k + 1
        if k >= 0:
            i = k + 1
        else:
            return -1
        self.lex_declaration(strs)
        return i - start
#  Regular expressions used for parsing:
OPTIONAL_WHITESPACE = "[%s]*" % whitespace

interesting = re.compile('[&<]')
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|'
                        '#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

entityref = re.compile(ERO + '([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]')
simplecharref = re.compile(CRO + '([0-9]+[^0-9])')
legalcharref \
    = re.compile(CRO + '([0-9]+[^0-9]|[a-zA-Z.-]+[^a-zA-Z.-])')
processinginstruction = re.compile('<\\?([^>]*)' + PIC)

starttagopen = re.compile(STAGO + '[>a-zA-Z]')
shorttagopen = re.compile(STAGO + '[a-zA-Z][-.a-zA-Z0-9]*'
                          + OPTIONAL_WHITESPACE + NET)
shorttag = re.compile(STAGO + '([a-zA-Z][-.a-zA-Z0-9]*)'
                      + OPTIONAL_WHITESPACE + NET + '([^/]*)' + NET)
endtagopen = re.compile(ETAGO + '[<>a-zA-Z]')
endbracket = re.compile('[<>]')
endtag = re.compile(ETAGO +
                    '([a-zA-Z][-.a-zA-Z0-9]*)'
                    '([^-.<>a-zA-Z0-9]?[^<>]*)[<>]')
special = re.compile(MDO + '[^>]*' + MDC)
markupdeclaration = re.compile(MDO +
                               '(([-.a-zA-Z0-9]+|'
                               + LIT + '[^"]*' + LIT + '|'
                               + LITA + "[^']*" + LITA + '|'
                               + COM + '([^-]|-[^-])*' + COM
                               + ')' + OPTIONAL_WHITESPACE
                               + ')*' + MDC)
md_name = re.compile('([^>%s\'"]+)' % whitespace
                     + OPTIONAL_WHITESPACE)
md_string = re.compile('("[^"]*"|\'[^\']*\')' + OPTIONAL_WHITESPACE)
commentopen = re.compile(MDO + COM)
commentclose = re.compile(COM + OPTIONAL_WHITESPACE + MDC)
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
    # comma is for compatibility
    ('[%s,]*([_a-zA-Z][-:.a-zA-Z_0-9]*)' % whitespace)
    + '(' + OPTIONAL_WHITESPACE + VI + OPTIONAL_WHITESPACE  # VI
    + '(' + LITA + "[^']*" + LITA
    + '|' + LIT + '[^"]*' + LIT
    + '|[\\-~a-zA-Z0-9,./:+*%?!\\(\\)_#=]*))?')
tagend = re.compile(OPTIONAL_WHITESPACE + '[<>/]')

#  used below in comment_match()
comment_start = re.compile(COM + '([^-]*)-(.|\\n)')
comment_segment = re.compile('([^-]*)-(.|\\n)')
comment_whitespace = re.compile(OPTIONAL_WHITESPACE)

#  Note: the 're' module must remain bound at module level; setliteral()
#  compiles a new end-tag pattern at run time.
def comment_match(rawdata, start):
    """Match a legal SGML comment.

    rawdata
        Data buffer, as a string.

    start
        Starting index into buffer.  This should point to the first
        hyphen of the comment-open delimiter, just past the MDO.

    Analyzes SGML comments using very simple regular expressions to
    ensure that the limits of the regular expression package are not
    exceeded.  Very long comments with embedded hyphens which cross
    buffer boundaries can easily generate problems with less-than-
    ideal RE implementations.

    Returns the number of characters to consume from the input buffer
    (*not* including the first `start' characters!) and the text of
    the comment located.  If no comment was identified, returns -1 and
    an empty string.
    """
    matcher = comment_start.match(rawdata, start)
    if not matcher:
        return -1, ''
    pos = start
    comment = ''
    matchlength = matcher.end() - start
    while matchlength >= 0:
        if matcher.group(2) == "-":
            # "--" closes the comment; skip any trailing whitespace
            ws = comment_whitespace.match(rawdata, pos + matchlength)
            wslength = (ws and (ws.end() - ws.start())) or 0
            pos = pos + matchlength + wslength
            return pos - start, comment + matcher.group(1)
        # only a partial match: an isolated hyphen inside the comment
        comment = "%s%s-%s" % (comment,
                               matcher.group(1), matcher.group(2))
        pos = pos + matchlength
        matcher = comment_segment.match(rawdata, pos)
        if not matcher:
            matchlength = -1
        else:
            matchlength = matcher.end() - pos
    return -1, ''
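
#  A small smoke test for the lexer.  The demonstration subclass and the
#  sample input below are illustrative only (they are not part of the
#  lexer API); run this module as a script to see the event stream the
#  lexer produces.

if __name__ == '__main__':
    class DemoLexer(SGMLLexer):
        # Print each lexical event as it is dispatched:
        def lex_data(self, data):
            print 'data:      %s' % repr(data)
        def lex_starttag(self, tagname, attributes):
            print 'starttag:  %s %s' % (tagname, attributes)
        def lex_endtag(self, tagname):
            print 'endtag:    %s' % tagname
        def lex_entityref(self, refname, terminator):
            print 'entityref: %s' % refname
        def lex_charref(self, ordinal, terminator):
            print 'charref:   %d' % ordinal
        def lex_comment(self, comment_string):
            print 'comment:   %s' % repr(comment_string)

    lexer = DemoLexer()
    lexer.normalize(1)                  # fold names to lower case
    lexer.feed('<HTML><BODY BGCOLOR="#ffffff">\n')
    lexer.feed('<!-- a comment -->A &amp; B &#66;<P>\n')
    lexer.feed('</BODY></HTML>')
    lexer.close()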