PageRenderTime 54ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/lib-python/2.7/markupbase.py

https://bitbucket.org/evelyn559/pypy
Python | 392 lines | 361 code | 14 blank | 17 comment | 8 complexity | 6baa97b4cca9155666ffa5705197596f MD5 | raw file
  1. """Shared support for scanning document type declarations in HTML and XHTML.
  2. This module is used as a foundation for the HTMLParser and sgmllib
  3. modules (indirectly, for htmllib as well). It has no documented
  4. public API and should not be used directly.
  5. """
  6. import re
  7. _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
  8. _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
  9. _commentclose = re.compile(r'--\s*>')
  10. _markedsectionclose = re.compile(r']\s*]\s*>')
  11. # An analysis of the MS-Word extensions is available at
  12. # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
  13. _msmarkedsectionclose = re.compile(r']\s*>')
  14. del re
  15. class ParserBase:
  16. """Parser base class which provides some common support methods used
  17. by the SGML/HTML and XHTML parsers."""
  18. def __init__(self):
  19. if self.__class__ is ParserBase:
  20. raise RuntimeError(
  21. "markupbase.ParserBase must be subclassed")
  22. def error(self, message):
  23. raise NotImplementedError(
  24. "subclasses of ParserBase must override error()")
  25. def reset(self):
  26. self.lineno = 1
  27. self.offset = 0
  28. def getpos(self):
  29. """Return current line number and offset."""
  30. return self.lineno, self.offset
  31. # Internal -- update line number and offset. This should be
  32. # called for each piece of data exactly once, in order -- in other
  33. # words the concatenation of all the input strings to this
  34. # function should be exactly the entire input.
  35. def updatepos(self, i, j):
  36. if i >= j:
  37. return j
  38. rawdata = self.rawdata
  39. nlines = rawdata.count("\n", i, j)
  40. if nlines:
  41. self.lineno = self.lineno + nlines
  42. pos = rawdata.rindex("\n", i, j) # Should not fail
  43. self.offset = j-(pos+1)
  44. else:
  45. self.offset = self.offset + j-i
  46. return j
  47. _decl_otherchars = ''
  48. # Internal -- parse declaration (for use by subclasses).
  49. def parse_declaration(self, i):
  50. # This is some sort of declaration; in "HTML as
  51. # deployed," this should only be the document type
  52. # declaration ("<!DOCTYPE html...>").
  53. # ISO 8879:1986, however, has more complex
  54. # declaration syntax for elements in <!...>, including:
  55. # --comment--
  56. # [marked section]
  57. # name in the following list: ENTITY, DOCTYPE, ELEMENT,
  58. # ATTLIST, NOTATION, SHORTREF, USEMAP,
  59. # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
  60. rawdata = self.rawdata
  61. j = i + 2
  62. assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
  63. if rawdata[j:j+1] == ">":
  64. # the empty comment <!>
  65. return j + 1
  66. if rawdata[j:j+1] in ("-", ""):
  67. # Start of comment followed by buffer boundary,
  68. # or just a buffer boundary.
  69. return -1
  70. # A simple, practical version could look like: ((name|stringlit) S*) + '>'
  71. n = len(rawdata)
  72. if rawdata[j:j+2] == '--': #comment
  73. # Locate --.*-- as the body of the comment
  74. return self.parse_comment(i)
  75. elif rawdata[j] == '[': #marked section
  76. # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
  77. # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
  78. # Note that this is extended by Microsoft Office "Save as Web" function
  79. # to include [if...] and [endif].
  80. return self.parse_marked_section(i)
  81. else: #all other declaration elements
  82. decltype, j = self._scan_name(j, i)
  83. if j < 0:
  84. return j
  85. if decltype == "doctype":
  86. self._decl_otherchars = ''
  87. while j < n:
  88. c = rawdata[j]
  89. if c == ">":
  90. # end of declaration syntax
  91. data = rawdata[i+2:j]
  92. if decltype == "doctype":
  93. self.handle_decl(data)
  94. else:
  95. self.unknown_decl(data)
  96. return j + 1
  97. if c in "\"'":
  98. m = _declstringlit_match(rawdata, j)
  99. if not m:
  100. return -1 # incomplete
  101. j = m.end()
  102. elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
  103. name, j = self._scan_name(j, i)
  104. elif c in self._decl_otherchars:
  105. j = j + 1
  106. elif c == "[":
  107. # this could be handled in a separate doctype parser
  108. if decltype == "doctype":
  109. j = self._parse_doctype_subset(j + 1, i)
  110. elif decltype in ("attlist", "linktype", "link", "element"):
  111. # must tolerate []'d groups in a content model in an element declaration
  112. # also in data attribute specifications of attlist declaration
  113. # also link type declaration subsets in linktype declarations
  114. # also link attribute specification lists in link declarations
  115. self.error("unsupported '[' char in %s declaration" % decltype)
  116. else:
  117. self.error("unexpected '[' char in declaration")
  118. else:
  119. self.error(
  120. "unexpected %r char in declaration" % rawdata[j])
  121. if j < 0:
  122. return j
  123. return -1 # incomplete
  124. # Internal -- parse a marked section
  125. # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
  126. def parse_marked_section(self, i, report=1):
  127. rawdata= self.rawdata
  128. assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
  129. sectName, j = self._scan_name( i+3, i )
  130. if j < 0:
  131. return j
  132. if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
  133. # look for standard ]]> ending
  134. match= _markedsectionclose.search(rawdata, i+3)
  135. elif sectName in ("if", "else", "endif"):
  136. # look for MS Office ]> ending
  137. match= _msmarkedsectionclose.search(rawdata, i+3)
  138. else:
  139. self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
  140. if not match:
  141. return -1
  142. if report:
  143. j = match.start(0)
  144. self.unknown_decl(rawdata[i+3: j])
  145. return match.end(0)
  146. # Internal -- parse comment, return length or -1 if not terminated
  147. def parse_comment(self, i, report=1):
  148. rawdata = self.rawdata
  149. if rawdata[i:i+4] != '<!--':
  150. self.error('unexpected call to parse_comment()')
  151. match = _commentclose.search(rawdata, i+4)
  152. if not match:
  153. return -1
  154. if report:
  155. j = match.start(0)
  156. self.handle_comment(rawdata[i+4: j])
  157. return match.end(0)
  158. # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
  159. # returning the index just past any whitespace following the trailing ']'.
  160. def _parse_doctype_subset(self, i, declstartpos):
  161. rawdata = self.rawdata
  162. n = len(rawdata)
  163. j = i
  164. while j < n:
  165. c = rawdata[j]
  166. if c == "<":
  167. s = rawdata[j:j+2]
  168. if s == "<":
  169. # end of buffer; incomplete
  170. return -1
  171. if s != "<!":
  172. self.updatepos(declstartpos, j + 1)
  173. self.error("unexpected char in internal subset (in %r)" % s)
  174. if (j + 2) == n:
  175. # end of buffer; incomplete
  176. return -1
  177. if (j + 4) > n:
  178. # end of buffer; incomplete
  179. return -1
  180. if rawdata[j:j+4] == "<!--":
  181. j = self.parse_comment(j, report=0)
  182. if j < 0:
  183. return j
  184. continue
  185. name, j = self._scan_name(j + 2, declstartpos)
  186. if j == -1:
  187. return -1
  188. if name not in ("attlist", "element", "entity", "notation"):
  189. self.updatepos(declstartpos, j + 2)
  190. self.error(
  191. "unknown declaration %r in internal subset" % name)
  192. # handle the individual names
  193. meth = getattr(self, "_parse_doctype_" + name)
  194. j = meth(j, declstartpos)
  195. if j < 0:
  196. return j
  197. elif c == "%":
  198. # parameter entity reference
  199. if (j + 1) == n:
  200. # end of buffer; incomplete
  201. return -1
  202. s, j = self._scan_name(j + 1, declstartpos)
  203. if j < 0:
  204. return j
  205. if rawdata[j] == ";":
  206. j = j + 1
  207. elif c == "]":
  208. j = j + 1
  209. while j < n and rawdata[j].isspace():
  210. j = j + 1
  211. if j < n:
  212. if rawdata[j] == ">":
  213. return j
  214. self.updatepos(declstartpos, j)
  215. self.error("unexpected char after internal subset")
  216. else:
  217. return -1
  218. elif c.isspace():
  219. j = j + 1
  220. else:
  221. self.updatepos(declstartpos, j)
  222. self.error("unexpected char %r in internal subset" % c)
  223. # end of buffer reached
  224. return -1
  225. # Internal -- scan past <!ELEMENT declarations
  226. def _parse_doctype_element(self, i, declstartpos):
  227. name, j = self._scan_name(i, declstartpos)
  228. if j == -1:
  229. return -1
  230. # style content model; just skip until '>'
  231. rawdata = self.rawdata
  232. if '>' in rawdata[j:]:
  233. return rawdata.find(">", j) + 1
  234. return -1
  235. # Internal -- scan past <!ATTLIST declarations
  236. def _parse_doctype_attlist(self, i, declstartpos):
  237. rawdata = self.rawdata
  238. name, j = self._scan_name(i, declstartpos)
  239. c = rawdata[j:j+1]
  240. if c == "":
  241. return -1
  242. if c == ">":
  243. return j + 1
  244. while 1:
  245. # scan a series of attribute descriptions; simplified:
  246. # name type [value] [#constraint]
  247. name, j = self._scan_name(j, declstartpos)
  248. if j < 0:
  249. return j
  250. c = rawdata[j:j+1]
  251. if c == "":
  252. return -1
  253. if c == "(":
  254. # an enumerated type; look for ')'
  255. if ")" in rawdata[j:]:
  256. j = rawdata.find(")", j) + 1
  257. else:
  258. return -1
  259. while rawdata[j:j+1].isspace():
  260. j = j + 1
  261. if not rawdata[j:]:
  262. # end of buffer, incomplete
  263. return -1
  264. else:
  265. name, j = self._scan_name(j, declstartpos)
  266. c = rawdata[j:j+1]
  267. if not c:
  268. return -1
  269. if c in "'\"":
  270. m = _declstringlit_match(rawdata, j)
  271. if m:
  272. j = m.end()
  273. else:
  274. return -1
  275. c = rawdata[j:j+1]
  276. if not c:
  277. return -1
  278. if c == "#":
  279. if rawdata[j:] == "#":
  280. # end of buffer
  281. return -1
  282. name, j = self._scan_name(j + 1, declstartpos)
  283. if j < 0:
  284. return j
  285. c = rawdata[j:j+1]
  286. if not c:
  287. return -1
  288. if c == '>':
  289. # all done
  290. return j + 1
  291. # Internal -- scan past <!NOTATION declarations
  292. def _parse_doctype_notation(self, i, declstartpos):
  293. name, j = self._scan_name(i, declstartpos)
  294. if j < 0:
  295. return j
  296. rawdata = self.rawdata
  297. while 1:
  298. c = rawdata[j:j+1]
  299. if not c:
  300. # end of buffer; incomplete
  301. return -1
  302. if c == '>':
  303. return j + 1
  304. if c in "'\"":
  305. m = _declstringlit_match(rawdata, j)
  306. if not m:
  307. return -1
  308. j = m.end()
  309. else:
  310. name, j = self._scan_name(j, declstartpos)
  311. if j < 0:
  312. return j
  313. # Internal -- scan past <!ENTITY declarations
  314. def _parse_doctype_entity(self, i, declstartpos):
  315. rawdata = self.rawdata
  316. if rawdata[i:i+1] == "%":
  317. j = i + 1
  318. while 1:
  319. c = rawdata[j:j+1]
  320. if not c:
  321. return -1
  322. if c.isspace():
  323. j = j + 1
  324. else:
  325. break
  326. else:
  327. j = i
  328. name, j = self._scan_name(j, declstartpos)
  329. if j < 0:
  330. return j
  331. while 1:
  332. c = self.rawdata[j:j+1]
  333. if not c:
  334. return -1
  335. if c in "'\"":
  336. m = _declstringlit_match(rawdata, j)
  337. if m:
  338. j = m.end()
  339. else:
  340. return -1 # incomplete
  341. elif c == ">":
  342. return j + 1
  343. else:
  344. name, j = self._scan_name(j, declstartpos)
  345. if j < 0:
  346. return j
  347. # Internal -- scan a name token and the new position and the token, or
  348. # return -1 if we've reached the end of the buffer.
  349. def _scan_name(self, i, declstartpos):
  350. rawdata = self.rawdata
  351. n = len(rawdata)
  352. if i == n:
  353. return None, -1
  354. m = _declname_match(rawdata, i)
  355. if m:
  356. s = m.group()
  357. name = s.strip()
  358. if (i + len(s)) == n:
  359. return None, -1 # end of buffer
  360. return name.lower(), m.end()
  361. else:
  362. self.updatepos(declstartpos, i)
  363. self.error("expected name token at %r"
  364. % rawdata[declstartpos:declstartpos+20])
  365. # To be overridden -- handlers for unknown objects
  366. def unknown_decl(self, data):
  367. pass