PageRenderTime 58ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/lib-python/2.7/markupbase.py

https://bitbucket.org/shomah4a/pypy2
Python | 396 lines | 365 code | 14 blank | 17 comment | 8 complexity | 50afb00f7c38cd21f46feb5351e3aa5f MD5 | raw file
  1. """Shared support for scanning document type declarations in HTML and XHTML.
  2. This module is used as a foundation for the HTMLParser and sgmllib
  3. modules (indirectly, for htmllib as well). It has no documented
  4. public API and should not be used directly.
  5. """
  6. import re
  7. _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
  8. _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
  9. _commentclose = re.compile(r'--\s*>')
  10. _markedsectionclose = re.compile(r']\s*]\s*>')
  11. # An analysis of the MS-Word extensions is available at
  12. # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
  13. _msmarkedsectionclose = re.compile(r']\s*>')
  14. del re
  15. class ParserBase:
  16. """Parser base class which provides some common support methods used
  17. by the SGML/HTML and XHTML parsers."""
  18. def __init__(self):
  19. if self.__class__ is ParserBase:
  20. raise RuntimeError(
  21. "markupbase.ParserBase must be subclassed")
  22. def error(self, message):
  23. raise NotImplementedError(
  24. "subclasses of ParserBase must override error()")
  25. def reset(self):
  26. self.lineno = 1
  27. self.offset = 0
  28. def getpos(self):
  29. """Return current line number and offset."""
  30. return self.lineno, self.offset
  31. # Internal -- update line number and offset. This should be
  32. # called for each piece of data exactly once, in order -- in other
  33. # words the concatenation of all the input strings to this
  34. # function should be exactly the entire input.
  35. def updatepos(self, i, j):
  36. if i >= j:
  37. return j
  38. rawdata = self.rawdata
  39. nlines = rawdata.count("\n", i, j)
  40. if nlines:
  41. self.lineno = self.lineno + nlines
  42. pos = rawdata.rindex("\n", i, j) # Should not fail
  43. self.offset = j-(pos+1)
  44. else:
  45. self.offset = self.offset + j-i
  46. return j
  47. _decl_otherchars = ''
  48. # Internal -- parse declaration (for use by subclasses).
  49. def parse_declaration(self, i):
  50. # This is some sort of declaration; in "HTML as
  51. # deployed," this should only be the document type
  52. # declaration ("<!DOCTYPE html...>").
  53. # ISO 8879:1986, however, has more complex
  54. # declaration syntax for elements in <!...>, including:
  55. # --comment--
  56. # [marked section]
  57. # name in the following list: ENTITY, DOCTYPE, ELEMENT,
  58. # ATTLIST, NOTATION, SHORTREF, USEMAP,
  59. # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
  60. rawdata = self.rawdata
  61. j = i + 2
  62. assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
  63. if rawdata[j:j+1] == ">":
  64. # the empty comment <!>
  65. return j + 1
  66. if rawdata[j:j+1] in ("-", ""):
  67. # Start of comment followed by buffer boundary,
  68. # or just a buffer boundary.
  69. return -1
  70. # A simple, practical version could look like: ((name|stringlit) S*) + '>'
  71. n = len(rawdata)
  72. if rawdata[j:j+2] == '--': #comment
  73. # Locate --.*-- as the body of the comment
  74. return self.parse_comment(i)
  75. elif rawdata[j] == '[': #marked section
  76. # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
  77. # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
  78. # Note that this is extended by Microsoft Office "Save as Web" function
  79. # to include [if...] and [endif].
  80. return self.parse_marked_section(i)
  81. else: #all other declaration elements
  82. decltype, j = self._scan_name(j, i)
  83. if j < 0:
  84. return j
  85. if decltype == "doctype":
  86. self._decl_otherchars = ''
  87. while j < n:
  88. c = rawdata[j]
  89. if c == ">":
  90. # end of declaration syntax
  91. data = rawdata[i+2:j]
  92. if decltype == "doctype":
  93. self.handle_decl(data)
  94. else:
  95. # According to the HTML5 specs sections "8.2.4.44 Bogus
  96. # comment state" and "8.2.4.45 Markup declaration open
  97. # state", a comment token should be emitted.
  98. # Calling unknown_decl provides more flexibility though.
  99. self.unknown_decl(data)
  100. return j + 1
  101. if c in "\"'":
  102. m = _declstringlit_match(rawdata, j)
  103. if not m:
  104. return -1 # incomplete
  105. j = m.end()
  106. elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
  107. name, j = self._scan_name(j, i)
  108. elif c in self._decl_otherchars:
  109. j = j + 1
  110. elif c == "[":
  111. # this could be handled in a separate doctype parser
  112. if decltype == "doctype":
  113. j = self._parse_doctype_subset(j + 1, i)
  114. elif decltype in ("attlist", "linktype", "link", "element"):
  115. # must tolerate []'d groups in a content model in an element declaration
  116. # also in data attribute specifications of attlist declaration
  117. # also link type declaration subsets in linktype declarations
  118. # also link attribute specification lists in link declarations
  119. self.error("unsupported '[' char in %s declaration" % decltype)
  120. else:
  121. self.error("unexpected '[' char in declaration")
  122. else:
  123. self.error(
  124. "unexpected %r char in declaration" % rawdata[j])
  125. if j < 0:
  126. return j
  127. return -1 # incomplete
  128. # Internal -- parse a marked section
  129. # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
  130. def parse_marked_section(self, i, report=1):
  131. rawdata= self.rawdata
  132. assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
  133. sectName, j = self._scan_name( i+3, i )
  134. if j < 0:
  135. return j
  136. if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
  137. # look for standard ]]> ending
  138. match= _markedsectionclose.search(rawdata, i+3)
  139. elif sectName in ("if", "else", "endif"):
  140. # look for MS Office ]> ending
  141. match= _msmarkedsectionclose.search(rawdata, i+3)
  142. else:
  143. self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
  144. if not match:
  145. return -1
  146. if report:
  147. j = match.start(0)
  148. self.unknown_decl(rawdata[i+3: j])
  149. return match.end(0)
  150. # Internal -- parse comment, return length or -1 if not terminated
  151. def parse_comment(self, i, report=1):
  152. rawdata = self.rawdata
  153. if rawdata[i:i+4] != '<!--':
  154. self.error('unexpected call to parse_comment()')
  155. match = _commentclose.search(rawdata, i+4)
  156. if not match:
  157. return -1
  158. if report:
  159. j = match.start(0)
  160. self.handle_comment(rawdata[i+4: j])
  161. return match.end(0)
  162. # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
  163. # returning the index just past any whitespace following the trailing ']'.
  164. def _parse_doctype_subset(self, i, declstartpos):
  165. rawdata = self.rawdata
  166. n = len(rawdata)
  167. j = i
  168. while j < n:
  169. c = rawdata[j]
  170. if c == "<":
  171. s = rawdata[j:j+2]
  172. if s == "<":
  173. # end of buffer; incomplete
  174. return -1
  175. if s != "<!":
  176. self.updatepos(declstartpos, j + 1)
  177. self.error("unexpected char in internal subset (in %r)" % s)
  178. if (j + 2) == n:
  179. # end of buffer; incomplete
  180. return -1
  181. if (j + 4) > n:
  182. # end of buffer; incomplete
  183. return -1
  184. if rawdata[j:j+4] == "<!--":
  185. j = self.parse_comment(j, report=0)
  186. if j < 0:
  187. return j
  188. continue
  189. name, j = self._scan_name(j + 2, declstartpos)
  190. if j == -1:
  191. return -1
  192. if name not in ("attlist", "element", "entity", "notation"):
  193. self.updatepos(declstartpos, j + 2)
  194. self.error(
  195. "unknown declaration %r in internal subset" % name)
  196. # handle the individual names
  197. meth = getattr(self, "_parse_doctype_" + name)
  198. j = meth(j, declstartpos)
  199. if j < 0:
  200. return j
  201. elif c == "%":
  202. # parameter entity reference
  203. if (j + 1) == n:
  204. # end of buffer; incomplete
  205. return -1
  206. s, j = self._scan_name(j + 1, declstartpos)
  207. if j < 0:
  208. return j
  209. if rawdata[j] == ";":
  210. j = j + 1
  211. elif c == "]":
  212. j = j + 1
  213. while j < n and rawdata[j].isspace():
  214. j = j + 1
  215. if j < n:
  216. if rawdata[j] == ">":
  217. return j
  218. self.updatepos(declstartpos, j)
  219. self.error("unexpected char after internal subset")
  220. else:
  221. return -1
  222. elif c.isspace():
  223. j = j + 1
  224. else:
  225. self.updatepos(declstartpos, j)
  226. self.error("unexpected char %r in internal subset" % c)
  227. # end of buffer reached
  228. return -1
  229. # Internal -- scan past <!ELEMENT declarations
  230. def _parse_doctype_element(self, i, declstartpos):
  231. name, j = self._scan_name(i, declstartpos)
  232. if j == -1:
  233. return -1
  234. # style content model; just skip until '>'
  235. rawdata = self.rawdata
  236. if '>' in rawdata[j:]:
  237. return rawdata.find(">", j) + 1
  238. return -1
  239. # Internal -- scan past <!ATTLIST declarations
  240. def _parse_doctype_attlist(self, i, declstartpos):
  241. rawdata = self.rawdata
  242. name, j = self._scan_name(i, declstartpos)
  243. c = rawdata[j:j+1]
  244. if c == "":
  245. return -1
  246. if c == ">":
  247. return j + 1
  248. while 1:
  249. # scan a series of attribute descriptions; simplified:
  250. # name type [value] [#constraint]
  251. name, j = self._scan_name(j, declstartpos)
  252. if j < 0:
  253. return j
  254. c = rawdata[j:j+1]
  255. if c == "":
  256. return -1
  257. if c == "(":
  258. # an enumerated type; look for ')'
  259. if ")" in rawdata[j:]:
  260. j = rawdata.find(")", j) + 1
  261. else:
  262. return -1
  263. while rawdata[j:j+1].isspace():
  264. j = j + 1
  265. if not rawdata[j:]:
  266. # end of buffer, incomplete
  267. return -1
  268. else:
  269. name, j = self._scan_name(j, declstartpos)
  270. c = rawdata[j:j+1]
  271. if not c:
  272. return -1
  273. if c in "'\"":
  274. m = _declstringlit_match(rawdata, j)
  275. if m:
  276. j = m.end()
  277. else:
  278. return -1
  279. c = rawdata[j:j+1]
  280. if not c:
  281. return -1
  282. if c == "#":
  283. if rawdata[j:] == "#":
  284. # end of buffer
  285. return -1
  286. name, j = self._scan_name(j + 1, declstartpos)
  287. if j < 0:
  288. return j
  289. c = rawdata[j:j+1]
  290. if not c:
  291. return -1
  292. if c == '>':
  293. # all done
  294. return j + 1
  295. # Internal -- scan past <!NOTATION declarations
  296. def _parse_doctype_notation(self, i, declstartpos):
  297. name, j = self._scan_name(i, declstartpos)
  298. if j < 0:
  299. return j
  300. rawdata = self.rawdata
  301. while 1:
  302. c = rawdata[j:j+1]
  303. if not c:
  304. # end of buffer; incomplete
  305. return -1
  306. if c == '>':
  307. return j + 1
  308. if c in "'\"":
  309. m = _declstringlit_match(rawdata, j)
  310. if not m:
  311. return -1
  312. j = m.end()
  313. else:
  314. name, j = self._scan_name(j, declstartpos)
  315. if j < 0:
  316. return j
  317. # Internal -- scan past <!ENTITY declarations
  318. def _parse_doctype_entity(self, i, declstartpos):
  319. rawdata = self.rawdata
  320. if rawdata[i:i+1] == "%":
  321. j = i + 1
  322. while 1:
  323. c = rawdata[j:j+1]
  324. if not c:
  325. return -1
  326. if c.isspace():
  327. j = j + 1
  328. else:
  329. break
  330. else:
  331. j = i
  332. name, j = self._scan_name(j, declstartpos)
  333. if j < 0:
  334. return j
  335. while 1:
  336. c = self.rawdata[j:j+1]
  337. if not c:
  338. return -1
  339. if c in "'\"":
  340. m = _declstringlit_match(rawdata, j)
  341. if m:
  342. j = m.end()
  343. else:
  344. return -1 # incomplete
  345. elif c == ">":
  346. return j + 1
  347. else:
  348. name, j = self._scan_name(j, declstartpos)
  349. if j < 0:
  350. return j
  351. # Internal -- scan a name token and the new position and the token, or
  352. # return -1 if we've reached the end of the buffer.
  353. def _scan_name(self, i, declstartpos):
  354. rawdata = self.rawdata
  355. n = len(rawdata)
  356. if i == n:
  357. return None, -1
  358. m = _declname_match(rawdata, i)
  359. if m:
  360. s = m.group()
  361. name = s.strip()
  362. if (i + len(s)) == n:
  363. return None, -1 # end of buffer
  364. return name.lower(), m.end()
  365. else:
  366. self.updatepos(declstartpos, i)
  367. self.error("expected name token at %r"
  368. % rawdata[declstartpos:declstartpos+20])
  369. # To be overridden -- handlers for unknown objects
  370. def unknown_decl(self, data):
  371. pass