
/other/FetchData/mechanize/_html.py

http://github.com/jbeezley/wrf-fire
  1. """HTML handling.
  2. Copyright 2003-2006 John J. Lee <jjl@pobox.com>
  3. This code is free software; you can redistribute it and/or modify it under
  4. the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
  5. included with the distribution).
  6. """
  7. import re, copy, htmlentitydefs
  8. import sgmllib, ClientForm
  9. import _request
  10. from _headersutil import split_header_words, is_html as _is_html
  11. import _rfc3986
  12. DEFAULT_ENCODING = "latin-1"
  13. COMPRESS_RE = re.compile(r"\s+")
  14. # the base classe is purely for backwards compatibility
  15. class ParseError(ClientForm.ParseError): pass
  16. class CachingGeneratorFunction(object):
  17. """Caching wrapper around a no-arguments iterable."""
  18. def __init__(self, iterable):
  19. self._cache = []
  20. # wrap iterable to make it non-restartable (otherwise, repeated
  21. # __call__ would give incorrect results)
  22. self._iterator = iter(iterable)
  23. def __call__(self):
  24. cache = self._cache
  25. for item in cache:
  26. yield item
  27. for item in self._iterator:
  28. cache.append(item)
  29. yield item
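
# Illustrative sketch (not part of the original module): repeated calls to
# the wrapper replay cached items, so the wrapped iterator is consumed at
# most once even if the caller iterates several times.
def _example_caching_generator():
    def numbers():
        for i in range(3):
            yield i
    cached = CachingGeneratorFunction(numbers())
    assert list(cached()) == [0, 1, 2]
    assert list(cached()) == [0, 1, 2]  # second call is served from the cache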
class EncodingFinder:
    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            for k, v in split_header_words([ct])[0]:
                if k == "charset":
                    return v
        return self._default_encoding
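
# Illustrative sketch (not part of the original module): EncodingFinder only
# needs an object whose .info() exposes getheaders(); a charset parameter in
# any Content-Type header wins, otherwise the default encoding is returned.
# The stub classes below are hypothetical stand-ins for a urllib2 response.
def _example_encoding_finder():
    class StubHeaders:
        def getheaders(self, name):
            return ["text/html; charset=utf-8"]
    class StubResponse:
        def info(self):
            return StubHeaders()
    finder = EncodingFinder(DEFAULT_ENCODING)
    assert finder.encoding(StubResponse()) == "utf-8"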
class ResponseTypeFinder:
    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        # XXX encoding
        return _is_html(ct_hdrs, url, self._allow_xhtml)


# idea for this argument-processing trick is from Peter Otten
class Args:
    def __init__(self, args_map):
        self.dictionary = dict(args_map)

    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)


def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    return Args(locals())


class Link:
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs

    def __cmp__(self, other):
        try:
            for name in "url", "text", "tag", "attrs":
                if getattr(self, name) != getattr(other, name):
                    return -1
        except AttributeError:
            return -1
        return 0

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
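
# Illustrative sketch (not part of the original module): Link resolves the
# relative URL against base_url at construction time, and __cmp__ compares
# only url/text/tag/attrs (base_url does not take part in equality).
# The URLs below are placeholders.
def _example_link():
    link = Link("http://example.com/dir/", "page.html", "a page", "a",
                [("href", "page.html")])
    assert link.absolute_url == "http://example.com/dir/page.html"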
class LinksFactory:

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None

    def set_response(self, response, base_url, encoding):
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)
        try:
            for token in p.tags(*(self.urltags.keys()+["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                name = attrs.get("name")
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                # provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for eg.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError, exc:
            raise ParseError(exc)


class FormsFactory:

    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ClientForm.ParseResponse
    argument docs.
    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        import ClientForm
        self.select_default = select_default
        if form_parser_class is None:
            form_parser_class = ClientForm.FormParser
        self.form_parser_class = form_parser_class
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        import ClientForm
        encoding = self.encoding
        try:
            forms = ClientForm.ParseResponseEx(
                self._response,
                select_default=self.select_default,
                form_parser_class=self.form_parser_class,
                request_class=self.request_class,
                encoding=encoding,
                _urljoin=_rfc3986.urljoin,
                _urlparse=_rfc3986.urlsplit,
                _urlunparse=_rfc3986.urlunsplit,
                )
        except ClientForm.ParseError, exc:
            raise ParseError(exc)
        self.global_form = forms[0]
        return forms[1:]
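
# Illustrative sketch (not part of the original module): FormsFactory is fed
# a response object and an encoding, after which .forms() returns the
# per-FORM objects and .global_form collects controls outside any FORM
# element.  The response argument here is hypothetical; any urllib2-style
# response accepted by ClientForm.ParseResponseEx will do.
def _example_forms_factory(response):
    factory = FormsFactory()
    factory.set_response(response, "utf-8")
    forms = factory.forms()
    return forms, factory.global_form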
class TitleFactory:
    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data,
                             parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                text.append(str(tok))
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def title(self):
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise ParseError(exc)


def unescape(data, entities, encoding):
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)


def unescape_charref(data, encoding):
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    uc = unichr(int(name, base))
    if encoding is None:
        return uc
    else:
        try:
            repl = uc.encode(encoding)
        except UnicodeError:
            repl = "&#%s;" % data
        return repl
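
# Illustrative sketch (not part of the original module): named entities are
# resolved through the supplied name->codepoint table and encoded to the
# target encoding, while numeric character references (decimal or hex) are
# handled by unescape_charref directly.
def _example_unescape():
    assert unescape("Tom &amp; Jerry", htmlentitydefs.name2codepoint,
                    "latin-1") == "Tom & Jerry"
    assert unescape_charref("65", "latin-1") == "A"   # decimal reference
    assert unescape_charref("x41", "latin-1") == "A"  # hex reference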
# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")


class MechanizeBs(_beautifulsoup.BeautifulSoup):
    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        t = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def handle_entityref(self, ref):
        t = unescape("&%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def unescape_attrs(self, attrs):
        escaped_attrs = []
        for key, val in attrs:
            val = unescape(val, self._entitydefs, self._encoding)
            escaped_attrs.append((key, val))
        return escaped_attrs
class RobustLinksFactory:

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        gen = bs.recursiveChildGenerator()
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                yield Link(base_url, url, text, link.name, attrs)
class RobustFormsFactory(FormsFactory):
    def __init__(self, *args, **kwds):
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            args.form_parser_class = RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding


class RobustTitleFactory:
    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        import _beautifulsoup
        title = self._bs.first("title")
        if title == _beautifulsoup.Null:
            return None
        else:
            inner_html = "".join([str(node) for node in title.contents])
            return COMPRESS_RE.sub(" ", inner_html.strip())
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form
    """

    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
         determined (or guessed) from the response.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this right
         without resorting to this default.  The default value of this
         parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.
        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().
        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
        return self._links_genf()
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_response(
                copy.copy(response), response.geturl(), self.encoding)
            self._title_factory.set_response(
                copy.copy(response), self.encoding)


class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.
    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            data = response.read()
            soup = self._soup_class(self.encoding, data)
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_soup(
                soup, response.geturl(), self.encoding)
            self._title_factory.set_soup(soup, self.encoding)
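

# Illustrative sketch (not part of the original module): once a factory has
# been given a response, the lazy attributes and the cached forms()/links()
# iterables drive everything.  DefaultFactory is the sgmllib-based
# implementation; RobustFactory is the BeautifulSoup-based one.  The response
# argument here is hypothetical (any urllib2-style response object).  In
# mechanize proper these factories are typically passed to
# mechanize.Browser(factory=...), but that usage lives outside this module.
def _example_default_factory(response):
    factory = DefaultFactory()
    factory.set_response(response)
    if factory.is_html:                 # lazy attribute via __getattr__
        title = factory.title           # parsed on demand, then cached
        links = list(factory.links())   # cached after the first call
        forms = list(factory.forms())   # also sets factory.global_form
        return title, links, forms
    return None, [], []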