/mechanize/_html.py

https://github.com/tmiyamon/mechanize-python

"""HTML handling.

Copyright 2003-2006 John J. Lee <jjl@pobox.com>

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
included with the distribution).

"""

import codecs
import copy
import htmlentitydefs
import re

import _sgmllib_copy as sgmllib

import _beautifulsoup
import _form
from _headersutil import split_header_words, is_html as _is_html
import _request
import _rfc3986

DEFAULT_ENCODING = "latin-1"

COMPRESS_RE = re.compile(r"\s+")


class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable."""

    def __init__(self, iterable):
        self._cache = []
        # wrap iterable to make it non-restartable (otherwise, repeated
        # __call__ would give incorrect results)
        self._iterator = iter(iterable)

    def __call__(self):
        cache = self._cache
        for item in cache:
            yield item
        for item in self._iterator:
            cache.append(item)
            yield item
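

# Illustrative sketch, not part of the original module: repeated calls to a
# CachingGeneratorFunction replay already-cached items before consuming more
# of the wrapped iterator, so a one-shot generator can be iterated over
# several times with consistent results.
def _example_caching_generator_function():
    source = iter(["a", "b", "c"])            # non-restartable iterator
    cached = CachingGeneratorFunction(source)
    first = list(cached())                    # consumes the iterator
    second = list(cached())                   # replayed from the cache
    return first == second                    # True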


class EncodingFinder:
    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for ct in response.info().getheaders("content-type"):
            for k, v in split_header_words([ct])[0]:
                if k == "charset":
                    encoding = v
                    try:
                        codecs.lookup(v)
                    except LookupError:
                        continue
                    else:
                        return encoding
        return self._default_encoding
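

# Illustrative sketch, not part of the original module: split_header_words
# breaks a Content-Type header value into (key, value) pairs, which is how
# EncodingFinder locates a declared charset; the charset is only used if
# codecs.lookup() recognises it, otherwise the default encoding wins.
def _example_encoding_finder():
    # For "text/html; charset=utf-8" the pairs are expected to include
    # ("charset", "utf-8"), so EncodingFinder would return "utf-8".
    pairs = split_header_words(["text/html; charset=utf-8"])[0]
    return dict(pairs).get("charset")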


class ResponseTypeFinder:
    def __init__(self, allow_xhtml):
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        ct_hdrs = response.info().getheaders("content-type")
        url = response.geturl()
        # XXX encoding
        return _is_html(ct_hdrs, url, self._allow_xhtml)


class Args(object):

    # idea for this argument-processing trick is from Peter Otten

    def __init__(self, args_map):
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        if key == "dictionary":
            raise AttributeError()
        self.dictionary[key] = value


def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    return Args(locals())
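

# Illustrative sketch, not part of the original module: form_parser_args()
# captures its keyword arguments via locals() and wraps them in an Args
# object, so callers can read and update them attribute-style and then pass
# the whole set along with **args.dictionary (as RobustFormsFactory does
# below).
def _example_form_parser_args():
    args = form_parser_args(select_default=True)
    assert args.select_default is True   # read through __getattr__
    args.backwards_compat = True         # updated through __setattr__
    return args.dictionary               # plain dict of all four settings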


class Link:
    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url, self.text, self.tag, self.attrs = url, text, tag, attrs

    def __cmp__(self, other):
        try:
            for name in "url", "text", "tag", "attrs":
                if getattr(self, name) != getattr(other, name):
                    return -1
        except AttributeError:
            return -1
        return 0

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
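

# Illustrative sketch, not part of the original module: a Link resolves its
# (possibly relative) URL against the document's base URL at construction
# time, so .absolute_url is immediately usable by navigation code.
def _example_link():
    link = Link("http://example.com/dir/", "page.html", "click here", "a",
                [("href", "page.html")])
    return link.absolute_url   # "http://example.com/dir/page.html"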


class LinksFactory:

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _pullparser
        if link_parser_class is None:
            link_parser_class = _pullparser.TolerantPullParser
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._response = None
        self._encoding = None

    def set_response(self, response, base_url, encoding):
        self._response = response
        self._encoding = encoding
        self._base_url = base_url

    def links(self):
        """Return an iterator that provides links of the document."""
        response = self._response
        encoding = self._encoding
        base_url = self._base_url
        p = self.link_parser_class(response, encoding=encoding)
        try:
            for token in p.tags(*(self.urltags.keys()+["base"])):
                if token.type == "endtag":
                    continue
                if token.data == "base":
                    base_href = dict(token.attrs).get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                attrs = dict(token.attrs)
                tag = token.data
                text = None
                # XXX use attr_encoding for ref'd doc if that doc does not
                #  provide one by other means
                #attr_encoding = attrs.get("charset")
                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
                if not url:
                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
                    # For our purposes a link is something with a URL, so
                    # ignore this.
                    continue

                url = _rfc3986.clean_url(url, encoding)
                if tag == "a":
                    if token.type != "startendtag":
                        # hmm, this'd break if end tag is missing
                        text = p.get_compressed_text(("endtag", tag))
                    # but this doesn't work for e.g.
                    # <a href="blah"><b>Andy</b></a>
                    #text = p.get_compressed_text()

                yield Link(base_url, url, text, tag, token.attrs)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)
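

# Illustrative sketch, not part of the original module: LinksFactory is fed a
# response object plus a base URL and encoding, after which .links() yields
# Link objects lazily.  Here `response` is assumed to implement the same
# interface as objects returned by mechanize.urlopen().
def _example_links_factory(response):
    factory = LinksFactory()
    factory.set_response(response, response.geturl(), "utf-8")
    return [link.absolute_url for link in factory.links()]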


class FormsFactory:

    """Makes a sequence of objects satisfying HTMLForm interface.

    After calling .forms(), the .global_form attribute is a form object
    containing all controls not a descendant of any FORM element.

    For constructor argument docs, see ParseResponse argument docs.

    """

    def __init__(self,
                 select_default=False,
                 form_parser_class=None,
                 request_class=None,
                 backwards_compat=False,
                 ):
        self.select_default = select_default
        if form_parser_class is None:
            form_parser_class = _form.FormParser
        self.form_parser_class = form_parser_class
        if request_class is None:
            request_class = _request.Request
        self.request_class = request_class
        self.backwards_compat = backwards_compat
        self._response = None
        self.encoding = None
        self.global_form = None

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding
        self.global_form = None

    def forms(self):
        encoding = self.encoding
        forms = _form.ParseResponseEx(
            self._response,
            select_default=self.select_default,
            form_parser_class=self.form_parser_class,
            request_class=self.request_class,
            encoding=encoding,
            _urljoin=_rfc3986.urljoin,
            _urlparse=_rfc3986.urlsplit,
            _urlunparse=_rfc3986.urlunsplit,
            )
        self.global_form = forms[0]
        return forms[1:]
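

# Illustrative sketch, not part of the original module: ParseResponseEx
# returns the "global" form (controls outside any FORM element) first,
# followed by the real forms, which is why .forms() stores forms[0] on
# .global_form and returns the rest.  As above, `response` is assumed to
# implement the mechanize response interface.
def _example_forms_factory(response):
    factory = FormsFactory()
    factory.set_response(response, "utf-8")
    forms = factory.forms()                  # HTMLForm-like objects
    return len(forms), factory.global_form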


class TitleFactory:
    def __init__(self):
        self._response = self._encoding = None

    def set_response(self, response, encoding):
        self._response = response
        self._encoding = encoding

    def _get_title_text(self, parser):
        import _pullparser
        text = []
        tok = None
        while 1:
            try:
                tok = parser.get_token()
            except _pullparser.NoMoreTokensError:
                break
            if tok.type == "data":
                text.append(str(tok))
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data,
                             parser._entitydefs, parser.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, parser.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type == "endtag" and tag_name == "title":
                    break
                text.append(str(tok))
        return COMPRESS_RE.sub(" ", "".join(text).strip())

    def title(self):
        import _pullparser
        p = _pullparser.TolerantPullParser(
            self._response, encoding=self._encoding)
        try:
            try:
                p.get_tag("title")
            except _pullparser.NoMoreTokensError:
                return None
            else:
                return self._get_title_text(p)
        except sgmllib.SGMLParseError, exc:
            raise _form.ParseError(exc)


def unescape(data, entities, encoding):
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            repl = ent

        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)


def unescape_charref(data, encoding):
    name, base = data, 10
    if name.startswith("x"):
        name, base = name[1:], 16
    uc = unichr(int(name, base))
    if encoding is None:
        return uc
    else:
        try:
            repl = uc.encode(encoding)
        except UnicodeError:
            repl = "&#%s;" % data
        return repl
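

# Illustrative sketch, not part of the original module: entity and character
# references are decoded via the name2codepoint table and re-encoded into the
# document's encoding; references that cannot be decoded are left untouched.
def _example_unescape():
    text = unescape("Tom &amp; Jerry &#174;",
                    htmlentitydefs.name2codepoint, "latin-1")
    # text == "Tom & Jerry \xae"  (registered-sign byte in latin-1)
    char = unescape_charref("x41", "ascii")
    # char == "A"  (hexadecimal character reference)
    return text, char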


class MechanizeBs(_beautifulsoup.BeautifulSoup):
    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda(x): x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda(x): '<!' + x.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        t = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def handle_entityref(self, ref):
        t = unescape("&%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(t)

    def unescape_attrs(self, attrs):
        escaped_attrs = []
        for key, val in attrs:
            val = unescape(val, self._entitydefs, self._encoding)
            escaped_attrs.append((key, val))
        return escaped_attrs


class RobustLinksFactory:

    compress_re = COMPRESS_RE

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                yield Link(base_url, url, text, link.name, attrs)


class RobustFormsFactory(FormsFactory):
    def __init__(self, *args, **kwds):
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            args.form_parser_class = _form.RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        self._response = response
        self.encoding = encoding


class RobustTitleFactory:
    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        title = self._bs.first("title")
        if title == _beautifulsoup.Null:
            return None
        else:
            inner_html = "".join([str(node) for node in title.contents])
            return COMPRESS_RE.sub(" ", inner_html.strip())


class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
         determined (or guessed) from the response.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this right
         without resorting to this default.  The default value of this
         parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        self.set_response(None)

    def set_request_class(self, request_class):
        """Set request class (mechanize.Request by default).

        HTMLForm instances returned by .forms() will return instances of this
        class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by mechanize.urlopen().

        """
        self._response = response
        self._forms_genf = self._links_genf = None
        self._get_title = None
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                self.set_response(self._response)
                raise
        return self._links_genf()
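

# Illustrative sketch, not part of the original module: the LAZY_ATTRS
# machinery means that reading factory.encoding, .is_html, .title or
# .global_form computes the value on first access (via __getattr__), caches
# it as an instance attribute, and discards it again on set_response().
def _example_factory_lazy_attrs(factory, response):
    factory.set_response(response)
    encoding = factory.encoding        # computed now, then cached
    is_html = factory.is_html          # reuses the cached encoding
    factory.set_response(response)     # clears the cached lazy attributes
    return encoding, is_html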


class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_response(
                copy.copy(response), response.geturl(), self.encoding)
            self._title_factory.set_response(
                copy.copy(response), self.encoding)


class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        if soup_class is None:
            soup_class = MechanizeBs
        self._soup_class = soup_class

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is not None:
            data = response.read()
            soup = self._soup_class(self.encoding, data)
            self._forms_factory.set_response(
                copy.copy(response), self.encoding)
            self._links_factory.set_soup(
                soup, response.geturl(), self.encoding)
            self._title_factory.set_soup(soup, self.encoding)
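

# Illustrative sketch, not part of the original module: in normal use these
# factories are not driven directly; an instance is passed to
# mechanize.Browser, which calls set_response() for each page it fetches and
# exposes the results through Browser.forms(), .links() and .title().
# Assumes the usual top-level mechanize API.
#
#     import mechanize
#     browser = mechanize.Browser(factory=mechanize.RobustFactory())
#     browser.open("http://example.com/")
#     page_title = browser.title()
#     link_urls = [link.absolute_url for link in browser.links()]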