/script.module.schism.common/lib/bs4/__init__.py

https://bitbucket.org/gandelf/repository.gandelf · Python · 529 lines · 525 code · 1 blank · 3 comment · 9 complexity · 78640942481476bf63270f714e66a064 MD5 · raw file

  1. """Beautiful Soup
  2. Elixir and Tonic
  3. "The Screen-Scraper's Friend"
  4. http://www.crummy.com/software/BeautifulSoup/
  5. Beautiful Soup uses a pluggable XML or HTML parser to parse a
  6. (possibly invalid) document into a tree representation. Beautiful Soup
  7. provides methods and Pythonic idioms that make it easy to navigate,
  8. search, and modify the parse tree.
  9. Beautiful Soup works with Python 2.7 and up. It works better if lxml
  10. and/or html5lib is installed.
  11. For more than you ever wanted to know about Beautiful Soup, see the
  12. documentation:
  13. http://www.crummy.com/software/BeautifulSoup/bs4/doc/
  14. """
  15. # Use of this source code is governed by a BSD-style license that can be
  16. # found in the LICENSE file.
# Package metadata.
# NOTE(review): the comment above this block calls the license
# "BSD-style" while __license__ below says "MIT" -- confirm which one
# the LICENSE file actually contains.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.5.1"
__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson"
__license__ = "MIT"
# Only the BeautifulSoup class is exported by `from bs4 import *`.
__all__ = ['BeautifulSoup']
  22. import os
  23. import re
  24. import traceback
  25. import warnings
  26. from .builder import builder_registry, ParserRejectedMarkup
  27. from .dammit import UnicodeDammit
  28. from .element import (
  29. CData,
  30. Comment,
  31. DEFAULT_OUTPUT_ENCODING,
  32. Declaration,
  33. Doctype,
  34. NavigableString,
  35. PageElement,
  36. ProcessingInstruction,
  37. ResultSet,
  38. SoupStrainer,
  39. Tag,
  40. )
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
#
# The statement below compares two string literals with `<>`, the
# Python 2 spelling of "not equal". Under Python 2 it is an expression
# statement whose result is discarded. Python 3 has no `<>` operator, so
# there the module fails to compile with a SyntaxError, and the
# traceback shows this line -- which is how the message reaches the user.
# Do not "fix" the operator: the syntax error is the point.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    # Name given to the soup object itself when it is set up as the
    # root tag in reset(); _feed() and _popToTag() compare against it
    # to know when the bottom of the tag stack has been reached.
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Space, line feed, tab, form feed and carriage return. endData()
    # collapses a string made up only of these characters.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    # Template for the warning issued by __init__ when it had to pick a
    # tree builder itself. Expects the keys markup_type, parser,
    # line_number and filename.
    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
  68. def __init__(self, markup="", features=None, builder=None,
  69. parse_only=None, from_encoding=None, exclude_encodings=None,
  70. **kwargs):
  71. """The Soup object is initialized as the 'root tag', and the
  72. provided markup (which can be a string or a file-like object)
  73. is fed into the underlying parser."""
  74. if 'convertEntities' in kwargs:
  75. warnings.warn(
  76. "BS4 does not respect the convertEntities argument to the "
  77. "BeautifulSoup constructor. Entities are always converted "
  78. "to Unicode characters.")
  79. if 'markupMassage' in kwargs:
  80. del kwargs['markupMassage']
  81. warnings.warn(
  82. "BS4 does not respect the markupMassage argument to the "
  83. "BeautifulSoup constructor. The tree builder is responsible "
  84. "for any necessary markup massage.")
  85. if 'smartQuotesTo' in kwargs:
  86. del kwargs['smartQuotesTo']
  87. warnings.warn(
  88. "BS4 does not respect the smartQuotesTo argument to the "
  89. "BeautifulSoup constructor. Smart quotes are always converted "
  90. "to Unicode characters.")
  91. if 'selfClosingTags' in kwargs:
  92. del kwargs['selfClosingTags']
  93. warnings.warn(
  94. "BS4 does not respect the selfClosingTags argument to the "
  95. "BeautifulSoup constructor. The tree builder is responsible "
  96. "for understanding self-closing tags.")
  97. if 'isHTML' in kwargs:
  98. del kwargs['isHTML']
  99. warnings.warn(
  100. "BS4 does not respect the isHTML argument to the "
  101. "BeautifulSoup constructor. Suggest you use "
  102. "features='lxml' for HTML and features='lxml-xml' for "
  103. "XML.")
  104. def deprecated_argument(old_name, new_name):
  105. if old_name in kwargs:
  106. warnings.warn(
  107. 'The "%s" argument to the BeautifulSoup constructor '
  108. 'has been renamed to "%s."' % (old_name, new_name))
  109. value = kwargs[old_name]
  110. del kwargs[old_name]
  111. return value
  112. return None
  113. parse_only = parse_only or deprecated_argument(
  114. "parseOnlyThese", "parse_only")
  115. from_encoding = from_encoding or deprecated_argument(
  116. "fromEncoding", "from_encoding")
  117. if from_encoding and isinstance(markup, unicode):
  118. warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
  119. from_encoding = None
  120. if len(kwargs) > 0:
  121. arg = kwargs.keys().pop()
  122. raise TypeError(
  123. "__init__() got an unexpected keyword argument '%s'" % arg)
  124. if builder is None:
  125. original_features = features
  126. if isinstance(features, basestring):
  127. features = [features]
  128. if features is None or len(features) == 0:
  129. features = self.DEFAULT_BUILDER_FEATURES
  130. builder_class = builder_registry.lookup(*features)
  131. if builder_class is None:
  132. raise FeatureNotFound(
  133. "Couldn't find a tree builder with the features you "
  134. "requested: %s. Do you need to install a parser library?"
  135. % ",".join(features))
  136. builder = builder_class()
  137. if not (original_features == builder.NAME or
  138. original_features in builder.ALTERNATE_NAMES):
  139. if builder.is_xml:
  140. markup_type = "XML"
  141. else:
  142. markup_type = "HTML"
  143. caller = traceback.extract_stack()[0]
  144. filename = caller[0]
  145. line_number = caller[1]
  146. warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
  147. filename=filename,
  148. line_number=line_number,
  149. parser=builder.NAME,
  150. markup_type=markup_type))
  151. self.builder = builder
  152. self.is_xml = builder.is_xml
  153. self.known_xml = self.is_xml
  154. self.builder.soup = self
  155. self.parse_only = parse_only
  156. if hasattr(markup, 'read'): # It's a file-type object.
  157. markup = markup.read()
  158. elif len(markup) <= 256 and (
  159. (isinstance(markup, bytes) and not b'<' in markup)
  160. or (isinstance(markup, unicode) and not u'<' in markup)
  161. ):
  162. # Print out warnings for a couple beginner problems
  163. # involving passing non-markup to Beautiful Soup.
  164. # Beautiful Soup will still parse the input as markup,
  165. # just in case that's what the user really wants.
  166. if (isinstance(markup, unicode)
  167. and not os.path.supports_unicode_filenames):
  168. possible_filename = markup.encode("utf8")
  169. else:
  170. possible_filename = markup
  171. is_file = False
  172. try:
  173. is_file = os.path.exists(possible_filename)
  174. except Exception, e:
  175. # This is almost certainly a problem involving
  176. # characters not valid in filenames on this
  177. # system. Just let it go.
  178. pass
  179. if is_file:
  180. if isinstance(markup, unicode):
  181. markup = markup.encode("utf8")
  182. warnings.warn(
  183. '"%s" looks like a filename, not markup. You should'
  184. 'probably open this file and pass the filehandle into'
  185. 'Beautiful Soup.' % markup)
  186. self._check_markup_is_url(markup)
  187. for (self.markup, self.original_encoding, self.declared_html_encoding,
  188. self.contains_replacement_characters) in (
  189. self.builder.prepare_markup(
  190. markup, from_encoding, exclude_encodings=exclude_encodings)):
  191. self.reset()
  192. try:
  193. self._feed()
  194. break
  195. except ParserRejectedMarkup:
  196. pass
  197. # Clear out the markup and remove the builder's circular
  198. # reference to this object.
  199. self.markup = None
  200. self.builder.soup = None
  201. def __copy__(self):
  202. copy = type(self)(
  203. self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
  204. )
  205. # Although we encoded the tree to UTF-8, that may not have
  206. # been the encoding of the original markup. Set the copy's
  207. # .original_encoding to reflect the original object's
  208. # .original_encoding.
  209. copy.original_encoding = self.original_encoding
  210. return copy
  211. def __getstate__(self):
  212. # Frequently a tree builder can't be pickled.
  213. d = dict(self.__dict__)
  214. if 'builder' in d and not self.builder.picklable:
  215. d['builder'] = None
  216. return d
  217. @staticmethod
  218. def _check_markup_is_url(markup):
  219. """
  220. Check if markup looks like it's actually a url and raise a warning
  221. if so. Markup can be unicode or str (py2) / bytes (py3).
  222. """
  223. if isinstance(markup, bytes):
  224. space = b' '
  225. cant_start_with = (b"http:", b"https:")
  226. elif isinstance(markup, unicode):
  227. space = u' '
  228. cant_start_with = (u"http:", u"https:")
  229. else:
  230. return
  231. if any(markup.startswith(prefix) for prefix in cant_start_with):
  232. if not space in markup:
  233. if isinstance(markup, bytes):
  234. decoded_markup = markup.decode('utf-8', 'replace')
  235. else:
  236. decoded_markup = markup
  237. warnings.warn(
  238. '"%s" looks like a URL. Beautiful Soup is not an'
  239. ' HTTP client. You should probably use an HTTP client like'
  240. ' requests to get the document behind the URL, and feed'
  241. ' that document to Beautiful Soup.' % decoded_markup
  242. )
  243. def _feed(self):
  244. # Convert the document to Unicode.
  245. self.builder.reset()
  246. self.builder.feed(self.markup)
  247. # Close out any unfinished strings and close all the open tags.
  248. self.endData()
  249. while self.currentTag.name != self.ROOT_TAG_NAME:
  250. self.popTag()
    def reset(self):
        """Return the soup to an empty state, ready for a new parse.

        Called by __init__ before each attempt to feed markup to the
        builder.
        """
        # Re-run Tag's constructor on this object, naming it
        # ROOT_TAG_NAME, so it serves as the root tag of the tree.
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        # NOTE(review): presumably keeps the synthetic root tag out of
        # rendered output -- confirm against Tag.
        self.hidden = 1
        self.builder.reset()
        # Text fragments collected by handle_data() until endData().
        self.current_data = []
        # Must be None before pushTag(self) below: pushTag() appends the
        # pushed tag to the contents of a non-empty currentTag, and the
        # root must not end up inside itself.
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # The soup object is always the bottom entry of the tag stack.
        self.pushTag(self)
  260. def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
  261. """Create a new tag associated with this soup."""
  262. return Tag(None, self.builder, name, namespace, nsprefix, attrs)
  263. def new_string(self, s, subclass=NavigableString):
  264. """Create a new NavigableString associated with this soup."""
  265. return subclass(s)
  266. def insert_before(self, successor):
  267. raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
  268. def insert_after(self, successor):
  269. raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
  270. def popTag(self):
  271. tag = self.tagStack.pop()
  272. if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
  273. self.preserve_whitespace_tag_stack.pop()
  274. #print "Pop", tag.name
  275. if self.tagStack:
  276. self.currentTag = self.tagStack[-1]
  277. return self.currentTag
  278. def pushTag(self, tag):
  279. #print "Push", tag.name
  280. if self.currentTag:
  281. self.currentTag.contents.append(tag)
  282. self.tagStack.append(tag)
  283. self.currentTag = self.tagStack[-1]
  284. if tag.name in self.builder.preserve_whitespace_tags:
  285. self.preserve_whitespace_tag_stack.append(tag)
  286. def endData(self, containerClass=NavigableString):
  287. if self.current_data:
  288. current_data = u''.join(self.current_data)
  289. # If whitespace is not preserved, and this string contains
  290. # nothing but ASCII spaces, replace it with a single space
  291. # or newline.
  292. if not self.preserve_whitespace_tag_stack:
  293. strippable = True
  294. for i in current_data:
  295. if i not in self.ASCII_SPACES:
  296. strippable = False
  297. break
  298. if strippable:
  299. if '\n' in current_data:
  300. current_data = '\n'
  301. else:
  302. current_data = ' '
  303. # Reset the data collector.
  304. self.current_data = []
  305. # Should we add this string to the tree at all?
  306. if self.parse_only and len(self.tagStack) <= 1 and \
  307. (not self.parse_only.text or \
  308. not self.parse_only.search(current_data)):
  309. return
  310. o = containerClass(current_data)
  311. self.object_was_parsed(o)
    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The element to add.
        :param parent: The tag to append `o` to; defaults to the
            current tag.
        :param most_recent_element: The element to treat as coming
            immediately before `o`; defaults to the most recently
            parsed element.
        :raises ValueError: If `o` cannot be found in its parent's
            contents while repairing links after a late insertion.
        """
        parent = parent or self.currentTag
        previous_element = most_recent_element or self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry links of its own; start from those.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if not previous_element:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.

            # Search backwards for o's position in its parent. The
            # while/else raises only if the loop ends without a break.
            index = len(parent.contents)-1
            while index >= 0:
                if parent.contents[index] is o:
                    break
                index -= 1
            else:
                raise ValueError(
                    "Error building tree: supposedly %r was inserted "
                    "into %r after the fact, but I don't see it!" % (
                        o, parent
                    )
                )
            # First child: preceded in document order by the parent
            # itself and has no previous sibling.
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            # Last child: followed in document order by the parent's
            # next sibling and has no next sibling of its own.
            if index == len(parent.contents)-1:
                next_element = parent.next_sibling
                next_sibling = None
            else:
                next_element = next_sibling = parent.contents[index+1]

            # Re-link o and its neighbours in both directions.
            o.previous_element = previous_element
            if previous_element:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling:
                previous_sibling.next_sibling = o
  363. def _popToTag(self, name, nsprefix=None, inclusivePop=True):
  364. """Pops the tag stack up to and including the most recent
  365. instance of the given tag. If inclusivePop is false, pops the tag
  366. stack up to but *not* including the most recent instqance of
  367. the given tag."""
  368. #print "Popping to %s" % name
  369. if name == self.ROOT_TAG_NAME:
  370. # The BeautifulSoup object itself can never be popped.
  371. return
  372. most_recently_popped = None
  373. stack_size = len(self.tagStack)
  374. for i in range(stack_size - 1, 0, -1):
  375. t = self.tagStack[i]
  376. if (name == t.name and nsprefix == t.prefix):
  377. if inclusivePop:
  378. most_recently_popped = self.popTag()
  379. break
  380. most_recently_popped = self.popTag()
  381. return most_recently_popped
  382. def handle_starttag(self, name, namespace, nsprefix, attrs):
  383. """Push a start tag on to the stack.
  384. If this method returns None, the tag was rejected by the
  385. SoupStrainer. You should proceed as if the tag had not occurred
  386. in the document. For instance, if this was a self-closing tag,
  387. don't call handle_endtag.
  388. """
  389. # print "Start tag %s: %s" % (name, attrs)
  390. self.endData()
  391. if (self.parse_only and len(self.tagStack) <= 1
  392. and (self.parse_only.text
  393. or not self.parse_only.search_tag(name, attrs))):
  394. return None
  395. tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
  396. self.currentTag, self._most_recent_element)
  397. if tag is None:
  398. return tag
  399. if self._most_recent_element:
  400. self._most_recent_element.next_element = tag
  401. self._most_recent_element = tag
  402. self.pushTag(tag)
  403. return tag
  404. def handle_endtag(self, name, nsprefix=None):
  405. #print "End tag: " + name
  406. self.endData()
  407. self._popToTag(name, nsprefix)
  408. def handle_data(self, data):
  409. self.current_data.append(data)
  410. def decode(self, pretty_print=False,
  411. eventual_encoding=DEFAULT_OUTPUT_ENCODING,
  412. formatter="minimal"):
  413. """Returns a string or Unicode representation of this document.
  414. To get Unicode, pass None for encoding."""
  415. if self.is_xml:
  416. # Print the XML declaration
  417. encoding_part = ''
  418. if eventual_encoding != None:
  419. encoding_part = ' encoding="%s"' % eventual_encoding
  420. prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
  421. else:
  422. prefix = u''
  423. if not pretty_print:
  424. indent_level = None
  425. else:
  426. indent_level = 0
  427. return prefix + super(BeautifulSoup, self).decode(
  428. indent_level, eventual_encoding, formatter)
# Aliases to make it easier to type import: 'from bs4 import _soup'
# or 'from bs4 import _s'. Both names refer to the BeautifulSoup class
# itself. Neither is listed in __all__, so they must be imported by name.
_s = BeautifulSoup
_soup = BeautifulSoup
  432. class BeautifulStoneSoup(BeautifulSoup):
  433. """Deprecated interface to an XML parser."""
  434. def __init__(self, *args, **kwargs):
  435. kwargs['features'] = 'xml'
  436. warnings.warn(
  437. 'The BeautifulStoneSoup class is deprecated. Instead of using '
  438. 'it, pass features="xml" into the BeautifulSoup constructor.')
  439. super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
  440. class StopParsing(Exception):
  441. pass
  442. class FeatureNotFound(ValueError):
  443. pass
# By default, act as an HTML pretty-printer: when this module is run as
# a script, parse the document on standard input and print it back out
# prettified.
if __name__ == '__main__':
    import sys
    # No parser is named here, so the constructor picks a builder via
    # DEFAULT_BUILDER_FEATURES; expect its NO_PARSER_SPECIFIED_WARNING.
    soup = BeautifulSoup(sys.stdin)
    # Python 2 print statement: this file is the Python 2 edition and
    # is meant to be converted (e.g. with 2to3) before use on Python 3.
    print soup.prettify()