PageRenderTime 81ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 1ms

/linux/bs4/diagnose.py

https://gitlab.com/Rheinhart/csuchen-Guard
Python | 216 lines | 177 code | 26 blank | 13 comment | 19 complexity | b8bbb88b1dedc6645081ab2a31e61055 MD5 | raw file
  1. """Diagnostic functions, mainly for use when doing tech support."""
  2. __license__ = "MIT"
  3. import cProfile
  4. from StringIO import StringIO
  5. from HTMLParser import HTMLParser
  6. import bs4
  7. from bs4 import BeautifulSoup, __version__
  8. from bs4.builder import builder_registry
  9. import os
  10. import pstats
  11. import random
  12. import tempfile
  13. import time
  14. import traceback
  15. import sys
  16. import cProfile
  17. def diagnose(data):
  18. """Diagnostic suite for isolating common problems."""
  19. print "Diagnostic running on Beautiful Soup %s" % __version__
  20. print "Python version %s" % sys.version
  21. basic_parsers = ["html.parser", "html5lib", "lxml"]
  22. for name in basic_parsers:
  23. for builder in builder_registry.builders:
  24. if name in builder.features:
  25. break
  26. else:
  27. basic_parsers.remove(name)
  28. print (
  29. "I noticed that %s is not installed. Installing it may help." %
  30. name)
  31. if 'lxml' in basic_parsers:
  32. basic_parsers.append(["lxml", "xml"])
  33. try:
  34. from lxml import etree
  35. print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
  36. except ImportError, e:
  37. print (
  38. "lxml is not installed or couldn't be imported.")
  39. if 'html5lib' in basic_parsers:
  40. try:
  41. import html5lib
  42. print "Found html5lib version %s" % html5lib.__version__
  43. except ImportError, e:
  44. print (
  45. "html5lib is not installed or couldn't be imported.")
  46. if hasattr(data, 'read'):
  47. data = data.read()
  48. elif os.path.exists(data):
  49. print '"%s" looks like a filename. Reading data from the file.' % data
  50. data = open(data).read()
  51. elif data.startswith("http:") or data.startswith("https:"):
  52. print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
  53. print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
  54. return
  55. print
  56. for parser in basic_parsers:
  57. print "Trying to parse your markup with %s" % parser
  58. success = False
  59. try:
  60. soup = BeautifulSoup(data, parser)
  61. success = True
  62. except Exception, e:
  63. print "%s could not parse the markup." % parser
  64. traceback.print_exc()
  65. if success:
  66. print "Here's what %s did with the markup:" % parser
  67. print soup.prettify()
  68. print "-" * 80
  69. def lxml_trace(data, html=True, **kwargs):
  70. """Print out the lxml events that occur during parsing.
  71. This lets you see how lxml parses a document when no Beautiful
  72. Soup code is running.
  73. """
  74. from lxml import etree
  75. for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
  76. print("%s, %4s, %s" % (event, element.tag, element.text))
  77. class AnnouncingParser(HTMLParser):
  78. """Announces HTMLParser parse events, without doing anything else."""
  79. def _p(self, s):
  80. print(s)
  81. def handle_starttag(self, name, attrs):
  82. self._p("%s START" % name)
  83. def handle_endtag(self, name):
  84. self._p("%s END" % name)
  85. def handle_data(self, data):
  86. self._p("%s DATA" % data)
  87. def handle_charref(self, name):
  88. self._p("%s CHARREF" % name)
  89. def handle_entityref(self, name):
  90. self._p("%s ENTITYREF" % name)
  91. def handle_comment(self, data):
  92. self._p("%s COMMENT" % data)
  93. def handle_decl(self, data):
  94. self._p("%s DECL" % data)
  95. def unknown_decl(self, data):
  96. self._p("%s UNKNOWN-DECL" % data)
  97. def handle_pi(self, data):
  98. self._p("%s PI" % data)
  99. def htmlparser_trace(data):
  100. """Print out the HTMLParser events that occur during parsing.
  101. This lets you see how HTMLParser parses a document when no
  102. Beautiful Soup code is running.
  103. """
  104. parser = AnnouncingParser()
  105. parser.feed(data)
  106. _vowels = "aeiou"
  107. _consonants = "bcdfghjklmnpqrstvwxyz"
  108. def rword(length=5):
  109. "Generate a random word-like string."
  110. s = ''
  111. for i in range(length):
  112. if i % 2 == 0:
  113. t = _consonants
  114. else:
  115. t = _vowels
  116. s += random.choice(t)
  117. return s
  118. def rsentence(length=4):
  119. "Generate a random sentence-like string."
  120. return " ".join(rword(random.randint(4,9)) for i in range(length))
  121. def rdoc(num_elements=1000):
  122. """Randomly generate an invalid HTML document."""
  123. tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
  124. elements = []
  125. for i in range(num_elements):
  126. choice = random.randint(0,3)
  127. if choice == 0:
  128. # New tag.
  129. tag_name = random.choice(tag_names)
  130. elements.append("<%s>" % tag_name)
  131. elif choice == 1:
  132. elements.append(rsentence(random.randint(1,4)))
  133. elif choice == 2:
  134. # Close a tag.
  135. tag_name = random.choice(tag_names)
  136. elements.append("</%s>" % tag_name)
  137. return "<html>" + "\n".join(elements) + "</html>"
  138. def benchmark_parsers(num_elements=100000):
  139. """Very basic head-to-head performance benchmark."""
  140. print "Comparative parser benchmark on Beautiful Soup %s" % __version__
  141. data = rdoc(num_elements)
  142. print "Generated a large invalid HTML document (%d bytes)." % len(data)
  143. for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
  144. success = False
  145. try:
  146. a = time.time()
  147. soup = BeautifulSoup(data, parser)
  148. b = time.time()
  149. success = True
  150. except Exception, e:
  151. print "%s could not parse the markup." % parser
  152. traceback.print_exc()
  153. if success:
  154. print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
  155. from lxml import etree
  156. a = time.time()
  157. etree.HTML(data)
  158. b = time.time()
  159. print "Raw lxml parsed the markup in %.2fs." % (b-a)
  160. import html5lib
  161. parser = html5lib.HTMLParser()
  162. a = time.time()
  163. parser.parse(data)
  164. b = time.time()
  165. print "Raw html5lib parsed the markup in %.2fs." % (b-a)
  166. def profile(num_elements=100000, parser="lxml"):
  167. filehandle = tempfile.NamedTemporaryFile()
  168. filename = filehandle.name
  169. data = rdoc(num_elements)
  170. vars = dict(bs4=bs4, data=data, parser=parser)
  171. cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
  172. stats = pstats.Stats(filename)
  173. # stats.strip_dirs()
  174. stats.sort_stats("cumulative")
  175. stats.print_stats('_html5lib|bs4', 50)
  176. if __name__ == '__main__':
  177. diagnose(sys.stdin.read())