PageRenderTime 26ms CodeModel.GetById 11ms RepoModel.GetById 1ms app.codeStats 0ms

/phew.py

https://gitlab.com/fnaticshank/crawler
Python | 327 lines | 272 code | 25 blank | 30 comment | 7 complexity | 380b56885c08341d93c51bf4484cbe19 MD5 | raw file
  1. import re
  2. import sys
  3. import time
  4. import math
  5. import urllib2
  6. import urlparse
  7. import optparse
  8. import hashlib
  9. from cgi import escape
  10. from traceback import format_exc
  11. from Queue import Queue, Empty as QueueEmpty
  12. import requests
  13. from bs4 import BeautifulSoup
  14. class Link (object):
  15. def __init__(self, src, dst, link_type):
  16. self.src = src
  17. self.dst = dst
  18. self.link_type = link_type
  19. def __hash__(self):
  20. return hash((self.src, self.dst, self.link_type))
  21. def __eq__(self, other):
  22. return (self.src == other.src and
  23. self.dst == other.dst and
  24. self.link_type == other.link_type)
  25. def __str__(self):
  26. return self.src + " -> " + self.dst
  27. class Crawler(object):
  28. def __init__(self, root, depth_limit, confine=None, exclude=[], locked=True, filter_seen=True):
  29. self.root = root
  30. self.host = urlparse.urlparse(root)[1]
  31. ## Data for filters:
  32. self.depth_limit = depth_limit # Max depth (number of hops from root)
  33. self.locked = locked # Limit search to a single host?
  34. self.confine_prefix=confine # Limit search to this prefix
  35. self.exclude_prefixes=exclude; # URL prefixes NOT to visit
  36. self.urls_seen = set() # Used to avoid putting duplicates in queue
  37. self.urls_remembered = set() # For reporting to user
  38. self.visited_links= set() # Used to avoid re-processing a page
  39. self.links_remembered = set() # For reporting to user
  40. self.num_links = 0 # Links found (and not excluded by filters)
  41. self.num_followed = 0 # Links followed.
  42. # Pre-visit filters: Only visit a URL if it passes these tests
  43. self.pre_visit_filters=[self._prefix_ok,
  44. self._exclude_ok,
  45. self._not_visited,
  46. self._same_host]
  47. # Out-url filters: When examining a visited page, only process
  48. # links where the target matches these filters.
  49. if filter_seen:
  50. self.out_url_filters=[self._prefix_ok,
  51. self._same_host]
  52. else:
  53. self.out_url_filters=[]
  54. def _pre_visit_url_condense(self, url):
  55. """ Reduce (condense) URLs into some canonical form before
  56. visiting. All occurrences of equivalent URLs are treated as
  57. identical.
  58. All this does is strip the \"fragment\" component from URLs,
  59. so that http://foo.com/blah.html\#baz becomes
  60. http://foo.com/blah.html """
  61. base, frag = urlparse.urldefrag(url)
  62. return base
  63. ## URL Filtering functions. These all use information from the
  64. ## state of the Crawler to evaluate whether a given URL should be
  65. ## used in some context. Return value of True indicates that the
  66. ## URL should be used.
  67. def _prefix_ok(self, url):
  68. """Pass if the URL has the correct prefix, or none is specified"""
  69. return (self.confine_prefix is None or
  70. url.startswith(self.confine_prefix))
  71. def _exclude_ok(self, url):
  72. """Pass if the URL does not match any exclude patterns"""
  73. prefixes_ok = [ not url.startswith(p) for p in self.exclude_prefixes]
  74. return all(prefixes_ok)
  75. def _not_visited(self, url):
  76. """Pass if the URL has not already been visited"""
  77. return (url not in self.visited_links)
  78. def _same_host(self, url):
  79. """Pass if the URL is on the same host as the root URL"""
  80. try:
  81. host = urlparse.urlparse(url)[1]
  82. return re.match(".*%s" % self.host, host)
  83. except Exception, e:
  84. print >> sys.stderr, "ERROR: Can't process url '%s' (%s)" % (url, e)
  85. return False
  86. def crawl(self):
  87. """ Main function in the crawling process. Core algorithm is:
  88. q <- starting page
  89. while q not empty:
  90. url <- q.get()
  91. if url is new and suitable:
  92. page <- fetch(url)
  93. q.put(urls found in page)
  94. else:
  95. nothing
  96. new and suitable means that we don't re-visit URLs we've seen
  97. already fetched, and user-supplied criteria like maximum
  98. search depth are checked. """
  99. q = Queue()
  100. q.put((self.root, 0))
  101. while not q.empty():
  102. this_url, depth = q.get()
  103. #Non-URL-specific filter: Discard anything over depth limit
  104. if depth > self.depth_limit:
  105. continue
  106. #Apply URL-based filters.
  107. do_not_follow = [f for f in self.pre_visit_filters if not f(this_url)]
  108. #Special-case depth 0 (starting URL)
  109. if depth == 0 and [] != do_not_follow:
  110. print >> sys.stderr, "Whoops! Starting URL %s rejected by the following filters:", do_not_follow
  111. #If no filters failed (that is, all passed), process URL
  112. if [] == do_not_follow:
  113. try:
  114. self.visited_links.add(this_url)
  115. self.num_followed += 1
  116. page = Fetcher(this_url)
  117. page.fetch()
  118. for link_url in [self._pre_visit_url_condense(l) for l in page.out_links()]:
  119. if link_url not in self.urls_seen:
  120. q.put((link_url, depth+1))
  121. self.urls_seen.add(link_url)
  122. do_not_remember = [f for f in self.out_url_filters if not f(link_url)]
  123. if [] == do_not_remember:
  124. self.num_links += 1
  125. self.urls_remembered.add(link_url)
  126. link = Link(this_url, link_url, "href")
  127. if link not in self.links_remembered:
  128. self.links_remembered.add(link)
  129. except Exception, e:
  130. print >>sys.stderr, "ERROR: Can't process url '%s' (%s)" % (this_url, e)
  131. #print format_exc()
  132. class OpaqueDataException (Exception):
  133. def __init__(self, message, mimetype, url):
  134. Exception.__init__(self, message)
  135. self.mimetype=mimetype
  136. self.url=url
  137. class Fetcher(object):
  138. """The name Fetcher is a slight misnomer: This class retrieves and interprets web pages."""
  139. def __init__(self, url):
  140. self.url = url
  141. self.out_urls = []
  142. def __getitem__(self, x):
  143. return self.out_urls[x]
  144. def out_links(self):
  145. return self.out_urls
  146. #def _addHeaders(self, request):
  147. # request.add_header("User-Agent", AGENT)
  148. def _open(self):
  149. url = self.url
  150. try:
  151. request = urllib2.Request(url)
  152. handle = urllib2.build_opener()
  153. except IOError:
  154. return None
  155. return (request, handle)
  156. def fetch(self):
  157. request, handle = self._open()
  158. #self._addHeaders(request)
  159. if handle:
  160. try:
  161. data=handle.open(request)
  162. mime_type=data.info().gettype()
  163. url=data.geturl();
  164. if mime_type != "text/html":
  165. raise OpaqueDataException("Not interested in files of type %s" % mime_type,
  166. mime_type, url)
  167. content = unicode(data.read(), "utf-8",
  168. errors="replace")
  169. soup = BeautifulSoup(content, "lxml")
  170. tags = soup('a')
  171. except urllib2.HTTPError, error:
  172. if error.code == 404:
  173. print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
  174. else:
  175. print >> sys.stderr, "ERROR: %s" % error
  176. tags = []
  177. except urllib2.URLError, error:
  178. print >> sys.stderr, "ERROR: %s" % error
  179. tags = []
  180. except OpaqueDataException, error:
  181. print >>sys.stderr, "Skipping %s, has type %s" % (error.url, error.mimetype)
  182. tags = []
  183. for tag in tags:
  184. href = tag.get("href")
  185. if href is not None:
  186. url = urlparse.urljoin(self.url, escape(href))
  187. if url not in self:
  188. self.out_urls.append(url)
  189. def getLinks(url):
  190. page = Fetcher(url)
  191. page.fetch()
  192. for i, url in enumerate(page):
  193. print "%d. %s" % (i, url)
  194. def parse_options():
  195. """parse_options() -> opts, args
  196. Parse any command-line options given returning both
  197. the parsed options and arguments.
  198. """
  199. parser = optparse.OptionParser()
  200. parser.add_option("-q", "--quiet",
  201. action="store_true", default=False, dest="quiet",
  202. help="Enable quiet mode")
  203. parser.add_option("-l", "--links",
  204. action="store_true", default=False, dest="links",
  205. help="Get links for specified url only")
  206. parser.add_option("-d", "--depth",
  207. action="store", type="int", default=30, dest="depth_limit",
  208. help="Maximum depth to traverse")
  209. parser.add_option("-c", "--confine",
  210. action="store", type="string", dest="confine",
  211. help="Confine crawl to specified prefix")
  212. parser.add_option("-x", "--exclude", action="append", type="string",
  213. dest="exclude", default=[], help="Exclude URLs by prefix")
  214. parser.add_option("-L", "--show-links", action="store_true", default=False,
  215. dest="out_links", help="Output links found")
  216. parser.add_option("-u", "--show-urls", action="store_true", default=False,
  217. dest="out_urls", help="Output URLs found")
  218. opts, args = parser.parse_args()
  219. if len(args) < 1:
  220. parser.print_help(sys.stderr)
  221. raise SystemExit, 1
  222. if opts.out_links and opts.out_urls:
  223. parser.print_help(sys.stderr)
  224. parser.error("options -L and -u are mutually exclusive")
  225. return opts, args
  226. def main():
  227. opts, args = parse_options()
  228. url = args[0]
  229. if opts.links:
  230. getLinks(url)
  231. raise SystemExit, 0
  232. depth_limit = opts.depth_limit
  233. confine_prefix=opts.confine
  234. exclude=opts.exclude
  235. sTime = time.time()
  236. print >> sys.stderr, "Crawling %s (Max Depth: %d)" % (url, depth_limit)
  237. crawler = Crawler(url, depth_limit, confine_prefix, exclude)
  238. crawler.crawl()
  239. if opts.out_urls:
  240. print "\n".join(crawler.urls_seen)
  241. if opts.out_links:
  242. print "\n".join([str(l) for l in crawler.links_remembered])
  243. eTime = time.time()
  244. tTime = eTime - sTime
  245. print >> sys.stderr, "Found: %d" % crawler.num_links
  246. print >> sys.stderr, "Followed: %d" % crawler.num_followed
  247. print >> sys.stderr, "Stats: (%d/s after %0.2fs)" % (
  248. int(math.ceil(float(crawler.num_links) / tTime)), tTime)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()