
/Gspider/kongulo/kongulo.py

http://devj.googlecode.com/
Python | 428 lines
Possible License(s): LGPL-2.1, BSD-3-Clause
#!/usr/bin/env python
# Copyright (c) 2005, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following disclaimer
#    in the documentation and/or other materials provided with the
#    distribution.
#  * Neither the name of Google Inc. nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import robotparser
import urllib
import urllib2
import re
import sets
import sys
import urlparse
import win32com.client
import time
import pywintypes
import pythoncom
import optparse
import getpass
import itertools
import email.Utils

'''A simple web crawler that pushes pages into GDS.  Features include:
  - Knows basic and digest HTTP authentication
  - Obeys robots.txt
  - Can loop, recrawling over previously crawled pages every X minutes
  - When recrawling, uses If-Modified-Since HTTP header to minimize transfers

For usage instructions, run with -h flag.

Requires Python 2.4 and the win32all extensions for Python 2.4 on Windows.
Will not work unless Google Desktop Search 1.0 or later is installed.
'''
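
# For illustration only: a typical invocation, based on the command-line
# options defined in Main() below, might look like this (the host name is a
# made-up example):
#
#   python kongulo.py -d 3 -m ".+intranet\.example\.com.+" -l -s 120 \
#       http://intranet.example.com/
#
# This crawls three links deep from the base URL, restricts the crawl to URLs
# matching the -m pattern, and recrawls every 120 minutes, using
# If-Modified-Since so unchanged pages are not re-fetched.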

# Matches URLs in <a href=...> tags.  Chosen above htmllib.HTMLParser because
# this is much more lenient, not requiring HTML to be valid.
_LINK_RE = re.compile(r'<\s*(a|img).+href\s*=\s*"?(.+?)"?(\s|>)',
                      re.MULTILINE | re.IGNORECASE)

# Matches <frame src="bla"> tags.
_FRAME_RE = re.compile(r'<\s*(frame).+src\s*=\s*"?(.+?)"?(\s|>)',
                       re.MULTILINE | re.IGNORECASE)

# Digs out the text of an HTML document's title.
_TITLE_RE = re.compile(r'<\s*title.*?>(.+)</\s*title\s*>',
                       re.MULTILINE | re.IGNORECASE)
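
# A rough illustration of how these patterns are used (group 2 of _LINK_RE and
# _FRAME_RE holds the URL, group 1 of _TITLE_RE holds the title text); the
# host name here is a made-up example:
#
#   >>> _LINK_RE.search('<a class="ext" href="http://example.com/">').group(2)
#   'http://example.com/'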

# This plugin's GUID, used to register with GDS.
_GUID = '{5e1788fe-a6e6-429f-816c-80cb969028d3}'

class NoExceptionHandler(urllib2.BaseHandler):
  '''An exception handler for HTTP that never throws an exception for various
  error codes that Kongulo always checks explicitly rather than catching them
  as exceptions.'''
  def http_error_304(self, req, fp, code, msg, hdrs):
    '''We handle not-modified-since explicitly.'''
    return fp

  # We check error codes explicitly so we don't want an exception
  http_error_400 = http_error_401 = http_error_402 = http_error_403 \
      = http_error_404 = http_error_304

class PasswordDb(urllib2.HTTPPasswordMgr):
  '''A very simple password store.  The user can supply usernames using the
  -p flag on the command line, and will be prompted for the password for
  each username.'''
  def __init__(self):
    self.passwords = []  # [ [substring, uid, pw], [substring, uid, pw] ]

  def Populate(self, options):
    '''Given an options object as used by Kongulo, ask the user for the
    password for each user-id/substring-of-domain that the user provided using
    the -p flag.'''
    if not options.pw:
      return
    for item in options.pw.split(','):
      (uid, substring) = item.split('@')
      pw = getpass.getpass('Enter password for %s: ' % item)
      self.passwords.append([substring, uid, pw])

  def find_user_password(self, *args, **kw):
    for passdata in self.passwords:
      for name in args:
        if name.find(passdata[0]) != -1:
          return (passdata[1], passdata[2])
    print "!!! Need login info for (%s @ %s), consider using -p flag" % args
    return (None, None)

passwords = PasswordDb()

# A URL opener that can do basic and digest authentication, and never raises
# exceptions for HTTP error codes we handle explicitly.
opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(passwords),
                              urllib2.HTTPDigestAuthHandler(passwords),
                              NoExceptionHandler())

# To be a nice Internet citizen, we identify ourselves properly so that
# whoever doesn't like Kongulo can exclude us using robots.txt
opener.addheaders = [('User-agent', 'Kongulo v0.1 personal web crawler')]

# Should always be true on Windows systems.
assert hasattr(opener.handlers[0],
               'proxies'), 'ProxyHandler must be first handler.'
# This parses Windows proxy registry settings
opener.handlers[0].proxies = urllib.getproxies()

class LenientRobotParser(robotparser.RobotFileParser):
  '''Adds ability to parse robot files where same user agent is specified
  multiple times.'''
  def __init__(self, url):
    '''Setup internal state like RobotFileParser does.'''
    robotparser.RobotFileParser.__init__(self)
    f = opener.open(url)
    lines = []
    line = f.readline()
    while line:
      lines.append(line.strip())
      line = f.readline()
    self.errcode = f.code
    if self.errcode == 401 or self.errcode == 403:
      self.disallow_all = 1
    elif self.errcode >= 400:
      self.allow_all = 1
    elif self.errcode == 200 and lines:
      self.parse(lines)

  def parse(self, lines):
    """Strip repeated sequential definitions of same user agent, then
    call base's parse method."""
    last_ua = ''
    modified_lines = []
    for line in lines:
      if line.lower().startswith('user-agent'):
        temp = last_ua
        last_ua = line.lower()
        if last_ua == temp:
          continue  # skip line
      if line.strip() == '':
        last_ua = ''  # reset on blank line
      modified_lines += [line]
    robotparser.RobotFileParser.parse(self, modified_lines)

class UrlValidator:
  '''An object that handles checking if we should fetch and crawl a specific
  URL.  This is based on the type of the URL (only crawl http URLs) and robot
  rules.  Maintains a cache of robot rules already fetched.'''
  def __init__(self, match_url):
    self.robots = {}  # Dict of robot URLs to robot parsers
    self.match_url = re.compile(match_url)

  def IsCrawlable(self, url):
    """Returns true if it's OK to crawl the absolute URL provided."""
    if not url.startswith('http') or not self.match_url.match(url):
      return 0
    return self.GetRules(url).can_fetch('*', url)

  def GetRules(self, url):
    """Returns the robot rules parser for 'url'"""
    robots_dir = urlparse.urljoin(url, "robots.txt")  # First try dir-level
    if robots_dir in self.robots:
      return self.robots[robots_dir]
    robots_site = urlparse.urljoin(url, "/robots.txt")  # Then the site-level
    if robots_site in self.robots:
      return self.robots[robots_site]
    # Inv: Our cache contains neither a dir-level nor site-level robots.txt
    rules = LenientRobotParser(robots_dir)  # First try dir-level
    if hasattr(rules, 'errcode') and rules.errcode == 200:
      self.robots[robots_dir] = rules
    else:
      rules = LenientRobotParser(robots_site)  # Then try site-level
      self.robots[robots_site] = rules
    return rules

class Crawler:
  '''This object holds the state of the crawl, and performs the crawl.'''
  def __init__(self, options):
    self.options = options  # Store the options provided
    self.rules = UrlValidator(options.match)  # Cache of robot rules etc.

    # Invariant of data:
    # - 'tocrawl' is a list of items that we have or will crawl.  If we have
    #   never crawled them since we started, the item at index 2 in each
    #   crawlitem is None, otherwise it is a dictionary of headers,
    #   specifically the 'If-Modified-Since' header, to prevent us from
    #   fetching this item in the next crawl if it hasn't been modified.
    # - 'scheduled' is a list of items we have already added to 'tocrawl'
    #   (perhaps a premature optimization since we could just iterate over
    #   'tocrawl')
    self.scheduled = sets.Set()
    # Format of this list is:
    # [[url1, depth1, { headername : headerval, ... } ], [url2, depth2], {}...]
    self.tocrawl = []

    # Fetch the entrypoint to the Google Desktop Search API.
    self.event_factory = win32com.client.Dispatch(
        'GoogleDesktopSearch.EventFactory')

  def ExtractLinks(self, baseurl, htmldoc):
    """Returns all anchors from the document with contents 'htmldoc' at
    'baseurl' that are OK to crawl."""
    urls = []
    for match in itertools.chain(_LINK_RE.finditer(htmldoc),
                                 _FRAME_RE.finditer(htmldoc)):
      url = urlparse.urljoin(baseurl, match.group(2))
      if self.rules.IsCrawlable(url):
        urls += [url]
      else:
        print " I %s" % url
    return urls
  def Crawl(self, baseurls):
    '''Performs the crawl.

    Args:
      baseurls: [url1, url2, ...]
    '''
    # Bootstrap our invariant of data
    for baseurl in baseurls:
      self.tocrawl.append([baseurl, self.options.depth, None])

    if self.options.loop:
      print "Running in loop mode - press Ctrl-C to stop."

    while True:
      for crawlitem in self.tocrawl:
        (url, depth, headers) = crawlitem
        try:
          if headers:
            doc = opener.open(urllib2.Request(url, headers=headers))
          else:
            doc = opener.open(url)
          doctype = doc.info().type
          if doc.code == 304:  # not modified since last time
            print "--- (nomod) %s" % url
          elif doc.code == 200 and (doctype == 'text/html' or
                                    doctype == 'text/plain'):
            print "::: (%d) %s" % (depth, url)

            # Store last modified in the crawlitem.
            # Prefer Last-Modified header, then Date header (to get same
            # formatting as used by the server), then current date in
            # appropriate format.
            last_modified = None
            if 'last-modified' in doc.headers:
              last_modified = doc.headers['last-modified']
            elif 'date' in doc.headers:
              last_modified = doc.headers['date']
            else:
              last_modified = email.Utils.formatdate(time.time(), usegmt=True)
            crawlitem[2] = { 'If-Modified-Since' : last_modified }

            content = doc.read()

            # Create a GDS event, populate its fields, and send it off to have
            # the web page added to the Google Desktop Search index.
            event = self.event_factory.CreateEvent(_GUID,
                                                   'Google.Desktop.WebPage')
            event.AddProperty('format', doctype)
            event.AddProperty('content', content)
            event.AddProperty('uri', url)
            # TODO Use the last-modified HTTP header instead of current time
            # if available.
            event.AddProperty('last_modified_time',
                              pywintypes.Time(time.time() + time.timezone))

            if doctype == 'text/html':  # no links in text documents
              title_match = _TITLE_RE.search(content)
              if title_match:
                title = title_match.group(1)
                event.AddProperty('title', title)

              for link in self.ExtractLinks(doc.geturl(), content):
                if depth > 0 and not link in self.scheduled:
                  self.scheduled.add(link)
                  self.tocrawl.append([link, depth - 1, None])

            # Don't use historical flag, because if we do, GDS will "throttle"
            # the events we send, not returning until the user becomes idle.
            # We also want to ensure the page is updated in the cache (in case
            # the user already visited it herself using a browser).
            event.Send(0x01)
          else:
            print "!!! (HTTP %d) %s" % (doc.code, url)
          doc.close()
        except IOError:
          print "!!! (nolink) %s" % url
        except ValueError:
          print "!!! (noauth) %s" % url

      if not self.options.loop:
        break
      else:
        print ("=== Completed crawl; will recrawl in %d minutes." %
               (self.options.sleep))
        time.sleep(60 * self.options.sleep)

def Main():
  '''This function contains the logic for the command-line UI for Kongulo.'''
  # Set up options and parse arguments.
  parser = optparse.OptionParser(usage='%prog [options] BASEURL1 BASEURL2 ...')
  parser.add_option('-d', '--depth', type='int', dest='depth', default=0,
                    help='How deep to follow links from BASEURLs (default 0, '
                         'suggest max 5-6)')
  parser.add_option('-m', '--match', dest='match', default='.+',
                    help=r'Regular expression that URLs must match if they are '
                         'to be crawled, e.g. ".+intranet\.smurfgeburf\.com.+" to '
                         'stay within the Smurfgeburf intranet')
  parser.add_option('-l', '--loop', action='store_true', dest='loop',
                    default=False, help='If this flag is given, Kongulo will '
                         'keep fetching the specified page and pages it points to. '
                         'It will not refetch pages that haven\'t changed.')
  parser.add_option('-s', '--sleep', type='int', dest='sleep', default=60,
                    help='Number of minutes to sleep before looping (default '
                         '60).  Only valid if -l is also specified.')
  parser.add_option('-p', '--passwords', dest='pw',
                    help='Comma-delimited list of user IDs at names that will '
                         'be matched as substrings against the domain or "region" '
                         'that a password is needed for, e.g. '
                         '"joi@google.com,admin@192.168.250.1,snafu@slashdot.org". '
                         'You will be prompted for each password.')
  parser.add_option('-u', '--unregister', action='store_true', dest='unreg',
                    help='Run with this flag to unregister the plugin. '
                         'All other options are ignored when you use this flag.')
  (options, args) = parser.parse_args()

  if len(args) < 1 and not options.unreg:
    parser.error('Provide at least one base URL')

  try:
    obj = win32com.client.Dispatch('GoogleDesktopSearch.Register')
  except pythoncom.ole_error:
    print ('ERROR: You need to install Google Desktop Search to be able to '
           'use Kongulo.')
    sys.exit(2)

  if not options.unreg:
    try:
      # Register with GDS.  This is a one-time operation and will return an
      # error if already registered.  We cheat and just catch the error and
      # do nothing.
      # We try two different methods since different versions of GD have
      # different names for the registration method.  We ignore the specific
      # exception that we get if the method is not supported.
      try:
        obj.RegisterComponent(_GUID,
            ['Title', 'Kongulo', 'Description', 'A simple web spider that '
             'lets you keep copies of web sites in your Google Desktop Search '
             'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134'])
      except AttributeError, e:
        if (len(e.args) == 0 or
            e.args[0] != 'GoogleDesktopSearch.Register.RegisterComponent'):
          raise e
        else:
          obj.RegisterIndexingComponent(_GUID,
              ['Title', 'Kongulo', 'Description', 'A simple web spider that '
               'lets you keep copies of web sites in your Google Desktop Search '
               'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134'])
    except pywintypes.com_error, e:
      if len(e.args) > 0 and e.args[0] == -2147352567:
        # This is the error we get if already registered.
        pass
      else:
        raise e
  else:
    # Try both approaches to unregister, too.
    try:
      obj.UnregisterIndexingComponent(_GUID)
    except:
      pass
    try:
      obj.UnregisterComponent(_GUID)
    except:
      pass
    sys.exit(0)

  passwords.Populate(options)
  Crawler(options).Crawl(args)


if __name__ == '__main__':
  Main()