/Gspider/kongulo/kongulo.py
- #!/usr/bin/env python
-
- # Copyright (c) 2005, Google Inc.
- # All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are
- # met:
- #
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above
- # copyright notice, this list of conditions and the following disclaimer
- # in the documentation and/or other materials provided with the
- # distribution.
- # * Neither the name of Google Inc. nor the names of its
- # contributors may be used to endorse or promote products derived from
- # this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- import robotparser
- import urllib
- import urllib2
- import re
- import sets
- import sys
- import urlparse
- import win32com.client
- import time
- import pywintypes
- import pythoncom
- import optparse
- import getpass
- import itertools
- import email.Utils
-
- '''A simple web crawler that pushes pages into GDS. Features include:
- - Knows basic and digest HTTP authentication
- - Obeys robots.txt
- - Can loop, recrawling over previously crawled pages every X minutes
- - When recrawling, uses If-Modified-Since HTTP header to minimize transfers
-
- For usage instructions, run with -h flag.
-
- Requires Python 2.4 and the win32all extensions for Python 2.4 on Windows.
- Will not work unless Google Desktop Search 1.0 or later is installed.
- '''
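- 
-  # Example invocation (hypothetical intranet host; run with -h for the full
-  # set of options):
-  #   python kongulo.py -d 2 -l -s 30 -m ".+intranet\.example\.com.+" \
-  #       http://intranet.example.com/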
-
-  # Matches URLs in <a href=...> tags. Chosen over htmllib.HTMLParser because
-  # this regex approach is much more lenient, not requiring the HTML to be valid.
- _LINK_RE = re.compile(r'<\s*(a|img).+href\s*=\s*"?(.+?)"?(\s|>)',
- re.MULTILINE | re.IGNORECASE)
-
-
- # Matches <frame src="bla"> tags.
- _FRAME_RE = re.compile(r'<\s*(frame).+src\s*=\s*"?(.+?)"?(\s|>)',
- re.MULTILINE | re.IGNORECASE)
-
-
- # Digs out the text of an HTML document's title.
- _TITLE_RE = re.compile(r'<\s*title.*?>(.+)</\s*title\s*>',
- re.MULTILINE | re.IGNORECASE)
-
-
- # This plugin's GUID, used to register with GDS.
- _GUID = '{5e1788fe-a6e6-429f-816c-80cb969028d3}'
-
-
- class NoExceptionHandler(urllib2.BaseHandler):
-  '''A urllib2 handler that suppresses exceptions for the HTTP error codes
-  that Kongulo always checks explicitly rather than catching them as
-  exceptions.'''
- def http_error_304(self, req, fp, code, msg, hdrs):
- '''We handle not-modified-since explicitly.'''
- return fp
-
- # We check error codes explicitly so we don't want an exception
- http_error_400 = http_error_401 = http_error_402 = http_error_403 \
- = http_error_404 = http_error_304
-
-
- class PasswordDb(urllib2.HTTPPasswordMgr):
- '''A very simple password store. The user can supply usernames using the
- -p flag on the command line, and will be prompted for the password for
- each username.'''
-
- def __init__(self):
- self.passwords = [] # [ [substring, uid, pw], [substring, uid, pw] ]
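-    # Note: the base HTTPPasswordMgr.__init__ is not called; this is safe
-    # because find_user_password is overridden below and add_password is
-    # never used, so the base class's internal storage is not needed.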
-
- def Populate(self, options):
- '''Given an options object as used by Kongulo, ask the user for the
- password for each user-id/substring-of-domain that the user provided using
- the -p flag.'''
- if not options.pw:
- return
-
- for item in options.pw.split(','):
- (uid, substring) = item.split('@')
- pw = getpass.getpass('Enter password for %s: ' % item)
- self.passwords.append([substring, uid, pw])
-
- def find_user_password(self, *args, **kw):
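-    '''Called by urllib2's basic and digest auth handlers with the realm and
-    the host or URL being authenticated. Returns the first stored (uid, pw)
-    whose substring occurs in either argument, or (None, None) if none match.'''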
- for passdata in self.passwords:
- for name in args:
- if name.find(passdata[0]) != -1:
- return (passdata[1], passdata[2])
- print "!!! Need login info for (%s @ %s), consider using -p flag" % args
- return (None, None)
-
- passwords = PasswordDb()
-
- # A URL opener that can do basic and digest authentication, and never raises
- # exceptions for HTTP error codes we handle explicitly.
- opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(passwords),
- urllib2.HTTPDigestAuthHandler(passwords),
- NoExceptionHandler())
-
- # To be a nice Internet citizen, we identify ourselves properly so that
- # whoever doesn't like Kongulo can exclude us using robots.txt
- opener.addheaders = [('User-agent', 'Kongulo v0.1 personal web crawler')]
-
- # Should always be true on Windows systems.
- assert hasattr(opener.handlers[0],
- 'proxies'), 'ProxyHandler must be first handler.'
- # This parses Windows proxy registry settings
- opener.handlers[0].proxies = urllib.getproxies()
-
- class LenientRobotParser(robotparser.RobotFileParser):
-  '''Adds the ability to parse robots.txt files where the same user agent
-  is specified multiple times.'''
-
- def __init__(self, url):
- '''Setup internal state like RobotFileParser does.'''
- robotparser.RobotFileParser.__init__(self)
- f = opener.open(url)
- lines = []
- line = f.readline()
- while line:
- lines.append(line.strip())
- line = f.readline()
- self.errcode = f.code
- if self.errcode == 401 or self.errcode == 403:
- self.disallow_all = 1
- elif self.errcode >= 400:
- self.allow_all = 1
- elif self.errcode == 200 and lines:
- self.parse(lines)
-
- def parse(self, lines):
- """Strip repeated sequential definitions of same user agent, then
- call base's parse method."""
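-    # For example, ['User-agent: *', 'User-agent: *', 'Disallow: /private'] is
-    # collapsed to ['User-agent: *', 'Disallow: /private'] before being handed
-    # to the standard robotparser.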
- last_ua = ''
- modified_lines = []
- for line in lines:
- if line.lower().startswith('user-agent'):
- temp = last_ua
- last_ua = line.lower()
- if last_ua == temp:
- continue # skip line
- if line.strip() == '':
- last_ua = '' # reset on blank line
- modified_lines += [line]
-
- robotparser.RobotFileParser.parse(self, modified_lines)
-
-
- class UrlValidator:
- '''An object that handles checking if we should fetch and crawl a specific
- URL. This is based on the type of the URL (only crawl http URLs) and robot
- rules. Maintains a cache of robot rules already fetched.'''
-
- def __init__(self, match_url):
- self.robots = {} # Dict of robot URLs to robot parsers
- self.match_url = re.compile(match_url)
-
- def IsCrawlable(self, url):
- """Returns true if it's OK to crawl the absolute URL provided."""
- if not url.startswith('http') or not self.match_url.match(url):
- return 0
- return self.GetRules(url).can_fetch('*', url)
-
- def GetRules(self, url):
- """Returns the robot rules parser for 'url'"""
- robots_dir = urlparse.urljoin(url, "robots.txt") # First try dir-level
- if robots_dir in self.robots:
- return self.robots[robots_dir]
- robots_site = urlparse.urljoin(url, "/robots.txt") # Then the site-level
- if robots_site in self.robots:
- return self.robots[robots_site]
-
-    # Invariant: the cache contains neither a dir-level nor a site-level
-    # robots.txt parser for this URL
-
- rules = LenientRobotParser(robots_dir) # First try dir-level
- if hasattr(rules, 'errcode') and rules.errcode == 200:
- self.robots[robots_dir] = rules
- else:
- rules = LenientRobotParser(robots_site) # Then try site-level
- self.robots[robots_site] = rules
-
- return rules
-
-
- class Crawler:
- '''This object holds the state of the crawl, and performs the crawl.'''
-
- def __init__(self, options):
- self.options = options # Store the options provided
- self.rules = UrlValidator(options.match) # Cache of robot rules etc.
-
- # Invariant of data:
- # - 'tocrawl' is a list of items that we have or will crawl. If we have
- # never crawled them since we started, the item at index 2 in each
- # crawlitem is None, otherwise it is a dictionary of headers,
- # specifically the 'If-Modified-Since' header, to prevent us from fetching
- # this item in the next crawl if it hasn't been modified.
- # - 'scheduled' is a list of items we have already added to 'tocrawl'
- # (perhaps a premature optimization since we could just iterate over
- # 'tocrawl')
- self.scheduled = sets.Set()
- # Format of this list is:
-    # [[url1, depth1, { headername : headerval, ... }], [url2, depth2, { ... }], ...]
- self.tocrawl = []
-
- # Fetch the entrypoint to the Google Desktop Search API.
- self.event_factory = win32com.client.Dispatch(
- 'GoogleDesktopSearch.EventFactory')
-
-
- def ExtractLinks(self, baseurl, htmldoc):
- """Returns all anchors from the document with contents 'htmldoc' at
- 'baseurl' that are OK to crawl."""
- urls = []
- for match in itertools.chain(_LINK_RE.finditer(htmldoc),
- _FRAME_RE.finditer(htmldoc)):
- url = urlparse.urljoin(baseurl, match.group(2))
- if self.rules.IsCrawlable(url):
- urls += [url]
- else:
- print " I %s" % url
- return urls
-
- def Crawl(self, baseurls):
- '''Performs the crawl.
-
- Args:
- baseurls: [url1, url2, ...]
- '''
- # Bootstrap our invariant of data
- for baseurl in baseurls:
- self.tocrawl.append([baseurl, self.options.depth, None])
-
- if self.options.loop:
- print "Running in loop mode - press Ctrl-C to stop."
-
- while True:
- for crawlitem in self.tocrawl:
- (url, depth, headers) = crawlitem
- try:
- if headers:
- doc = opener.open(urllib2.Request(url, headers=headers))
- else:
- doc = opener.open(url)
-
- doctype = doc.info().type
- if doc.code == 304: # not modified since last time
- print "--- (nomod) %s" % url
-          elif (doc.code == 200 and (doctype == 'text/html' or
-                                     doctype == 'text/plain')):
- print "::: (%d) %s" % (depth, url)
-
- # Store last modified in the crawlitem
- # Prefer Last-Modified header, then Date header (to get same
- # formatting as used by the server), then current date in
- # appropriate format.
- last_modified = None
-            if 'last-modified' in doc.headers:
-              last_modified = doc.headers['last-modified']
- elif 'date' in doc.headers:
- last_modified = doc.headers['date']
- else:
- last_modified = email.Utils.formatdate(time.time(), usegmt=True)
- crawlitem[2] = { 'If-Modified-Since' : last_modified }
-
- content = doc.read()
-
- # Create a GDS event, populate its fields, and send it off to have
- # the web page added to the Google Desktop Search index.
- event = self.event_factory.CreateEvent(_GUID,
- 'Google.Desktop.WebPage')
- event.AddProperty('format', doctype)
- event.AddProperty('content', content)
- event.AddProperty('uri', url)
- # TODO Use the last-modified HTTP header instead of current time
- # if available.
- event.AddProperty('last_modified_time',
- pywintypes.Time(time.time() + time.timezone))
-
- if doctype == 'text/html': # no links in text documents
- title_match = _TITLE_RE.search(content)
- if title_match:
- title = title_match.group(1)
- event.AddProperty('title', title)
-
- for link in self.ExtractLinks(doc.geturl(), content):
-                if depth > 0 and link not in self.scheduled:
- self.scheduled.add(link)
- self.tocrawl.append([link, depth - 1, None])
-
- # Don't use historical flag, because if we do, GDS will "throttle"
- # the events we send, not returning until the user becomes idle.
- # We also want to ensure the page is updated in the cache (in case
- # the user already visited it herself using a browser).
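-            # (0x01 is believed to be the GDS "indexable" event flag; the
-            # historical flag is deliberately left unset, per the note above.)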
- event.Send(0x01)
- else:
- print "!!! (HTTP %d) %s" % (doc.code, url)
-
- doc.close()
- except IOError:
- print "!!! (nolink) %s" % url
- except ValueError:
- print "!!! (noauth) %s" % url
-
- if not self.options.loop:
- break
- else:
- print ("=== Completed crawl; will recrawl in %d minutes." %
- (self.options.sleep))
- time.sleep(60 * self.options.sleep)
-
-
- def Main():
- '''This function contains the logic for the command-line UI for Kongulo.'''
-
- # Set up options and parse arguments.
- parser = optparse.OptionParser(usage='%prog [options] BASEURL1 BASEURL2 ...')
- parser.add_option('-d', '--depth', type='int', dest='depth', default=0,
- help='How deep to follow links from BASEURLs (default 0, '
- 'suggest max 5-6)')
- parser.add_option('-m', '--match', dest='match', default='.+',
- help=r'Regular expression that URLs must match if they are '
- 'to be crawled, e.g. ".+intranet\.smurfgeburf\.com.+" to '
- 'stay within the Smurfgeburf intranet')
- parser.add_option('-l', '--loop', action='store_true', dest='loop',
- default=False, help='If this flag is given, Kongulo will '
- 'keep fetching the specified page and pages it points to. '
- 'It will not refetch pages that haven\'t changed.')
- parser.add_option('-s', '--sleep', type='int', dest='sleep', default=60,
- help='Number of minutes to sleep before looping (default '
- '60). Only valid if -l is also specified.')
- parser.add_option('-p', '--passwords', dest='pw',
-                    help='Comma-delimited list of "userid@site" entries; each '
-                    'site is matched as a substring against the domain or "region" '
- 'that a password is needed for, e.g. '
- '"joi@google.com,admin@192.168.250.1,snafu@slashdot.org". '
- 'You will be prompted for each password.')
- parser.add_option('-u', '--unregister', action='store_true', dest='unreg',
- help='Run with this flag to unregister the plugin. '
- 'All other options are ignored when you use this flag.')
- (options, args) = parser.parse_args()
- if len(args) < 1 and not options.unreg:
- parser.error('Provide at least one base URL')
-
- try:
- obj = win32com.client.Dispatch('GoogleDesktopSearch.Register')
- except pythoncom.ole_error:
- print ('ERROR: You need to install Google Desktop Search to be able to '
- 'use Kongulo.')
- sys.exit(2)
-
- if not options.unreg:
- try:
- # Register with GDS. This is a one-time operation and will return an
- # error if already registered. We cheat and just catch the error and
- # do nothing.
-
- # We try two different methods since different versions of GD have
- # different names for the registration method. We ignore the specific
- # exception that we get if the method is not supported.
- try:
- obj.RegisterComponent(_GUID,
- ['Title', 'Kongulo', 'Description', 'A simple web spider that '
- 'lets you keep copies of web sites in your Google Desktop Search '
- 'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134'])
- except AttributeError, e:
- if (len(e.args) == 0 or
- e.args[0] != 'GoogleDesktopSearch.Register.RegisterComponent'):
- raise e
- else:
- obj.RegisterIndexingComponent(_GUID,
- ['Title', 'Kongulo', 'Description', 'A simple web spider that '
- 'lets you keep copies of web sites in your Google Desktop Search '
- 'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134'])
- except pywintypes.com_error, e:
- if len(e.args) > 0 and e.args[0] == -2147352567:
-        # DISP_E_EXCEPTION: the error we get if the plugin is already registered.
- pass
- else:
- raise e
- else:
- # Try both approaches to unregister, too.
- try:
- obj.UnregisterIndexingComponent(_GUID)
- except:
- pass
- try:
- obj.UnregisterComponent(_GUID)
- except:
- pass
- sys.exit(0)
-
- passwords.Populate(options)
- Crawler(options).Crawl(args)
-
-
- if __name__ == '__main__':
- Main()