
/Gspider/kongulo/kongulo.py

http://devj.googlecode.com/
Python | 428 lines
Possible License(s): LGPL-2.1, BSD-3-Clause
#!/usr/bin/env python
# Copyright (c) 2005, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following disclaimer
#    in the documentation and/or other materials provided with the
#    distribution.
#  * Neither the name of Google Inc. nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import robotparser
import urllib
import urllib2
import re
import sets
import sys
import urlparse
import win32com.client
import time
import pywintypes
import pythoncom
import optparse
import getpass
import itertools
import email.Utils

'''A simple web crawler that pushes pages into GDS.  Features include:
  - Knows basic and digest HTTP authentication
  - Obeys robots.txt
  - Can loop, recrawling over previously crawled pages every X minutes
  - When recrawling, uses If-Modified-Since HTTP header to minimize transfers

For usage instructions, run with -h flag.

Requires Python 2.4 and the win32all extensions for Python 2.4 on Windows.
Will not work unless Google Desktop Search 1.0 or later is installed.
'''
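
# For illustration only: a typical invocation, based on the command-line
# options defined in Main() below, might look like this (the host name is a
# made-up example):
#
#   python kongulo.py -d 3 -m ".+intranet\.example\.com.+" -l -s 120 \
#       http://intranet.example.com/
#
# This crawls three links deep from the base URL, restricts the crawl to URLs
# matching the -m pattern, and recrawls every 120 minutes, using
# If-Modified-Since so unchanged pages are not re-fetched.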

# Matches URLs in <a href=...> tags.  Chosen above htmllib.HTMLParser because
# this is much more lenient, not requiring HTML to be valid.
_LINK_RE = re.compile(r'<\s*(a|img).+href\s*=\s*"?(.+?)"?(\s|>)',
                      re.MULTILINE | re.IGNORECASE)

# Matches <frame src="bla"> tags.
_FRAME_RE = re.compile(r'<\s*(frame).+src\s*=\s*"?(.+?)"?(\s|>)',
                       re.MULTILINE | re.IGNORECASE)

# Digs out the text of an HTML document's title.
_TITLE_RE = re.compile(r'<\s*title.*?>(.+)</\s*title\s*>',
                       re.MULTILINE | re.IGNORECASE)
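
# A rough illustration of how these patterns are used (group 2 of _LINK_RE and
# _FRAME_RE holds the URL, group 1 of _TITLE_RE holds the title text); the
# host name here is a made-up example:
#
#   >>> _LINK_RE.search('<a class="ext" href="http://example.com/">').group(2)
#   'http://example.com/'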

# This plugin's GUID, used to register with GDS.
_GUID = '{5e1788fe-a6e6-429f-816c-80cb969028d3}'

class NoExceptionHandler(urllib2.BaseHandler):
  '''An exception handler for HTTP that never throws an exception for various
  error codes that Kongulo always checks explicitly rather than catching them
  as exceptions.'''
  def http_error_304(self, req, fp, code, msg, hdrs):
    '''We handle not-modified-since explicitly.'''
    return fp

  # We check error codes explicitly so we don't want an exception
  http_error_400 = http_error_401 = http_error_402 = http_error_403 \
      = http_error_404 = http_error_304

class PasswordDb(urllib2.HTTPPasswordMgr):
  '''A very simple password store.  The user can supply usernames using the
  -p flag on the command line, and will be prompted for the password for
  each username.'''
  def __init__(self):
    self.passwords = []  # [ [substring, uid, pw], [substring, uid, pw] ]

  def Populate(self, options):
    '''Given an options object as used by Kongulo, ask the user for the
    password for each user-id/substring-of-domain that the user provided using
    the -p flag.'''
    if not options.pw:
      return
    for item in options.pw.split(','):
      (uid, substring) = item.split('@')
      pw = getpass.getpass('Enter password for %s: ' % item)
      self.passwords.append([substring, uid, pw])

  def find_user_password(self, *args, **kw):
    for passdata in self.passwords:
      for name in args:
        if name.find(passdata[0]) != -1:
          return (passdata[1], passdata[2])
    print "!!! Need login info for (%s @ %s), consider using -p flag" % args
    return (None, None)

passwords = PasswordDb()

# A URL opener that can do basic and digest authentication, and never raises
# exceptions for HTTP error codes we handle explicitly.
opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(passwords),
                              urllib2.HTTPDigestAuthHandler(passwords),
                              NoExceptionHandler())

# To be a nice Internet citizen, we identify ourselves properly so that
# whoever doesn't like Kongulo can exclude us using robots.txt
opener.addheaders = [('User-agent', 'Kongulo v0.1 personal web crawler')]

# Should always be true on Windows systems.
assert hasattr(opener.handlers[0],
               'proxies'), 'ProxyHandler must be first handler.'
# This parses Windows proxy registry settings
opener.handlers[0].proxies = urllib.getproxies()

class LenientRobotParser(robotparser.RobotFileParser):
  '''Adds ability to parse robot files where same user agent is specified
  multiple times.'''
  def __init__(self, url):
    '''Setup internal state like RobotFileParser does.'''
    robotparser.RobotFileParser.__init__(self)
    f = opener.open(url)
    lines = []
    line = f.readline()
    while line:
      lines.append(line.strip())
      line = f.readline()
    self.errcode = f.code
    if self.errcode == 401 or self.errcode == 403:
      self.disallow_all = 1
    elif self.errcode >= 400:
      self.allow_all = 1
    elif self.errcode == 200 and lines:
      self.parse(lines)

  def parse(self, lines):
    """Strip repeated sequential definitions of same user agent, then
    call base's parse method."""
    last_ua = ''
    modified_lines = []
    for line in lines:
      if line.lower().startswith('user-agent'):
        temp = last_ua
        last_ua = line.lower()
        if last_ua == temp:
          continue  # skip line
      if line.strip() == '':
        last_ua = ''  # reset on blank line
      modified_lines += [line]
    robotparser.RobotFileParser.parse(self, modified_lines)

class UrlValidator:
  '''An object that handles checking if we should fetch and crawl a specific
  URL.  This is based on the type of the URL (only crawl http URLs) and robot
  rules.  Maintains a cache of robot rules already fetched.'''
  def __init__(self, match_url):
    self.robots = {}  # Dict of robot URLs to robot parsers
    self.match_url = re.compile(match_url)

  def IsCrawlable(self, url):
    """Returns true if it's OK to crawl the absolute URL provided."""
    if not url.startswith('http') or not self.match_url.match(url):
      return 0
    return self.GetRules(url).can_fetch('*', url)

  def GetRules(self, url):
    """Returns the robot rules parser for 'url'"""
    robots_dir = urlparse.urljoin(url, "robots.txt")  # First try dir-level
    if robots_dir in self.robots:
      return self.robots[robots_dir]
    robots_site = urlparse.urljoin(url, "/robots.txt")  # Then the site-level
    if robots_site in self.robots:
      return self.robots[robots_site]
    # Inv: Our cache contains neither a dir-level nor site-level robots.txt
    rules = LenientRobotParser(robots_dir)  # First try dir-level
    if hasattr(rules, 'errcode') and rules.errcode == 200:
      self.robots[robots_dir] = rules
    else:
      rules = LenientRobotParser(robots_site)  # Then try site-level
      self.robots[robots_site] = rules
    return rules

class Crawler:
  '''This object holds the state of the crawl, and performs the crawl.'''
  def __init__(self, options):
    self.options = options  # Store the options provided
    self.rules = UrlValidator(options.match)  # Cache of robot rules etc.

    # Invariant of data:
    # - 'tocrawl' is a list of items that we have or will crawl.  If we have
    #   never crawled them since we started, the item at index 2 in each
    #   crawlitem is None, otherwise it is a dictionary of headers,
    #   specifically the 'If-Modified-Since' header, to prevent us from
    #   fetching this item in the next crawl if it hasn't been modified.
    # - 'scheduled' is a list of items we have already added to 'tocrawl'
    #   (perhaps a premature optimization since we could just iterate over
    #   'tocrawl')
    self.scheduled = sets.Set()
    # Format of this list is:
    # [[url1, depth1, { headername : headerval, ... } ], [url2, depth2], {}...]
    self.tocrawl = []

    # Fetch the entrypoint to the Google Desktop Search API.
    self.event_factory = win32com.client.Dispatch(
        'GoogleDesktopSearch.EventFactory')

  def ExtractLinks(self, baseurl, htmldoc):
    """Returns all anchors from the document with contents 'htmldoc' at
    'baseurl' that are OK to crawl."""
    urls = []
    for match in itertools.chain(_LINK_RE.finditer(htmldoc),
                                 _FRAME_RE.finditer(htmldoc)):
      url = urlparse.urljoin(baseurl, match.group(2))
      if self.rules.IsCrawlable(url):
        urls += [url]
      else:
        print " I %s" % url
    return urls
  def Crawl(self, baseurls):
    '''Performs the crawl.

    Args:
      baseurls: [url1, url2, ...]
    '''
    # Bootstrap our invariant of data
    for baseurl in baseurls:
      self.tocrawl.append([baseurl, self.options.depth, None])

    if self.options.loop:
      print "Running in loop mode - press Ctrl-C to stop."

    while True:
      for crawlitem in self.tocrawl:
        (url, depth, headers) = crawlitem
        try:
          if headers:
            doc = opener.open(urllib2.Request(url, headers=headers))
          else:
            doc = opener.open(url)
          doctype = doc.info().type
          if doc.code == 304:  # not modified since last time
            print "--- (nomod) %s" % url
          elif doc.code == 200 and (doctype == 'text/html' or
                                    doctype == 'text/plain'):
            print "::: (%d) %s" % (depth, url)

            # Store last modified in the crawlitem.
            # Prefer Last-Modified header, then Date header (to get same
            # formatting as used by the server), then current date in
            # appropriate format.
            last_modified = None
            if 'last-modified' in doc.headers:
              last_modified = doc.headers['last-modified']
            elif 'date' in doc.headers:
              last_modified = doc.headers['date']
            else:
              last_modified = email.Utils.formatdate(time.time(), usegmt=True)
            crawlitem[2] = { 'If-Modified-Since' : last_modified }

            content = doc.read()

            # Create a GDS event, populate its fields, and send it off to have
            # the web page added to the Google Desktop Search index.
            event = self.event_factory.CreateEvent(_GUID,
                                                   'Google.Desktop.WebPage')
            event.AddProperty('format', doctype)
            event.AddProperty('content', content)
            event.AddProperty('uri', url)
            # TODO Use the last-modified HTTP header instead of current time
            # if available.
            event.AddProperty('last_modified_time',
                              pywintypes.Time(time.time() + time.timezone))

            if doctype == 'text/html':  # no links in text documents
              title_match = _TITLE_RE.search(content)
              if title_match:
                title = title_match.group(1)
                event.AddProperty('title', title)

              for link in self.ExtractLinks(doc.geturl(), content):
                if depth > 0 and not link in self.scheduled:
                  self.scheduled.add(link)
                  self.tocrawl.append([link, depth - 1, None])

            # Don't use historical flag, because if we do, GDS will "throttle"
            # the events we send, not returning until the user becomes idle.
            # We also want to ensure the page is updated in the cache (in case
            # the user already visited it herself using a browser).
            event.Send(0x01)
          else:
            print "!!! (HTTP %d) %s" % (doc.code, url)
          doc.close()
        except IOError:
          print "!!! (nolink) %s" % url
        except ValueError:
          print "!!! (noauth) %s" % url

      if not self.options.loop:
        break
      else:
        print ("=== Completed crawl; will recrawl in %d minutes." %
               (self.options.sleep))
        time.sleep(60 * self.options.sleep)

def Main():
  '''This function contains the logic for the command-line UI for Kongulo.'''
  # Set up options and parse arguments.
  parser = optparse.OptionParser(usage='%prog [options] BASEURL1 BASEURL2 ...')
  parser.add_option('-d', '--depth', type='int', dest='depth', default=0,
                    help='How deep to follow links from BASEURLs (default 0, '
                         'suggest max 5-6)')
  parser.add_option('-m', '--match', dest='match', default='.+',
                    help=r'Regular expression that URLs must match if they are '
                         'to be crawled, e.g. ".+intranet\.smurfgeburf\.com.+" to '
                         'stay within the Smurfgeburf intranet')
  parser.add_option('-l', '--loop', action='store_true', dest='loop',
                    default=False, help='If this flag is given, Kongulo will '
                         'keep fetching the specified page and pages it points to. '
                         'It will not refetch pages that haven\'t changed.')
  parser.add_option('-s', '--sleep', type='int', dest='sleep', default=60,
                    help='Number of minutes to sleep before looping (default '
                         '60).  Only valid if -l is also specified.')
  parser.add_option('-p', '--passwords', dest='pw',
                    help='Comma-delimited list of user IDs at names that will '
                         'be matched as substrings against the domain or "region" '
                         'that a password is needed for, e.g. '
                         '"joi@google.com,admin@192.168.250.1,snafu@slashdot.org". '
                         'You will be prompted for each password.')
  parser.add_option('-u', '--unregister', action='store_true', dest='unreg',
                    help='Run with this flag to unregister the plugin. '
                         'All other options are ignored when you use this flag.')
  (options, args) = parser.parse_args()

  if len(args) < 1 and not options.unreg:
    parser.error('Provide at least one base URL')

  try:
    obj = win32com.client.Dispatch('GoogleDesktopSearch.Register')
  except pythoncom.ole_error:
    print ('ERROR: You need to install Google Desktop Search to be able to '
           'use Kongulo.')
    sys.exit(2)

  if not options.unreg:
    try:
      # Register with GDS.  This is a one-time operation and will return an
      # error if already registered.  We cheat and just catch the error and
      # do nothing.
      # We try two different methods since different versions of GD have
      # different names for the registration method.  We ignore the specific
      # exception that we get if the method is not supported.
      try:
        obj.RegisterComponent(_GUID,
            ['Title', 'Kongulo', 'Description', 'A simple web spider that '
             'lets you keep copies of web sites in your Google Desktop Search '
             'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134'])
      except AttributeError, e:
        if (len(e.args) == 0 or
            e.args[0] != 'GoogleDesktopSearch.Register.RegisterComponent'):
          raise e
        else:
          obj.RegisterIndexingComponent(_GUID,
              ['Title', 'Kongulo', 'Description', 'A simple web spider that '
               'lets you keep copies of web sites in your Google Desktop Search '
               'index.', 'Icon', '%SystemRoot%\system32\SHELL32.dll,134'])
    except pywintypes.com_error, e:
      if len(e.args) > 0 and e.args[0] == -2147352567:
        # This is the error we get if already registered.
        pass
      else:
        raise e
  else:
    # Try both approaches to unregister, too.
    try:
      obj.UnregisterIndexingComponent(_GUID)
    except:
      pass
    try:
      obj.UnregisterComponent(_GUID)
    except:
      pass
    sys.exit(0)

  passwords.Populate(options)
  Crawler(options).Crawl(args)


if __name__ == '__main__':
  Main()