
/Lib/robotparser.py

http://unladen-swallow.googlecode.com/
  1""" robotparser.py
  2
  3    Copyright (C) 2000  Bastian Kleineidam
  4
  5    You can choose between two licenses when using this package:
  6    1) GNU GPLv2
  7    2) PSF license for Python 2.2
  8
  9    The robots.txt Exclusion Protocol is implemented as specified in
 10    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 11"""
 12import urlparse
 13import urllib
 14
 15__all__ = ["RobotFileParser"]
 16
 17
 18class RobotFileParser:
 19    """ This class provides a set of methods to read, parse and answer
 20    questions about a single robots.txt file.
 21
 22    """
 23
 24    def __init__(self, url=''):
 25        self.entries = []
 26        self.default_entry = None
 27        self.disallow_all = False
 28        self.allow_all = False
 29        self.set_url(url)
 30        self.last_checked = 0
 31
 32    def mtime(self):
 33        """Returns the time the robots.txt file was last fetched.
 34
 35        This is useful for long-running web spiders that need to
 36        check for new robots.txt files periodically.
 37
 38        """
 39        return self.last_checked
 40
 41    def modified(self):
 42        """Sets the time the robots.txt file was last fetched to the
 43        current time.
 44
 45        """
 46        import time
 47        self.last_checked = time.time()
 48
 49    def set_url(self, url):
 50        """Sets the URL referring to a robots.txt file."""
 51        self.url = url
 52        self.host, self.path = urlparse.urlparse(url)[1:3]
 53
 54    def read(self):
 55        """Reads the robots.txt URL and feeds it to the parser."""
 56        opener = URLopener()
 57        f = opener.open(self.url)
 58        lines = [line.strip() for line in f]
 59        f.close()
 60        self.errcode = opener.errcode
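        # The HTTP status decides the default policy: 401/403 means the
        # robots.txt itself is restricted, so disallow everything; any other
        # 4xx/5xx error means there is no usable robots.txt, so allow
        # everything.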
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow a user-agent: line that is not preceded by
        one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
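        # For example, given the file
        #     User-agent: a
        #     Disallow: /private
        #
        #     User-agent: *
        #     Disallow: /tmp
        # the blank line closes the first entry (appended to self.entries),
        # and the trailing "*" entry becomes self.default_entry.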
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            # use _add_entry so a trailing "User-agent: *" block becomes
            # the default entry rather than a normal one
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
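        # reduce the URL to a quoted path: drop scheme and host, decode any
        # %-escapes, then re-encode so it compares against the quoted rule
        # paths; an empty path means "/"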
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
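        # a rule matches when its quoted path is a prefix of the requested
        # path; the special path "*" matches everything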
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
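            # substring match: e.g. an entry for "googlebot" applies to the
            # request agent "Googlebot/2.1" (token "googlebot")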
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is the normalized, URL-quoted path from can_fetch()"""
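        # first matching rule line wins; if no rule matches, the path is
        # allowed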
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
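
A minimal usage sketch (the crawler name and example.com URLs are
illustrative, not part of the module):

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")
    rp.read()        # fetch and parse; a 4xx status sets allow_all/disallow_all
    rp.modified()    # record the fetch time, later readable via mtime()
    print rp.can_fetch("MyCrawler/1.0", "http://www.example.com/private/x.html")

    # parse() can also be fed lines directly, with no network access:
    rp = robotparser.RobotFileParser()
    rp.parse(["User-agent: *", "Disallow: /tmp/"])
    print rp.can_fetch("MyCrawler/1.0", "/tmp/scratch")   # -> False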