/Lib/robotparser.py

http://unladen-swallow.googlecode.com/ · Python

  1. """ robotparser.py
  2. Copyright (C) 2000 Bastian Kleineidam
  3. You can choose between two licenses when using this package:
  4. 1) GNU GPLv2
  5. 2) PSF license for Python 2.2
  6. The robots.txt Exclusion Protocol is implemented as specified in
  7. http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
  8. """
  9. import urlparse
  10. import urllib
  11. __all__ = ["RobotFileParser"]
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0
    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]
    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # access to robots.txt itself is restricted: disallow everything
            self.disallow_all = True
        elif self.errcode >= 400:
            # robots.txt is missing or unavailable: allow everything
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)
    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)
    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        A user-agent: line need not be preceded by one or more blank
        lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    # a user-agent record without rules: discard it
                    entry = Entry()
                    state = 0
                elif state == 2:
                    # a blank line ends the current record
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self.entries.append(entry)
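    # Illustrative note (not part of the original module): feeding parse()
    # these robots.txt lines
    #
    #     User-agent: *
    #     Disallow: /cgi-bin/
    #                              <- blank line
    #     User-agent: figtree
    #     Disallow: /tmp/
    #
    # leaves default_entry holding the catch-all record (stored via
    # _add_entry() when the blank line is seen) and entries holding the
    # "figtree" record (appended when the input ends while in state 2).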
    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        # reduce the URL to its quoted path component; an empty path means "/"
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
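# Illustrative note (not part of the original module): rule matching is a
# plain prefix test, so RuleLine("/tmp", False).applies_to("/tmp/foo.html")
# is True, and a bare "Disallow:" line (empty path, allowance False) is
# turned into an allow-everything rule by the special case in __init__.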
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
class URLopener(urllib.FancyURLopener):
    """FancyURLopener subclass that records the HTTP error code and never
    prompts for credentials."""

    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
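# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module).  The host name and
# the crawler name below are placeholders; only the RobotFileParser methods
# defined above are assumed.
if __name__ == '__main__':
    rp = RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')   # placeholder URL
    rp.read()        # fetch and parse robots.txt (or set allow_all/disallow_all)
    rp.modified()    # record the fetch time so mtime() is meaningful
    print rp.can_fetch('ExampleBot/1.0', 'http://www.example.com/private/data.html')
    print rp.can_fetch('*', 'http://www.example.com/')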