/Tools/webchecker/websucker.py

http://unladen-swallow.googlecode.com/ · Python

#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "$Revision: 28654 $"

import os
import sys
import urllib
import getopt

import webchecker

# Extract real version number if necessary
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
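
# Command-line driver: -q silences output, -v raises the verbosity
# level; the remaining arguments are the root URLs to mirror.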
def main():
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()
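
# Sucker reuses webchecker's crawling machinery but saves every page
# it reads into a local mirror tree.  checkext=0 turns off checking of
# external (off-site) links; nonames=1 suppresses name-anchor checking.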
class Sucker(webchecker.Checker):

    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
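        # Prefer a copy already saved on disk; on a miss, fetch the
        # page, follow any redirect, and mirror the body to disk.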
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = self.openpage(url_pair)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url
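
    # Write the page body to its mirror path, creating any missing
    # directories first; failures are reported but not fatal.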
    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        try:
            f = open(path, "wb")
            f.write(text)
            f.close()
            self.message("saved %s", path)
        except IOError, msg:
            self.message("didn't save %s: %s", path, str(msg))
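
    # Map a URL onto a relative file path of the form <host>/<path>,
    # appending "index.html" for directory-style URLs and translating
    # "/" to the local path separator where it differs.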
    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = host.lower()
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                path = os.sep + path
        path = os.path.join(host, path)
        return path
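
# Recursive mkdir -p.  If a path component already exists as a plain
# file (a page saved earlier at what turned out to be a directory URL),
# that file is moved aside to <dir>/index.html so the directory can be
# created in its place.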
def makedirs(dir):
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)

if __name__ == '__main__':
    sys.exit(main() or 0)
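
# Example invocation (hypothetical URL; needs Python 2 and the
# webchecker.py module from the same Tools/webchecker directory):
#
#     python websucker.py -v http://www.example.com/
#
# Mirrored pages are written under ./www.example.com/ in the current
# working directory.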