/Tools/webchecker/websucker.py

http://unladen-swallow.googlecode.com/ · Python

#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "$Revision: 28654 $"

import os
import sys
import urllib
import getopt

import webchecker

# Extract real version number if necessary
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
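
# Command-line driver: -q silences output, -v raises the verbosity
# level; the remaining arguments are the root URLs to mirror.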
def main():
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()
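
# Sucker reuses webchecker's crawling machinery but saves every page
# it reads into a local mirror tree.  checkext=0 turns off checking of
# external (off-site) links; nonames=1 suppresses name-anchor checking.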
class Sucker(webchecker.Checker):

    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
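        # Prefer a copy already saved on disk; on a miss, fetch the
        # page, follow any redirect, and mirror the body to disk.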
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = self.openpage(url_pair)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url
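
    # Write the page body to its mirror path, creating any missing
    # directories first; failures are reported but not fatal.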
    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        try:
            f = open(path, "wb")
            f.write(text)
            f.close()
            self.message("saved %s", path)
        except IOError, msg:
            self.message("didn't save %s: %s", path, str(msg))
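
    # Map a URL onto a relative file path of the form <host>/<path>,
    # appending "index.html" for directory-style URLs and translating
    # "/" to the local path separator where it differs.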
    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = host.lower()
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                path = os.sep + path
        path = os.path.join(host, path)
        return path
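
# Recursive mkdir -p.  If a path component already exists as a plain
# file (a page saved earlier at what turned out to be a directory URL),
# that file is moved aside to <dir>/index.html so the directory can be
# created in its place.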
def makedirs(dir):
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)

if __name__ == '__main__':
    sys.exit(main() or 0)
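
# Example invocation (hypothetical URL; needs Python 2 and the
# webchecker.py module from the same Tools/webchecker directory):
#
#     python websucker.py -v http://www.example.com/
#
# Mirrored pages are written under ./www.example.com/ in the current
# working directory.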