/gazetteer/pre-cache-curl.py
http://alageospatialportal.googlecode.com/ · Python
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $
#
# Based on the pycurl retriever-multi.py example, whose original usage was
#   python retriever-multi.py <file with URLs to fetch> [<# of concurrent connections>]
# This modified copy ignores command-line arguments: it reads coordinates from
# sample.txt and pre-caches gazetteer lookups (see below).
#

import sys
import pycurl
import time

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass

# Get args
num_conn = 100
#try:
#    if sys.argv[1] == "-":
#        urls = sys.stdin.readlines()
#    else:
#        urls = open(sys.argv[1]).readlines()
#    if len(sys.argv) >= 3:
#        num_conn = int(sys.argv[2])
#except:
#    print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
#    raise SystemExit

# Make a queue with (url, filename) tuples
queue = []
#for url in urls:
#    url = url.strip()
#    if not url or url[0] == "#":
#        continue
#    filename = "doc_%03d.dat" % (len(queue) + 1)
#    queue.append((url, filename))

# Build one gazetteer lookup URL per coordinate (and per layer, if any are
# enabled) from the tab-separated records in sample.txt.
urlbase = "http://spatial.ala.org.au/gazetteer"
infile = open("sample.txt")
#requests = []
layers = [""]  # ["aus1", "aus2", "ibra", "imcra"]
for line in infile:
    for layer in layers:
        # A "/" separator is needed when a named layer is used; with the
        # empty layer the URL is simply <urlbase>/latlon/<lat>,<lon>.
        request = urlbase + ("/" + layer if layer else "") + "/latlon/"
        occurrence = line.split('\t')
        # Fields 1 and 2 hold the coordinates scaled by 10000.
        latitude = float(occurrence[1]) / 10000
        longitude = float(occurrence[2]) / 10000
        request += str(latitude) + "," + str(longitude)
        queue.append(request)
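
# The exact layout of sample.txt is not documented here; the loop above only
# assumes a tab-separated record whose second and third fields are latitude
# and longitude multiplied by 10000. A hypothetical input line matching that
# assumption (field values are illustrative only) would look like
#
#   <record-id>\t-352800\t1491300\t...
#
# which yields the request <urlbase>/latlon/-35.28,149.13.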

# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, 300)
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)

# Main loop
freelist = m.handles[:]
num_processed = 0
num_urls = len(queue)

def callback(buf):
    # Shared write callback: response bodies are simply printed instead of
    # being written to per-URL files as in the original retriever example.
    # pass
    print(buf)
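
# The commented-out "pass" above suggests the response body could simply be
# discarded when warming the cache. A sketch of that quieter alternative
# (hypothetical; only takes effect if substituted for callback in the
# c.setopt(c.WRITEFUNCTION, ...) call below):
#
# def discard_callback(buf):
#     # Accept and drop the chunk; returning None tells libcurl the whole
#     # buffer was consumed.
#     return None
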
t1 = time.time()
while num_processed < num_urls:
    # If there is an url to process and a free curl object, add to multi stack
    while queue and freelist:
        url = queue.pop(0)
        c = freelist.pop()
        # c.fp = open(filename, "wb")
        c.setopt(pycurl.URL, url)
        c.setopt(c.WRITEFUNCTION, callback)
        m.add_handle(c)
        # store some info
        #c.filename = filename
        c.url = url
    # Run the internal curl state machine for the multi stack
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Check for curl objects which have terminated, and add them to the freelist
    while 1:
        num_q, ok_list, err_list = m.info_read()
        for c in ok_list:
            #c.fp.close()
            #c.fp = None
            m.remove_handle(c)
            print "Success:", c.url, c.getinfo(pycurl.EFFECTIVE_URL)
            freelist.append(c)
        for c, errno, errmsg in err_list:
            #c.fp.close()
            #c.fp = None
            m.remove_handle(c)
            print "Failed: ", c.url, errno, errmsg
            freelist.append(c)
        num_processed = num_processed + len(ok_list) + len(err_list)
        if num_q == 0:
            break
    # Currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.).
    # We just call select() to sleep until some more data is available.
    m.select(1.0)
t2 = time.time()
print("TOTAL TIME: " + str(t2 - t1) + " SECONDS")
# Note: this averages over the number of input lines (num_urls / len(layers)),
# i.e. per coordinate rather than per individual request.
print("AVERAGE REQ: " + str((t2 - t1) / (num_urls / len(layers))) + " SECONDS")

# Cleanup
for c in m.handles:
    if c.fp is not None:
        c.fp.close()
        c.fp = None
    c.close()
m.close()
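
# A minimal spot-check sketch, assuming the same /latlon/<lat>,<lon> endpoint
# shape used above: it fetches a single, purely illustrative coordinate with
# pycurl's easy interface so the gazetteer URL can be verified before a full
# pre-cache run. Kept commented out so it does not run as part of this job.
#
# from StringIO import StringIO   # Python 2, matching this script
#
# buf = StringIO()
# c = pycurl.Curl()
# c.setopt(pycurl.URL, urlbase + "/latlon/-35.28,149.13")  # example coordinate
# c.setopt(pycurl.WRITEFUNCTION, buf.write)
# c.setopt(pycurl.FOLLOWLOCATION, 1)
# c.setopt(pycurl.CONNECTTIMEOUT, 30)
# c.perform()
# print("HTTP %d" % c.getinfo(pycurl.RESPONSE_CODE))
# print(buf.getvalue())
# c.close()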