/gazetteer/pre-cache-curl.py
http://alageospatialportal.googlecode.com/ · Python
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $
#
# Based on the pycurl retriever-multi.py example, whose original usage was
#   python retriever-multi.py <file with URLs to fetch> [<# of concurrent connections>]
# This modified copy ignores command-line arguments: it reads coordinates from
# sample.txt and pre-caches gazetteer lookups (see below).
#

import sys
import pycurl
import time

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass

# Get args
num_conn = 100
#try:
#    if sys.argv[1] == "-":
#        urls = sys.stdin.readlines()
#    else:
#        urls = open(sys.argv[1]).readlines()
#    if len(sys.argv) >= 3:
#        num_conn = int(sys.argv[2])
#except:
#    print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
#    raise SystemExit

# Make a queue with (url, filename) tuples
queue = []
#for url in urls:
#    url = url.strip()
#    if not url or url[0] == "#":
#        continue
#    filename = "doc_%03d.dat" % (len(queue) + 1)
#    queue.append((url, filename))

# Build one gazetteer lookup URL per coordinate (and per layer, if any are
# enabled) from the tab-separated records in sample.txt.
urlbase = "http://spatial.ala.org.au/gazetteer"
infile = open("sample.txt")
#requests = []
layers = [""]  # ["aus1", "aus2", "ibra", "imcra"]
for line in infile:
    for layer in layers:
        # A "/" separator is needed when a named layer is used; with the
        # empty layer the URL is simply <urlbase>/latlon/<lat>,<lon>.
        request = urlbase + ("/" + layer if layer else "") + "/latlon/"
        occurrence = line.split('\t')
        # Fields 1 and 2 hold the coordinates scaled by 10000.
        latitude = float(occurrence[1]) / 10000
        longitude = float(occurrence[2]) / 10000
        request += str(latitude) + "," + str(longitude)
        queue.append(request)
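
# The exact layout of sample.txt is not documented here; the loop above only
# assumes a tab-separated record whose second and third fields are latitude
# and longitude multiplied by 10000. A hypothetical input line matching that
# assumption (field values are illustrative only) would look like
#
#   <record-id>\t-352800\t1491300\t...
#
# which yields the request <urlbase>/latlon/-35.28,149.13.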

# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, 300)
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)

# Main loop
freelist = m.handles[:]
num_processed = 0
num_urls = len(queue)

def callback(buf):
    # Shared write callback: response bodies are simply printed instead of
    # being written to per-URL files as in the original retriever example.
    # pass
    print(buf)
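
# The commented-out "pass" above suggests the response body could simply be
# discarded when warming the cache. A sketch of that quieter alternative
# (hypothetical; only takes effect if substituted for callback in the
# c.setopt(c.WRITEFUNCTION, ...) call below):
#
# def discard_callback(buf):
#     # Accept and drop the chunk; returning None tells libcurl the whole
#     # buffer was consumed.
#     return None
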
t1 = time.time()
while num_processed < num_urls:
    # If there is an url to process and a free curl object, add to multi stack
    while queue and freelist:
        url = queue.pop(0)
        c = freelist.pop()
        # c.fp = open(filename, "wb")
        c.setopt(pycurl.URL, url)
        c.setopt(c.WRITEFUNCTION, callback)
        m.add_handle(c)
        # store some info
        #c.filename = filename
        c.url = url
    # Run the internal curl state machine for the multi stack
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Check for curl objects which have terminated, and add them to the freelist
    while 1:
        num_q, ok_list, err_list = m.info_read()
        for c in ok_list:
            #c.fp.close()
            #c.fp = None
            m.remove_handle(c)
            print "Success:", c.url, c.getinfo(pycurl.EFFECTIVE_URL)
            freelist.append(c)
        for c, errno, errmsg in err_list:
            #c.fp.close()
            #c.fp = None
            m.remove_handle(c)
            print "Failed: ", c.url, errno, errmsg
            freelist.append(c)
        num_processed = num_processed + len(ok_list) + len(err_list)
        if num_q == 0:
            break
    # Currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.).
    # We just call select() to sleep until some more data is available.
    m.select(1.0)
t2 = time.time()
print("TOTAL TIME: " + str(t2 - t1) + " SECONDS")
# Note: this averages over the number of input lines (num_urls / len(layers)),
# i.e. per coordinate rather than per individual request.
print("AVERAGE REQ: " + str((t2 - t1) / (num_urls / len(layers))) + " SECONDS")

# Cleanup
for c in m.handles:
    if c.fp is not None:
        c.fp.close()
        c.fp = None
    c.close()
m.close()
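
# A minimal spot-check sketch, assuming the same /latlon/<lat>,<lon> endpoint
# shape used above: it fetches a single, purely illustrative coordinate with
# pycurl's easy interface so the gazetteer URL can be verified before a full
# pre-cache run. Kept commented out so it does not run as part of this job.
#
# from StringIO import StringIO   # Python 2, matching this script
#
# buf = StringIO()
# c = pycurl.Curl()
# c.setopt(pycurl.URL, urlbase + "/latlon/-35.28,149.13")  # example coordinate
# c.setopt(pycurl.WRITEFUNCTION, buf.write)
# c.setopt(pycurl.FOLLOWLOCATION, 1)
# c.setopt(pycurl.CONNECTTIMEOUT, 30)
# c.perform()
# print("HTTP %d" % c.getinfo(pycurl.RESPONSE_CODE))
# print(buf.getvalue())
# c.close()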