
/gazetteer/pre-cache-curl.py

http://alageospatialportal.googlecode.com/
Python | 145 lines
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $

#
# Usage: python retriever-multi.py <file with URLs to fetch> [<# of
#          concurrent connections>]
#

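# NOTE: in this gazetteer pre-cache variant the command-line handling below is
# commented out -- the script is run as plain "python pre-cache-curl.py", reads
# a tab-separated "sample.txt" from the working directory, and uses a fixed
# number of concurrent connections.
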
import sys
import pycurl
import time

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass


# Get args
num_conn = 100
#try:
#    if sys.argv[1] == "-":
#        urls = sys.stdin.readlines()
#    else:
#        urls = open(sys.argv[1]).readlines()
#    if len(sys.argv) >= 3:
#        num_conn = int(sys.argv[2])
#except:
#    print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
#    raise SystemExit


# Make a queue of request URLs (the original retriever-multi.py queued
# (url, filename) tuples; here only the URL itself is needed)
queue = []
#for url in urls:
#    url = url.strip()
#    if not url or url[0] == "#":
#        continue
#    filename = "doc_%03d.dat" % (len(queue) + 1)
#    queue.append((url, filename))

urlbase = "http://spatial.ala.org.au/gazetteer"
infile = open("sample.txt")  # tab-separated occurrence records
#requests = []
layers = [""]  # ["aus1", "aus2", "ibra", "imcra"]

for line in infile:
    for layer in layers:
        # With the empty layer the request goes straight to .../gazetteer/latlon/
        request = urlbase + ("/" + layer if layer else "") + "/latlon/"
        occurrence = line.split('\t')
        # Columns 1 and 2 hold latitude/longitude scaled by 10000
        latitude = float(occurrence[1]) / 10000
        longitude = float(occurrence[2]) / 10000
        request += str(latitude) + "," + str(longitude)
        queue.append(request)

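# Illustrative sketch only -- the exact column layout of sample.txt is an
# assumption inferred from the parsing above. A line such as
#   "1234\t-274612\t1530278\t..."
# would produce the request
#   http://spatial.ala.org.au/gazetteer/latlon/-27.4612,153.0278
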
# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print("PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM))
print("----- Getting %d URLs using %d connections -----" % (num_urls, num_conn))


# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, 300)
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)

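# Reusing a fixed pool of Curl handles also lets libcurl keep connections
# alive across successive requests to the same host, which helps when firing
# many small gazetteer lookups.
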
# Main loop
freelist = m.handles[:]
num_processed = 0
num_urls = len(queue)

def callback(buf):
    # WRITEFUNCTION handler: buf is a chunk of the response body.
    # Printing it is enough for pre-caching; "pass" would discard it silently.
    print(buf)

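# A minimal alternative sketch (not used here, only if the response bodies are
# wanted): give each handle its own buffer and collect the output instead of
# printing it, e.g.
#   import io
#   c.buffer = io.BytesIO()
#   c.setopt(pycurl.WRITEFUNCTION, c.buffer.write)
# then read c.buffer.getvalue() once the transfer has finished.
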
t1 = time.time()
while num_processed < num_urls:
    # If there is a URL to process and a free curl object, add to multi stack
    while queue and freelist:
        url = queue.pop(0)
        c = freelist.pop()
        # c.fp = open(filename, "wb")
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, callback)
        m.add_handle(c)
        # store some info
        #c.filename = filename
        c.url = url
    # Run the internal curl state machine for the multi stack
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Check for curl objects which have terminated, and add them to the freelist
    while 1:
        num_q, ok_list, err_list = m.info_read()
        for c in ok_list:
            #c.fp.close()
            #c.fp = None
            m.remove_handle(c)
            print("Success: %s %s" % (c.url, c.getinfo(pycurl.EFFECTIVE_URL)))
            freelist.append(c)
        for c, errno, errmsg in err_list:
            #c.fp.close()
            #c.fp = None
            m.remove_handle(c)
            print("Failed: %s %s %s" % (c.url, errno, errmsg))
            freelist.append(c)
        num_processed = num_processed + len(ok_list) + len(err_list)
        if num_q == 0:
            break
    # Currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.).
    # We just call select() to sleep until some more data is available.
    m.select(1.0)

t2 = time.time()
print("TOTAL TIME: " + str(t2 - t1) + " SECONDS")
print("AVERAGE REQ: " + str((t2 - t1) / (num_urls / len(layers))) + " SECONDS")
# Cleanup
for c in m.handles:
    if c.fp is not None:
        c.fp.close()
        c.fp = None
    c.close()
m.close()