/gazetteer/pre-cache-curl.py

http://alageospatialportal.googlecode.com/ · Python

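A pre-caching script for the ALA gazetteer: it reads occurrence records from sample.txt, builds one lat/lon lookup URL per record (and per layer), and fetches them concurrently through pycurl's CurlMulti interface. The concurrency scaffolding is adapted from pycurl's retriever-multi.py example.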
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $
#
# Adapted from pycurl's retriever-multi.py example.
# Original usage: python retriever-multi.py <file with URLs to fetch> [<# of
# concurrent connections>]
# This adaptation ignores its command-line arguments and reads occurrence
# records from sample.txt instead (see below).
#
import sys
import pycurl
import time
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass
# Get args
num_conn = 100
#try:
#    if sys.argv[1] == "-":
#        urls = sys.stdin.readlines()
#    else:
#        urls = open(sys.argv[1]).readlines()
#    if len(sys.argv) >= 3:
#        num_conn = int(sys.argv[2])
#except:
#    print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
#    raise SystemExit
# Make a queue of request URLs (the original example's (url, filename)
# queue is left commented out; this script queues plain URL strings)
queue = []
#for url in urls:
#    url = url.strip()
#    if not url or url[0] == "#":
#        continue
#    filename = "doc_%03d.dat" % (len(queue) + 1)
#    queue.append((url, filename))
urlbase = "http://spatial.ala.org.au/gazetteer"
infile = open("sample.txt")  # renamed from "file" to avoid shadowing the builtin
#requests = []
layers = [""]  # ["aus1", "aus2", "ibra", "imcra"]
for line in infile:
    for layer in layers:
        # The original concatenated urlbase + "" + layer, which drops the
        # separator for a named layer; insert "/" only when a layer is set.
        request = urlbase + ("/" + layer if layer else "") + "/latlon/"
        occurrence = line.split('\t')
        latitude = float(occurrence[1]) / 10000
        longitude = float(occurrence[2]) / 10000
        request += str(latitude) + "," + str(longitude)
        queue.append(request)
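# NOTE (assumption, not stated in the original): sample.txt appears to be
# tab-separated, with scaled-integer coordinates in columns 1 and 2
# (value / 10000 = decimal degrees). For example, a line such as
#   12345<TAB>-354210<TAB>1491810
# would yield the request .../latlon/-35.421,149.181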
# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 30)
    c.setopt(pycurl.TIMEOUT, 300)
    c.setopt(pycurl.NOSIGNAL, 1)
    m.handles.append(c)
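# Each Curl handle above is configured once and then reused for many
# requests; only the URL and write callback are reset per request below,
# which lets libcurl reuse connections to the same host across requests.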
# Main loop
freelist = m.handles[:]
num_processed = 0
num_urls = len(queue)
def callback(buf):
    print(buf)
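# For a pure cache-warming run the response bodies are not needed; a
# hypothetical alternative (hinted at by the original's commented-out pass)
# would simply discard them:
#   def callback(buf):
#       pass  # the GET itself is what warms the cache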
t1 = time.time()
while num_processed < num_urls:
    # If there is an url to process and a free curl object, add to multi stack
    while queue and freelist:
        url = queue.pop(0)
        c = freelist.pop()
        # c.fp = open(filename, "wb")
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, callback)
        m.add_handle(c)
        # store some info
        #c.filename = filename
        c.url = url
    # Run the internal curl state machine for the multi stack
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Check for curl objects which have terminated, and add them to the freelist
    while 1:
        num_q, ok_list, err_list = m.info_read()
        for c in ok_list:
            #c.fp.close()
            #c.fp = None
            m.remove_handle(c)
            print "Success:", c.url, c.getinfo(pycurl.EFFECTIVE_URL)
            freelist.append(c)
        for c, errno, errmsg in err_list:
            #c.fp.close()
            #c.fp = None
            m.remove_handle(c)
            print "Failed: ", c.url, errno, errmsg
            freelist.append(c)
        num_processed = num_processed + len(ok_list) + len(err_list)
        if num_q == 0:
            break
    # Currently no more I/O is pending, could do something in the meantime
    # (display a progress bar, etc.).
    # We just call select() to sleep until some more data is available.
    m.select(1.0)
t2 = time.time()
print("TOTAL TIME: " + str(t2 - t1) + " SECONDS")
# num_urls / len(layers) is the number of input lines, so this reports the
# average time per input record; float() avoids integer division in Python 2.
print("AVERAGE REQ: " + str((t2 - t1) / (num_urls / float(len(layers)))) + " SECONDS")
# Cleanup
for c in m.handles:
    if c.fp is not None:
        c.fp.close()
        c.fp = None
    c.close()
m.close()
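Usage note: with a suitably formatted sample.txt in the working directory, the script runs as python pre-cache-curl.py (Python 2, with pycurl installed) and takes no command-line arguments; concurrency is fixed by the hard-coded num_conn = 100, capped at the number of queued URLs.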