
/scrapers/GrampianCareData/scrape.py

https://github.com/sneeu/aliss_scrapers
Python | 113 lines
import json
import re
import sys
import Queue
import threading
import urllib2

from BeautifulSoup import BeautifulSoup
from soupselect import select as css


TIMEOUT = 15
URL_TEMPLATE = "http://www.grampiancaredata.gov.uk/development/keyword-search/?tx_evgcdsearch_pi1%%5Breport%%5D=gcd_search&tx_evgcdsearch_pi1%%5Bstart%%5D=%d"

LOCK = threading.Lock()

data = {}
def do_work(*args):
    # Fetch one search-results page and parse every listing on it.
    url = args[0]

    with LOCK:
        print url

    html = ''.join(urllib2.urlopen(url, timeout=TIMEOUT).readlines())
    # Strip the analytics comment and any script tags before parsing.
    html = html.replace('<!- Google Analytics -->', '')
    html = re.sub('<script.*?>[\s\S]*?</.*?script>', '', html)
    soup = BeautifulSoup(html)

    item = {}

    def parse(listitem):
        # Pull the title, identifier, phone, website and postcode out of a
        # single result row and store the record keyed by its detail URL.
        title = ident = web = short_address = phone = lat = lng = None
        tags = []

        t = css(listitem, 'h1 a')
        if t:
            title = t[0].contents[0]
            ident = t[0]['href']

        t = css(listitem, '.tel-fax .record-detail')
        if t:
            phone = t[0].contents[1].strip()

        t = css(listitem, '.web a[href^=http]')
        if t:
            web = t[0]['href']

        t = css(listitem, '.p-code .record-detail')
        if t:
            short_address = str(t[0].contents[1]).strip()

        item = {
            'title': title,
            'lat': lat,
            'lng': lng,
            'url': web,
            'phone': phone,
            'short_address': short_address,
            'tags': tags,
            'origin': ident
        }

        with LOCK:
            sys.stdout.write('.')
            data[ident] = item

    # Result rows alternate between grey and white wrappers.
    for listitem in css(soup, '.search-row-grey-wrapper'):
        parse(listitem)

    for listitem in css(soup, '.search-row-white-wrapper'):
        parse(listitem)
number_of_workers = 5
work_queue = Queue.Queue()


def worker():
    # Pull URLs off the shared queue until the process exits; the threads
    # are started as daemons, so they die with the main thread.
    while True:
        item = work_queue.get()
        try:
            do_work(*item)
        except Exception, e:
            print e
        work_queue.task_done()
def main():
    urllib2.install_opener(urllib2.build_opener())

    for __ in range(number_of_workers):
        t = threading.Thread(target=worker)
        t.setDaemon(True)
        t.start()

    # Results are paged 25 at a time, so enqueue one URL per page up to
    # max_result records.
    max_result = 1945
    # max_result = 24

    for item in (URL_TEMPLATE % n for n in xrange(0, max_result, 25)):
        work_queue.put([item])

    work_queue.join()

    print json.dumps(data)


if __name__ == '__main__':
    main()
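
The script targets Python 2 (urllib2, Queue, print statements and the old BeautifulSoup 3 API). As a rough illustration of the same worker-pool pattern used above (daemon threads draining a shared queue of page URLs, with the main thread blocking on join()), a minimal Python 3 sketch might look like the following; the fetch() stub and the example URL are placeholders, not part of the original scraper.

    import queue
    import threading

    work_queue = queue.Queue()

    def fetch(url):
        # Placeholder for the real fetch-and-parse step.
        print(url)

    def worker():
        while True:
            url = work_queue.get()
            try:
                fetch(url)
            except Exception as e:
                print(e)
            finally:
                work_queue.task_done()

    def main():
        for _ in range(5):
            threading.Thread(target=worker, daemon=True).start()
        for n in range(0, 1945, 25):
            work_queue.put("http://example.invalid/search?start=%d" % n)
        work_queue.join()

    if __name__ == '__main__':
        main()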