
/scrapers/GrampianCareData/scrape.py

https://github.com/sneeu/aliss_scrapers
Python | 113 lines | 100 code | 12 blank | 1 comment | 1 complexity | 3fce80bca50c4bec333f6516643b5652 MD5
import json
import re
import sys
import Queue
import threading
import urllib2

from BeautifulSoup import BeautifulSoup
from soupselect import select as css


TIMEOUT = 15
URL_TEMPLATE = "http://www.grampiancaredata.gov.uk/development/keyword-search/?tx_evgcdsearch_pi1%%5Breport%%5D=gcd_search&tx_evgcdsearch_pi1%%5Bstart%%5D=%d"

LOCK = threading.Lock()

data = {}


def do_work(*args):
    # Fetch one results page and parse every listing on it.
    url = args[0]

    with LOCK:
        # Diagnostics go to stderr so stdout stays valid JSON for the final dump.
        print >> sys.stderr, url

    html = urllib2.urlopen(url, timeout=TIMEOUT).read()
    # Drop the malformed Google Analytics comment and all script blocks,
    # which otherwise trip up BeautifulSoup's parser.
    html = html.replace('<!- Google Analytics -->', '')
    html = re.sub(r'<script.*?>[\s\S]*?</.*?script>', '', html)
    soup = BeautifulSoup(html)

    def parse(listitem):
        title = ident = web = short_address = phone = lat = lng = None
        tags = []

        t = css(listitem, 'h1 a')
        if t:
            title = t[0].contents[0]
            ident = t[0]['href']

        t = css(listitem, '.tel-fax .record-detail')
        if t:
            phone = t[0].contents[1].strip()

        t = css(listitem, '.web a[href^=http]')
        if t:
            web = t[0]['href']

        t = css(listitem, '.p-code .record-detail')
        if t:
            short_address = str(t[0].contents[1]).strip()

        item = {
            'title': title,
            'lat': lat,
            'lng': lng,
            'url': web,
            'phone': phone,
            'short_address': short_address,
            'tags': tags,
            'origin': ident,
        }

        with LOCK:
            sys.stderr.write('.')
            data[ident] = item

    # Result rows alternate between grey and white wrappers.
    for listitem in css(soup, '.search-row-grey-wrapper'):
        parse(listitem)
    for listitem in css(soup, '.search-row-white-wrapper'):
        parse(listitem)


number_of_workers = 5
work_queue = Queue.Queue()


def worker():
    while True:
        item = work_queue.get()
        try:
            do_work(*item)
        except Exception, e:
            print >> sys.stderr, e
        work_queue.task_done()


def main():
    urllib2.install_opener(urllib2.build_opener())

    for __ in range(number_of_workers):
        t = threading.Thread(target=worker)
        t.setDaemon(True)
        t.start()

    max_result = 1945
    # max_result = 24
    # Results are paged 25 at a time; queue one URL per page.
    for item in (URL_TEMPLATE % n for n in xrange(0, max_result, 25)):
        work_queue.put([item])

    work_queue.join()

    print json.dumps(data)


if __name__ == '__main__':
    main()
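
The urllib2, Queue, BeautifulSoup 3 and soupselect dependencies above are Python 2 only. For context, here is a minimal Python 3 sketch of the same fetch-and-parse step, assuming the requests and beautifulsoup4 packages are available and the page markup is unchanged; the selectors and URL template are copied from the script, while parse_page and the single-page smoke test are illustrative names, not part of the original repository.

import json
import sys

import requests
from bs4 import BeautifulSoup

TIMEOUT = 15
URL_TEMPLATE = ("http://www.grampiancaredata.gov.uk/development/keyword-search/"
                "?tx_evgcdsearch_pi1%%5Breport%%5D=gcd_search"
                "&tx_evgcdsearch_pi1%%5Bstart%%5D=%d")


def parse_page(start):
    """Fetch one results page and yield a dict per listing (subset of fields)."""
    html = requests.get(URL_TEMPLATE % start, timeout=TIMEOUT).text
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.select(".search-row-grey-wrapper, .search-row-white-wrapper")
    for row in rows:
        link = row.select_one("h1 a")
        if link is None:
            continue
        phone = row.select_one(".tel-fax .record-detail")
        web = row.select_one(".web a[href^=http]")
        yield {
            "title": link.get_text(strip=True),
            "origin": link.get("href"),
            "phone": phone.get_text(strip=True) if phone else None,
            "url": web.get("href") if web else None,
        }


if __name__ == "__main__":
    # Smoke test on the first page only; the full run steps `start`
    # through the paginated results in increments of 25.
    json.dump(list(parse_page(0)), sys.stdout)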