/scripts/olympic.py
https://gitlab.com/camilo-celis/DB_SQuirreL · Python
#!/usr/bin/env python
"""
Scrape Olympic data and store in CSV and JSON files.
Written by Joel Addison, 2012
"""
import string
import urllib2

import simplejson
from bs4 import BeautifulSoup
from progressbar import ProgressBar

import unicode_csv
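
# NOTE: this script targets Python 2 (urllib2, print statements, iteritems).
# unicode_csv is assumed to be a local helper module shipped with this repo
# that wraps csv.writer with unicode support; it is not a standard library.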


class Games(object):
    def __init__(self):
        self.athletes = {}
        self.events = {}
        self.countries = {}
        self.athlete_errors = []
        self.base_url = 'http://databaseolympics.com/games/gameslist.htm'

    def get_games_list(self):
        html_data = urllib2.urlopen(self.base_url).read()
        # Create the soup object from the HTML data
        self.soup = BeautifulSoup(html_data)

    def get_games_data(self, games_type):
        games_links = self.soup.find(text=games_type).find_next('table').find_all('a')
        details = {}
        for link in games_links:
            url = "http://databaseolympics.com/games/gamesyear.htm?g=%s" % link['href'].split('=')[1]
            page = BeautifulSoup(urllib2.urlopen(url).read(), from_encoding="iso-8859-1")
            year = link.string
            details[year] = {}
            countries = page.find(text="Country").find_next("table").find_all('a')
            details[year]['countries'] = {}
            for country in countries:
                c_url = "http://databaseolympics.com%s" % country['href']
                c_name = country.encode_contents()
                details[year]['countries'][c_name] = c_url
        return details
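
    # Illustrative shape of the dict returned above; the year, country name
    # and URL path are made-up examples (only the 'cty=' query parameter is
    # certain from this code):
    # {'2008': {'countries': {'Australia': 'http://databaseolympics.com/...?cty=AUS'}}}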

    def get_country_data(self, country_url):
        page = BeautifulSoup(urllib2.urlopen(country_url).read(), from_encoding="iso-8859-1")
        events = []
        table = page.find_all('table')[2].find_all('tr')
        for row in table[1:]:  # Skip the header row.
            cells = row.find_all('td')
            athlete_id = unicode(cells[3].find('a')['href'].split('ilkid=')[1])
            events.append(dict(
                year=cells[0].find('a').string,
                sport=cells[1].find('a').string,
                event=cells[2].find('a').string,
                athlete=athlete_id,
                result=cells[4].string or "",
                medal=cells[5].string or "",
            ))
        return events
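
    # Each dict in the list returned by get_country_data() looks like this
    # (values are illustrative, not from a live scrape):
    # {'year': u'2008', 'sport': u'Swimming', 'event': u'100m Freestyle',
    #  'athlete': u'SMITHJOH01', 'result': u'', 'medal': u'GOLD'}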

    def get_athlete_data(self, athlete, a_name=None):
        # e.g. http://databaseolympics.com/players/playerpage.htm?ilkid=GONZAMAR03
        if not self.athletes.get(athlete, None):
            aid = urllib2.quote(athlete.encode('latin-1'))
            url = "http://databaseolympics.com/players/playerpage.htm?ilkid=%s" % aid
            page = BeautifulSoup(urllib2.urlopen(url).read(), from_encoding="iso-8859-1")
            name = page.find('h1')
            bio = name.find_next('font').find_all('a')
            birthdate = ""
            country = ""
            sport = ""
            for detail in bio:
                if 'sport' in detail['href']:
                    sport = detail.string
                elif 'country' in detail['href']:
                    country = detail.string
                elif 'birthday' in detail['href']:
                    birthdate = detail.string
            if a_name is None:
                a_name = name.string
            self.athletes[athlete] = dict(
                name=a_name,
                country=country,
                sport=sport,
                birthdate=birthdate,
            )
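
    # Cached records are keyed by the ilkid athlete id, e.g. (values illustrative):
    # self.athletes['GONZAMAR03'] == {'name': u'...', 'country': u'...',
    #                                 'sport': u'...', 'birthdate': u'...'}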

    def write_games_csvs(self, games_details, season):
        for year, details in games_details.iteritems():
            print " Processing %s..." % year
            events = []
            for country, c_url in details['countries'].iteritems():
                print " country: %s" % country
                events.extend(self.get_country_data(c_url))
                cid = c_url.split('cty=')[1]
                self.countries[cid] = country
            # Write events to csv
            with open('%s_%s.csv' % (season, year), 'wb') as wf:
                writer = unicode_csv.UnicodeWriter(wf)
                writer.writerow(['year', 'sport', 'event', 'athlete', 'result', 'medal'])
                for event in events:
                    writer.writerow([event['year'], event['sport'], event['event'],
                                     event['athlete'], event['result'], event['medal']])
            # Write events to json
            with open('%s_%s.json' % (season, year), 'wb') as wf:
                simplejson.dump(events, wf)

    def write_athlete_csv(self):
        with open('all_athletes.csv', 'wb') as wf:
            writer = unicode_csv.UnicodeWriter(wf)
            writer.writerow(['athlete_id', 'name', 'country', 'sport', 'birthdate'])
            for aid, athlete in self.athletes.iteritems():
                writer.writerow([aid, athlete['name'], athlete['country'],
                                 athlete['sport'], athlete['birthdate']])
        with open('all_athletes.json', 'wb') as wf:
            simplejson.dump(self.athletes, wf)
        with open('error_athletes.json', 'wb') as wf:
            simplejson.dump(self.athlete_errors, wf)

    def write_country_csv(self):
        with open('all_countries.csv', 'wb') as wf:
            writer = unicode_csv.UnicodeWriter(wf)
            writer.writerow(['cid', 'country'])
            for cid, country in self.countries.iteritems():
                writer.writerow([cid, country])
        with open('all_countries.json', 'wb') as wf:
            simplejson.dump(self.countries, wf)

    def get_country_list(self):
        url = "http://databaseolympics.com/country/countrylist.htm"
        page = BeautifulSoup(urllib2.urlopen(url).read())
        links = page.find_all('p')[1].find_all('a')
        for link in links:
            code = link['href'].split('cty=')[1]
            self.countries[code] = link.string
        self.write_country_csv()
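
    # After get_country_list() runs, self.countries maps country codes to
    # names, e.g. {'AUS': u'Australia'} (the code shown here is illustrative),
    # and all_countries.csv/.json are written via write_country_csv().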

    def get_athlete_list(self, letters=None, error_only=False):
        base_url = "http://databaseolympics.com/players/playerlist.htm?lt=%s"
        p = ProgressBar()
        if letters is None:
            alphabet = list(string.ascii_lowercase)
        else:
            alphabet = letters
        for letter in alphabet:
            page = BeautifulSoup(urllib2.urlopen(base_url % letter).read(), from_encoding="iso-8859-1")
            links = page.find_all('p')[3].find_all('a')
            num_links = len(links)
            for i, link in enumerate(links):
                if 'player' in link['href']:
                    athlete_id = link['href'].split('ilkid=')[1]
                    try:
                        self.get_athlete_data(athlete_id, link.string)
                    except Exception:
                        # The country link follows each player link on the page.
                        self.athlete_errors.append(dict(aid=athlete_id, name=link.string,
                                                        country=links[i + 1].string))
                pos = int((float(i) / num_links) * 100)
                p.render(pos, 'step %s\nProcessing...\nDescription: Letter %s (%d athletes).'
                         % (pos, letter, num_links / 2))
            if not error_only:
                with open('athletes_%s.csv' % letter, 'wb') as wf:
                    writer = unicode_csv.UnicodeWriter(wf)
                    writer.writerow(['athlete_id', 'name', 'country', 'sport', 'birthdate'])
                    for aid, athlete in self.athletes.iteritems():
                        writer.writerow([aid, athlete['name'], athlete['country'],
                                         athlete['sport'], athlete['birthdate']])
                with open('athletes_%s.json' % letter, 'wb') as wf:
                    simplejson.dump(self.athletes, wf)
                self.athletes = {}
            with open('error_athletes_%s.json' % letter, 'wb') as wf:
                simplejson.dump(self.athlete_errors, wf)
            with open('error_athletes_%s.csv' % letter, 'wb') as wf:
                writer = unicode_csv.UnicodeWriter(wf)
                writer.writerow(['athlete_id', 'name', 'country'])
                for athlete in self.athlete_errors:
                    writer.writerow([athlete['aid'], athlete['name'], athlete['country']])
            self.athlete_errors = []
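
    # Usage sketch (not invoked in __main__ below): scrape only a subset of
    # the alphabet, writing athletes_a.csv/.json, athletes_b.csv/.json, etc.:
    # Games().get_athlete_list(letters=['a', 'b'])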

    def scrape_games_data(self, season):
        # Load the cached games links if present, otherwise scrape them.
        try:
            fp = open('%s.json' % season.lower(), 'r')
            print '%s links file exists. Loading...' % season
            games_links = simplejson.load(fp)
            fp.close()
        except IOError:
            self.get_games_list()
            print '%s links file not found. Scraping data...' % season
            games_links = self.get_games_data("%s Olympics" % season)
            fp = open('%s.json' % season.lower(), 'wb')
            simplejson.dump(games_links, fp)
            fp.close()

        # Create csv files
        print "=" * 50
        print "%s Olympics - creating CSV files" % season
        self.write_games_csvs(games_links, season.lower())


if __name__ == '__main__':
    games = Games()

    # Get Summer Olympics data
    games.scrape_games_data("Summer")

    # Get Winter Olympics data
    games.scrape_games_data("Winter")

    # Create countries csv file
    games.write_country_csv()
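
    # Athlete scraping is not run by default; a full pass could optionally add
    # the calls below (a sketch only, and slow: it fetches one page per athlete):
    # games.get_athlete_list()
    # games.write_athlete_csv()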