/scripts/olympic.py
https://gitlab.com/camilo-celis/DB_SQuirreL · Python
#!/usr/bin/env python
"""
Scrape Olympic data and store in CSV and JSON files.
Written by Joel Addison, 2012
"""
import string
import urllib2

import simplejson
from bs4 import BeautifulSoup
from progressbar import ProgressBar

import unicode_csv
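
# NOTE: this script targets Python 2 (urllib2, print statements, iteritems).
# unicode_csv is assumed to be a local helper module shipped with this repo
# that wraps csv.writer with unicode support; it is not a standard library.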


class Games(object):
    def __init__(self):
        self.athletes = {}
        self.events = {}
        self.countries = {}
        self.athlete_errors = []
        self.base_url = 'http://databaseolympics.com/games/gameslist.htm'

    def get_games_list(self):
        html_data = urllib2.urlopen(self.base_url).read()
        # Create the soup object from the HTML data
        self.soup = BeautifulSoup(html_data)

    def get_games_data(self, games_type):
        games_links = self.soup.find(text=games_type).find_next('table').find_all('a')
        details = {}
        for link in games_links:
            url = "http://databaseolympics.com/games/gamesyear.htm?g=%s" % link['href'].split('=')[1]
            page = BeautifulSoup(urllib2.urlopen(url).read(), from_encoding="iso-8859-1")
            year = link.string
            details[year] = {}
            countries = page.find(text="Country").find_next("table").find_all('a')
            details[year]['countries'] = {}
            for country in countries:
                c_url = "http://databaseolympics.com%s" % country['href']
                c_name = country.encode_contents()
                details[year]['countries'][c_name] = c_url
        return details
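
    # Illustrative shape of the dict returned above; the year, country name
    # and URL path are made-up examples (only the 'cty=' query parameter is
    # certain from this code):
    # {'2008': {'countries': {'Australia': 'http://databaseolympics.com/...?cty=AUS'}}}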

    def get_country_data(self, country_url):
        page = BeautifulSoup(urllib2.urlopen(country_url).read(), from_encoding="iso-8859-1")
        events = []
        table = page.find_all('table')[2].find_all('tr')
        for row in table[1:]:  # Skip the header row.
            cells = row.find_all('td')
            athlete_id = unicode(cells[3].find('a')['href'].split('ilkid=')[1])
            events.append(dict(
                year=cells[0].find('a').string,
                sport=cells[1].find('a').string,
                event=cells[2].find('a').string,
                athlete=athlete_id,
                result=cells[4].string or "",
                medal=cells[5].string or "",
            ))
        return events
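
    # Each dict in the list returned by get_country_data() looks like this
    # (values are illustrative, not from a live scrape):
    # {'year': u'2008', 'sport': u'Swimming', 'event': u'100m Freestyle',
    #  'athlete': u'SMITHJOH01', 'result': u'', 'medal': u'GOLD'}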

    def get_athlete_data(self, athlete, a_name=None):
        # e.g. http://databaseolympics.com/players/playerpage.htm?ilkid=GONZAMAR03
        if not self.athletes.get(athlete, None):
            aid = urllib2.quote(athlete.encode('latin-1'))
            url = "http://databaseolympics.com/players/playerpage.htm?ilkid=%s" % aid
            page = BeautifulSoup(urllib2.urlopen(url).read(), from_encoding="iso-8859-1")
            name = page.find('h1')
            bio = name.find_next('font').find_all('a')
            birthdate = ""
            country = ""
            sport = ""
            for detail in bio:
                if 'sport' in detail['href']:
                    sport = detail.string
                elif 'country' in detail['href']:
                    country = detail.string
                elif 'birthday' in detail['href']:
                    birthdate = detail.string
            if a_name is None:
                a_name = name.string
            self.athletes[athlete] = dict(
                name=a_name,
                country=country,
                sport=sport,
                birthdate=birthdate,
            )
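
    # Cached records are keyed by the ilkid athlete id, e.g. (values illustrative):
    # self.athletes['GONZAMAR03'] == {'name': u'...', 'country': u'...',
    #                                 'sport': u'...', 'birthdate': u'...'}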

    def write_games_csvs(self, games_details, season):
        for year, details in games_details.iteritems():
            print " Processing %s..." % year
            events = []
            for country, c_url in details['countries'].iteritems():
                print " country: %s" % country
                events.extend(self.get_country_data(c_url))
                cid = c_url.split('cty=')[1]
                self.countries[cid] = country
            # Write events to csv
            with open('%s_%s.csv' % (season, year), 'wb') as wf:
                writer = unicode_csv.UnicodeWriter(wf)
                writer.writerow(['year', 'sport', 'event', 'athlete', 'result', 'medal'])
                for event in events:
                    writer.writerow([event['year'], event['sport'], event['event'],
                                     event['athlete'], event['result'], event['medal']])
            # Write events to json
            with open('%s_%s.json' % (season, year), 'wb') as wf:
                simplejson.dump(events, wf)

    def write_athlete_csv(self):
        with open('all_athletes.csv', 'wb') as wf:
            writer = unicode_csv.UnicodeWriter(wf)
            writer.writerow(['athlete_id', 'name', 'country', 'sport', 'birthdate'])
            for aid, athlete in self.athletes.iteritems():
                writer.writerow([aid, athlete['name'], athlete['country'],
                                 athlete['sport'], athlete['birthdate']])
        with open('all_athletes.json', 'wb') as wf:
            simplejson.dump(self.athletes, wf)
        with open('error_athletes.json', 'wb') as wf:
            simplejson.dump(self.athlete_errors, wf)

    def write_country_csv(self):
        with open('all_countries.csv', 'wb') as wf:
            writer = unicode_csv.UnicodeWriter(wf)
            writer.writerow(['cid', 'country'])
            for cid, country in self.countries.iteritems():
                writer.writerow([cid, country])
        with open('all_countries.json', 'wb') as wf:
            simplejson.dump(self.countries, wf)

    def get_country_list(self):
        url = "http://databaseolympics.com/country/countrylist.htm"
        page = BeautifulSoup(urllib2.urlopen(url).read())
        links = page.find_all('p')[1].find_all('a')
        for link in links:
            code = link['href'].split('cty=')[1]
            self.countries[code] = link.string
        self.write_country_csv()
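
    # After get_country_list() runs, self.countries maps country codes to
    # names, e.g. {'AUS': u'Australia'} (the code shown here is illustrative),
    # and all_countries.csv/.json are written via write_country_csv().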

    def get_athlete_list(self, letters=None, error_only=False):
        base_url = "http://databaseolympics.com/players/playerlist.htm?lt=%s"
        p = ProgressBar()
        if letters is None:
            alphabet = list(string.ascii_lowercase)
        else:
            alphabet = letters
        for letter in alphabet:
            page = BeautifulSoup(urllib2.urlopen(base_url % letter).read(), from_encoding="iso-8859-1")
            links = page.find_all('p')[3].find_all('a')
            num_links = len(links)
            for i, link in enumerate(links):
                if 'player' in link['href']:
                    athlete_id = link['href'].split('ilkid=')[1]
                    try:
                        self.get_athlete_data(athlete_id, link.string)
                    except Exception:
                        # The country link follows each player link on the page.
                        self.athlete_errors.append(dict(aid=athlete_id, name=link.string,
                                                        country=links[i + 1].string))
                pos = int((float(i) / num_links) * 100)
                p.render(pos, 'step %s\nProcessing...\nDescription: Letter %s (%d athletes).'
                         % (pos, letter, num_links / 2))
            if not error_only:
                with open('athletes_%s.csv' % letter, 'wb') as wf:
                    writer = unicode_csv.UnicodeWriter(wf)
                    writer.writerow(['athlete_id', 'name', 'country', 'sport', 'birthdate'])
                    for aid, athlete in self.athletes.iteritems():
                        writer.writerow([aid, athlete['name'], athlete['country'],
                                         athlete['sport'], athlete['birthdate']])
                with open('athletes_%s.json' % letter, 'wb') as wf:
                    simplejson.dump(self.athletes, wf)
                self.athletes = {}
            with open('error_athletes_%s.json' % letter, 'wb') as wf:
                simplejson.dump(self.athlete_errors, wf)
            with open('error_athletes_%s.csv' % letter, 'wb') as wf:
                writer = unicode_csv.UnicodeWriter(wf)
                writer.writerow(['athlete_id', 'name', 'country'])
                for athlete in self.athlete_errors:
                    writer.writerow([athlete['aid'], athlete['name'], athlete['country']])
            self.athlete_errors = []
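
    # Usage sketch (not invoked in __main__ below): scrape only a subset of
    # the alphabet, writing athletes_a.csv/.json, athletes_b.csv/.json, etc.:
    # Games().get_athlete_list(letters=['a', 'b'])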

    def scrape_games_data(self, season):
        # Load the cached games links if present, otherwise scrape them.
        try:
            fp = open('%s.json' % season.lower(), 'r')
            print '%s links file exists. Loading...' % season
            games_links = simplejson.load(fp)
            fp.close()
        except IOError:
            self.get_games_list()
            print '%s links file not found. Scraping data...' % season
            games_links = self.get_games_data("%s Olympics" % season)
            fp = open('%s.json' % season.lower(), 'wb')
            simplejson.dump(games_links, fp)
            fp.close()

        # Create csv files
        print "=" * 50
        print "%s Olympics - creating CSV files" % season
        self.write_games_csvs(games_links, season.lower())


if __name__ == '__main__':
    games = Games()

    # Get Summer Olympics data
    games.scrape_games_data("Summer")

    # Get Winter Olympics data
    games.scrape_games_data("Winter")

    # Create countries csv file
    games.write_country_csv()
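
    # Athlete scraping is not run by default; a full pass could optionally add
    # the calls below (a sketch only, and slow: it fetches one page per athlete):
    # games.get_athlete_list()
    # games.write_athlete_csv()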