
/scripts/olympic.py

https://gitlab.com/camilo-celis/DB_SQuirreL
#!/usr/bin/env python
"""
Scrape Olympic data and store in a CSV file.
Written by Joel Addison, 2012
"""
import string
import urllib2

import simplejson
from bs4 import BeautifulSoup

import unicode_csv
from progressbar import ProgressBar
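
# NOTE: unicode_csv and progressbar are assumed to be local helper modules
# shipped alongside this script rather than PyPI packages: unicode_csv is
# expected to provide a UnicodeWriter whose writerow() accepts unicode cell
# values and writes UTF-8 output (as in the UnicodeWriter recipe from the
# Python 2 csv docs), and progressbar.ProgressBar is expected to expose a
# render(percent, message) method, as used below.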


class Games(object):
    def __init__(self):
        self.athletes = {}
        self.events = {}
        self.countries = {}
        self.athlete_errors = []
        self.base_url = 'http://databaseolympics.com/games/gameslist.htm'

    def get_games_list(self):
        html_data = urllib2.urlopen(self.base_url).read()
        # Create the soup object from the HTML data
        self.soup = BeautifulSoup(html_data)

    def get_games_data(self, games_type):
        games_links = self.soup.find(text=games_type).find_next('table').find_all('a')
        details = {}
        for link in games_links:
            url = "http://databaseolympics.com/games/gamesyear.htm?g=%s" % link['href'].split('=')[1]
            page = BeautifulSoup(urllib2.urlopen(url).read(), from_encoding="iso-8859-1")
            year = link.string
            details[year] = {}
            countries = page.find(text="Country").find_next("table").find_all('a')
            details[year]['countries'] = {}
            for country in countries:
                c_url = "http://databaseolympics.com%s" % country['href']
                c_name = country.encode_contents()
                details[year]['countries'][c_name] = c_url
        return details
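
    # Illustrative return shape (year, country and URL are hypothetical):
    #   {u'2008': {'countries': {'Australia': 'http://databaseolympics.com/...'}}}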

    def get_country_data(self, country_url):
        page = BeautifulSoup(urllib2.urlopen(country_url).read(), from_encoding="iso-8859-1")
        events = []
        rows = page.find_all('table')[2].find_all('tr')
        for row in rows[1:]:  # skip the header row
            cells = row.find_all('td')
            athlete_id = unicode(cells[3].find('a')['href'].split('ilkid=')[1])
            events.append(dict(
                year=cells[0].find('a').string,
                sport=cells[1].find('a').string,
                event=cells[2].find('a').string,
                athlete=athlete_id,
                result=cells[4].string or "",
                medal=cells[5].string or "",
            ))
        return events
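
    # Each event dict looks like (hypothetical values):
    #   {'year': u'2008', 'sport': u'Swimming', 'event': u'100m Freestyle',
    #    'athlete': u'SMITHJOH01', 'result': u'47.21', 'medal': u'GOLD'}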

    def get_athlete_data(self, athlete, a_name=None):
        # Example player page:
        # http://databaseolympics.com/players/playerpage.htm?ilkid=GONZAMAR03
        if athlete not in self.athletes:
            aid = urllib2.quote(athlete.encode('latin-1'))
            url = "http://databaseolympics.com/players/playerpage.htm?ilkid=%s" % aid
            page = BeautifulSoup(urllib2.urlopen(url).read(), from_encoding="iso-8859-1")
            name = page.find('h1')
            bio = name.find_next('font').find_all('a')
            birthdate = ""
            country = ""
            sport = ""
            for detail in bio:
                if 'sport' in detail['href']:
                    sport = detail.string
                elif 'country' in detail['href']:
                    country = detail.string
                elif 'birthday' in detail['href']:
                    birthdate = detail.string
            if a_name is None:
                a_name = name.string
            self.athletes[athlete] = dict(
                name=a_name,
                country=country,
                sport=sport,
                birthdate=birthdate,
            )
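
    # Resulting cache entry, keyed by athlete id (hypothetical values):
    #   {'name': u'John Smith', 'country': u'Australia',
    #    'sport': u'Swimming', 'birthdate': u'January 1, 1980'}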

    def write_games_csvs(self, games_links, season):
        for year, details in games_links.iteritems():
            print " Processing %s..." % year
            events = []
            for country, c_url in details['countries'].iteritems():
                print " country: %s" % country
                events.extend(self.get_country_data(c_url))
                cid = c_url.split('cty=')[1]
                self.countries[cid] = country
            # Write events to csv
            with open('%s_%s.csv' % (season, year), 'wb') as wf:
                writer = unicode_csv.UnicodeWriter(wf)
                writer.writerow(['year', 'sport', 'event', 'athlete', 'result', 'medal'])
                for event in events:
                    writer.writerow([event['year'], event['sport'], event['event'],
                                     event['athlete'], event['result'], event['medal']])
            # Write events to json
            with open('%s_%s.json' % (season, year), 'wb') as wf:
                simplejson.dump(events, wf)
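
    # For season='summer' and year='2008' (hypothetical) this produces
    # summer_2008.csv with the header row above plus one row per event,
    # and summer_2008.json holding the same event dicts.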

    def write_athlete_csv(self):
        with open('all_athletes.csv', 'wb') as wf:
            writer = unicode_csv.UnicodeWriter(wf)
            writer.writerow(['athlete_id', 'name', 'country', 'sport', 'birthdate'])
            for aid, athlete in self.athletes.iteritems():
                writer.writerow([aid, athlete['name'], athlete['country'],
                                 athlete['sport'], athlete['birthdate']])
        with open('all_athletes.json', 'wb') as wf:
            simplejson.dump(self.athletes, wf)
        with open('error_athletes.json', 'wb') as wf:
            simplejson.dump(self.athlete_errors, wf)

    def write_country_csv(self):
        with open('all_countries.csv', 'wb') as wf:
            writer = unicode_csv.UnicodeWriter(wf)
            writer.writerow(['cid', 'country'])
            for cid, country in self.countries.iteritems():
                writer.writerow([cid, country])
        with open('all_countries.json', 'wb') as wf:
            simplejson.dump(self.countries, wf)

    def get_country_list(self):
        url = "http://databaseolympics.com/country/countrylist.htm"
        page = BeautifulSoup(urllib2.urlopen(url).read())
        links = page.find_all('p')[1].find_all('a')
        for link in links:
            code = link['href'].split('cty=')[1]
            self.countries[code] = link.string
        self.write_country_csv()

    def get_athlete_list(self, letters=None, error_only=False):
        # %s is a single lowercase letter, e.g. 'a'
        base_url = "http://databaseolympics.com/players/playerlist.htm?lt=%s"
        p = ProgressBar()
        if letters is None:
            alphabet = list(string.ascii_lowercase)
        else:
            alphabet = letters
        for letter in alphabet:
            page = BeautifulSoup(urllib2.urlopen(base_url % letter).read(),
                                 from_encoding="iso-8859-1")
            links = page.find_all('p')[3].find_all('a')
            num_links = len(links)
            for i, link in enumerate(links):
                if 'player' in link['href']:
                    athlete_id = link['href'].split('ilkid=')[1]
                    try:
                        self.get_athlete_data(athlete_id, link.string)
                    except Exception:
                        self.athlete_errors.append(dict(aid=athlete_id, name=link.string,
                                                        country=links[i + 1].string))
                pos = int((float(i) / num_links) * 100)
                p.render(pos, 'step %s\nProcessing...\nDescription: Letter %s (%d athletes).'
                         % (pos, letter, num_links / 2))
            if not error_only:
                with open('athletes_%s.csv' % letter, 'wb') as wf:
                    writer = unicode_csv.UnicodeWriter(wf)
                    writer.writerow(['athlete_id', 'name', 'country', 'sport', 'birthdate'])
                    for aid, athlete in self.athletes.iteritems():
                        writer.writerow([aid, athlete['name'], athlete['country'],
                                         athlete['sport'], athlete['birthdate']])
                with open('athletes_%s.json' % letter, 'wb') as wf:
                    simplejson.dump(self.athletes, wf)
                self.athletes = {}
            with open('error_athletes_%s.json' % letter, 'wb') as wf:
                simplejson.dump(self.athlete_errors, wf)
            with open('error_athletes_%s.csv' % letter, 'wb') as wf:
                writer = unicode_csv.UnicodeWriter(wf)
                writer.writerow(['athlete_id', 'name', 'country'])
                for athlete in self.athlete_errors:
                    writer.writerow([athlete['aid'], athlete['name'], athlete['country']])
            self.athlete_errors = []
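
    # For each letter this writes athletes_<letter>.csv/.json and
    # error_athletes_<letter>.csv/.json, resetting the in-memory caches so
    # that memory use stays bounded across the alphabet.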

    def scrape_games_data(self, season):
        # Load the cached games links if present, otherwise scrape and cache them
        try:
            with open('%s.json' % season.lower(), 'r') as fp:
                print '%s links file exists. Loading...' % season
                games_links = simplejson.load(fp)
        except IOError:
            print '%s links file not found. Scraping data...' % season
            self.get_games_list()
            games_links = self.get_games_data("%s Olympics" % season)
            with open('%s.json' % season.lower(), 'wb') as fp:
                simplejson.dump(games_links, fp)
        # Create csv files
        print "=" * 50
        print "%s Olympics - creating CSV files" % season
        self.write_games_csvs(games_links, season.lower())


if __name__ == '__main__':
    games = Games()
    # Get Summer Olympics data
    games.scrape_games_data("Summer")
    # Get Winter Olympics data
    games.scrape_games_data("Winter")
    # Create countries csv file
    games.write_country_csv()
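
    # Running `python olympic.py` is expected to produce, per games year,
    # summer_<year>.csv/.json and winter_<year>.csv/.json, plus
    # all_countries.csv/.json; get_athlete_list() is available but is not
    # invoked here.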