PageRenderTime 60ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 0ms

/external_data.py

https://bitbucket.org/aabtzu/mlkaggle
Python | 267 lines | 264 code | 3 blank | 0 comment | 0 complexity | c6a6ec1108cae5e289cdf7a81016bf7d MD5 | raw file
  1. __author__ = 'amit'
  2. import pandas
  3. import bs4
  4. import requests
  5. import urllib3
  6. import os
  7. import datetime
  8. import argparse
  9. CURRENT_SEASON = 2020
  10. SPREADS_URL = 'http://www.footballlocks.com/nfl_point_spreads.shtml'
  11. SCORES_URL = 'http://www.pro-football-reference.com/years/%d/games.htm' % CURRENT_SEASON
  12. CODE_DIR = "".join([os.environ['MLNFL_ROOT'], os.path.sep])
  13. path_to_lines = CODE_DIR + "data/lines/"
  14. lines_file = path_to_lines + "nflAllLines.csv"
  15. def read_lines():
  16. """
  17. read in the master lines file
  18. :return:
  19. """
  20. df_lines = pandas.read_csv(lines_file)
  21. df_lines.Date = pandas.to_datetime(df_lines.Date).dt.date
  22. return df_lines
  23. def save_lines(df_lines):
  24. """
  25. Synopsis: save the master lines file
  26. :param df_lines: pandas DataFrame of all lines and scores
  27. :return: None
  28. """
  29. df_lines.to_csv(lines_file, index=False)
  30. def scrape_spreads(season=CURRENT_SEASON):
  31. """
  32. Scrape spreads from site
  33. :return:
  34. """
  35. r = requests.get(SPREADS_URL)
  36. data = r.text
  37. soup = bs4.BeautifulSoup(data, "lxml")
  38. # get the tables w/ the spreads
  39. tt = soup.findAll("table", {"width": "580"})
  40. df_spreads = pandas.DataFrame()
  41. for i in range(2): # hard coded table number most of the time
  42. #for i in range(1): # hard coded table number for last week of season
  43. dfs = pandas.read_html(str(tt[i]), )
  44. df_spreads = df_spreads.append(dfs[0])
  45. df_spreads.index = range(len(df_spreads))
  46. df_spreads.columns = ['date', 'favorite', 'spread', 'underdog']
  47. filter_bad = df_spreads.favorite == 'Favorite'
  48. df_spreads = df_spreads[~filter_bad]
  49. # get the home favorite
  50. df_spreads['home_favorite'] = (df_spreads.favorite.str.contains('^At ')) | (df_spreads.favorite.str.contains('At '))
  51. # fix any spreads that are tied (PK)
  52. df_spreads.loc[df_spreads.spread.astype(str).str.contains('Off'), 'spread'] = -.1 # need -1 for some reason
  53. df_spreads.loc[df_spreads.spread.astype(str).str.contains('PK'), 'spread'] = -.1 # need -1 for some reason
  54. # flip sign on spread for away favorite
  55. df_spreads['factor'] = 1
  56. df_spreads.loc[df_spreads.home_favorite == True,'factor'] = -1
  57. df_spreads['spreads2'] = df_spreads.spread.astype(float) * df_spreads.factor
  58. # get the home team
  59. df_spreads['home_team'] = df_spreads.favorite
  60. home_filter = df_spreads.underdog.str.contains('^At ') | df_spreads.underdog.str.contains('(At|at) ')
  61. df_spreads.loc[home_filter, 'home_team'] = df_spreads.loc[home_filter, 'underdog']
  62. df_spreads.home_team = df_spreads.home_team.str.replace('BUffalo', 'Buffalo')
  63. df_spreads.home_team = df_spreads.home_team.str.replace('Francsico', 'Francisco')
  64. df_spreads.home_team = df_spreads.home_team.str.replace('^At ', '')
  65. df_spreads.home_team = df_spreads.home_team.str.replace('\(At .*\)', '')
  66. #df_spreads.home_team = df_spreads.home_team.str.replace('.At .*?$', '')
  67. df_spreads.home_team = df_spreads.home_team.str.replace('\(.*\)', '')
  68. df_spreads['datetime'] = pandas.to_datetime(str(season)+'/'+df_spreads.date.str.split(" ", expand=True)[0],
  69. format='%Y/%m/%d').dt.date
  70. print(df_spreads.dropna())
  71. return df_spreads.dropna()
  72. def merge_spreads(df_spreads, df_lines, current_week = None):
  73. """
  74. Merge Spreads into lines dataframe
  75. :param df_spreads:
  76. :param df_lines:
  77. :return:
  78. """
  79. # find the right week/game and update the spread
  80. max_date = (df_spreads.datetime.max() + pandas.DateOffset(days=1)).date()
  81. week_filter = (df_lines.Date <= max_date) & (df_lines.Date >= df_spreads.datetime.min())
  82. if current_week is not None:
  83. week_filter = (df_lines.season == CURRENT_SEASON) & (df_lines.week == current_week)
  84. for ii, rr in df_spreads.iterrows():
  85. print (ii, rr['home_team'], rr['spreads2'])
  86. if ('NY' in rr['home_team']) | ('LA' in rr['home_team']):
  87. rr['home_team'] = rr['home_team'].split(' ')[1]
  88. game_filter = df_lines[week_filter]['Home Team'].str.contains(rr['home_team'])
  89. irow = df_lines[week_filter][game_filter].index[0]
  90. df_lines.loc[irow, 'Line'] = rr['spreads2']
  91. return df_lines
  92. def scrape_scores(week, season=CURRENT_SEASON):
  93. """
  94. Scrape scores
  95. :param week:
  96. :param season:
  97. :return:
  98. """
  99. scores_url = SCORES_URL
  100. r = requests.get(scores_url)
  101. data = r.text
  102. soup = bs4.BeautifulSoup(data, 'lxml')
  103. # get the tables w/ the spreads
  104. tt = soup.findAll("table")
  105. df_scores = pandas.read_html(str(tt[0]), )[0]
  106. # Make sure to enter the correct week number
  107. week_filter = df_scores.Week == str(week)
  108. df_week = df_scores[week_filter]
  109. # rename home game col
  110. old_home_col = df_week.columns[5]
  111. home_col = 'home_game'
  112. df_week.rename(columns={old_home_col: home_col}, inplace=True)
  113. winner_col = 'Winner/tie'
  114. loser_col = 'Loser/tie'
  115. winner_pts_col = 'PtsW'
  116. loser_pts_col = 'PtsL'
  117. # means that winner was away team, loser was home team
  118. away_filter = df_week[home_col] == '@'
  119. # populate these new cols
  120. new_cols = ['home_pts', 'away_pts', 'home_team', 'away_team']
  121. for cc in new_cols:
  122. df_week[cc] = None
  123. df_week.loc[~away_filter, 'home_team'] = df_week.loc[~away_filter, winner_col]
  124. df_week.loc[away_filter, 'home_team'] = df_week.loc[away_filter, loser_col]
  125. df_week.loc[~away_filter, 'away_team'] = df_week.loc[~away_filter, loser_col]
  126. df_week.loc[away_filter, 'away_team'] = df_week.loc[away_filter, winner_col]
  127. df_week.loc[~away_filter, 'home_pts'] = df_week.loc[~away_filter, winner_pts_col]
  128. df_week.loc[away_filter, 'home_pts'] = df_week.loc[away_filter, loser_pts_col]
  129. df_week.loc[~away_filter, 'away_pts'] = df_week.loc[~away_filter, loser_pts_col]
  130. df_week.loc[away_filter, 'away_pts'] = df_week.loc[away_filter, winner_pts_col]
  131. return df_week
  132. def merge_scores(df_week, week, season, df_lines):
  133. """
  134. Merge Scores into file
  135. :param df_week:
  136. :param week:
  137. :param season:
  138. :param df_lines:
  139. :return:
  140. """
  141. # find the right week/game and update the score
  142. week_filter = (df_lines.season == season) & (df_lines.week == week)
  143. for ii, rr in df_week.iterrows():
  144. print (ii, rr['home_team'], rr['home_pts'])
  145. game_filter = df_lines[week_filter]['Home Team'].str.contains(rr['home_team'])
  146. irow = df_lines[week_filter][game_filter].index[0]
  147. print (df_lines.iloc[irow]['Home Team'])
  148. df_lines.loc[irow, 'Home Score'] = rr['home_pts']
  149. df_lines.loc[irow, 'Visitor Score'] = rr['away_pts']
  150. return df_lines
  151. def get_current_week(df_lines, current_season=CURRENT_SEASON):
  152. """
  153. Get current week lines
  154. :param df_lines:
  155. :param current_season:
  156. :return:
  157. """
  158. today = datetime.datetime.today().date()
  159. date_filter = (df_lines.Date > today) & (df_lines.season == current_season)
  160. current_week = df_lines[date_filter].week.min()
  161. return int(current_week)
  162. def verify_data(df_data, data_type):
  163. """
  164. look at the data
  165. :param df_data:
  166. :param data_type:
  167. :return:
  168. """
  169. print ("verifying %s data:" % data_type)
  170. print (df_data)
  171. ans = input("accept (y/n): ")
  172. if ans.lower() == 'y':
  173. return True
  174. return False
  175. if __name__ == "__main__":
  176. # read lines file and get current week
  177. df_lines = read_lines()
  178. season = CURRENT_SEASON
  179. try:
  180. current_week = get_current_week(df_lines, season)
  181. except:
  182. current_week = 0 # need to specify a week for playoffs
  183. # define input args
  184. parser = argparse.ArgumentParser()
  185. parser.add_argument('--week', '-w', action='store', default=current_week, dest='game_week',
  186. type=int, help='Pass the week number to make the picks')
  187. parser.add_argument('--scores', action='store_true', dest='scores')
  188. parser.add_argument('--spreads', action='store_true', dest='spreads')
  189. args = parser.parse_args()
  190. # get and save scores
  191. if args.scores:
  192. week = args.game_week
  193. print ("getting scores of week %d of %d season ..." % (week, season))
  194. df_week = scrape_scores(week, season)
  195. df_lines = merge_scores(df_week, week, season, df_lines)
  196. if verify_data(df_week, 'scores'):
  197. save_lines(df_lines)
  198. if args.spreads:
  199. # get and save spreads
  200. print ("getting most recent spreads ...")
  201. df_spreads = scrape_spreads()
  202. df_lines = merge_spreads(df_spreads, df_lines, current_week)
  203. if verify_data(df_spreads, 'spreads'):
  204. save_lines(df_lines)