PageRenderTime 45ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/libscrape/extract/shotchart_cbssports.py

https://github.com/kpascual/nbascrape
Python | 127 lines | 69 code | 38 blank | 20 comment | 13 complexity | 46e5663d11a6c4ace79818945de965e9 MD5 | raw file
  1. import urllib2
  2. import re
  3. import csv
  4. import datetime
  5. import sys
  6. import logging
  7. from BeautifulSoup import BeautifulSoup
  8. from libscrape.config import constants
  9. LOGDIR_EXTRACT = constants.LOGDIR_EXTRACT
  10. class ShotExtract:
  11. def __init__(self, html, filename, gamedata):
  12. self.html = html
  13. self.gamedata = gamedata
  14. self.game_name = self.gamedata['abbrev']
  15. self.away_team = self.gamedata['away_team_id']
  16. self.home_team = self.gamedata['home_team_id']
  17. self.filename = filename
  18. def extractAndDump(self):
  19. home_players = self.getHomePlayers()
  20. away_players = self.getAwayPlayers()
  21. shots = self.getShotData()
  22. self._dumpShots(shots)
  23. self._dumpPlayers(home_players + away_players)
  24. logging.info("EXTRACT - shotchart_cbssports - game_id: %s - shots extracted: %s" % (self.gamedata['id'], len(shots)))
  25. def getHomePlayers(self):
  26. pattern = re.compile(".*var\s+playerDataHomeString\s+\=\s+new\s+String\(\"(?P<info>.+)\"\).+")
  27. match = pattern.search(self.html)
  28. if match:
  29. matched = [[self.home_team, player.split(':')[0]] + player.split(':')[1].split(',') for player in match.group('info').split('|')]
  30. return matched
  31. return []
  32. def getAwayPlayers(self):
  33. pattern = re.compile(".*var\s+playerDataAwayString\s+\=\s+new\s+String\(\"(?P<info>.+)\"\).+")
  34. match = pattern.search(self.html)
  35. if match:
  36. matched = [[self.away_team, player.split(':')[0]] + player.split(':')[1].split(',') for player in match.group('info').split('|')]
  37. return matched
  38. return []
  39. def getShotData(self):
  40. pattern = re.compile(".*var\s+currentShotData\s+\=\s+new\s+String\(\"(?P<info>.+)\"\).+")
  41. list_shotdata = []
  42. matches = pattern.search(self.html)
  43. if matches:
  44. shotdata = matches.group('info')
  45. list_shotdata = [[i] + itm.split(',') for i,itm in enumerate(shotdata.split('~'))]
  46. else:
  47. pass
  48. return list_shotdata
  49. def _dumpShots(self, data):
  50. writer = csv.writer(open(LOGDIR_EXTRACT + self.filename + '_shots','wb'),delimiter=',',lineterminator='\n')
  51. writer.writerows(data)
  52. def _dumpPlayers(self, data):
  53. writer = csv.writer(open(LOGDIR_EXTRACT + self.filename + '_players','wb'),delimiter=',',lineterminator='\n')
  54. writer.writerows(data)
  55. def assertCourtDimensions(self):
  56. x = 0
  57. y = 0
  58. matchx = re.search(".*sp\.x\s+=\s+(?P<x>\d+).*",self.html)
  59. matchy = re.search(".*sp\.y\s+=\s+(?P<y>\d+).*",self.html)
  60. if matchx:
  61. x = int(matchx.group('x'))
  62. if matchy:
  63. y = int(matchy.group('y'))
  64. return (x, y)
  65. def assertShotDefinitions(self):
  66. match = re.findall(".*sp\.shotTypeArray\[(?P<index>\d+)\]\s+=\s+\"(?P<name>[0-9a-zA-Z\s]+)\";.*",self.html)
  67. if match:
  68. return match
  69. return []
# Shot data: index of the values in each shot row. Shots are separated by "~"
# and their fields by ","; index 0 is the running shot number that
# getShotData prepends to every row.
#   1: possession (0 == away team, 1 == home team)
#   2: miltime
#   3: current period
#   4: player_id
#   5: shot type (a map of the types is found in the page's JavaScript)
#   6: shot result
#   7: x coordinate
#   8: y coordinate
#   9: distance
# Player data: index of values
#   1: player name ("&nbsp;" should be replaced with a space " ")
#   2: jersey number
#   3: position
#   4: FG data
#   5: 3pt data
#   6: free throw data
#   7: total points
#   8: totals
# Dimensions of court: 300 wide, 282 long; ratio? 6