PageRenderTime 28ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/yahoo_sports/base.rb

https://github.com/chetan/yahoo_sports
Ruby | 435 lines | 345 code | 37 blank | 53 comment | 24 complexity | 0074c34dedd720a1feae2c68dbfd35e8 MD5 | raw file
  1. require 'rubygems'
  2. require 'tzinfo'
  3. if RUBY_PLATFORM =~ /darwin/ then
  4. # fix for scrapi on Mac OS X
  5. require "tidy"
  6. Tidy.path = "/usr/lib/libtidy.dylib"
  7. end
  8. require 'net/http'
  9. require 'scrapi'
  10. require 'ostruct'
  11. require 'htmlentities'
  12. module YahooSports
  # Fetches the given URL and returns the body.
  #
  # The URL is percent-escaped before it is parsed, so it may contain spaces
  # or other characters that are not legal in a URI.
  #
  # @param [String] url URL to fetch
  # @return [String] contents of response body
  def self.fetchurl(url)
    # URI.escape was removed in Ruby 3.0. The default parser provides the
    # same escaping; URI.escape is kept only as a fallback for interpreters
    # that do not define URI::DEFAULT_PARSER.
    escaped = if defined?(URI::DEFAULT_PARSER)
      URI::DEFAULT_PARSER.escape(url)
    else
      URI.escape(url)
    end
    Net::HTTP.get_response(URI.parse(escaped)).body
  end
  # Strip HTML tags from the given string and decode HTML entities.
  #
  # Entities are decoded in a single pass, so text that was escaped twice
  # (e.g. "&amp;lt;") is unescaped exactly once. The codes below are replaced
  # directly; every other entity is handed to HTMLEntities:
  #
  # * &nbsp;  (becomes a regular space)
  # * &amp;
  # * &quot;
  # * &lt;
  # * &gt;
  # * &ellip; (becomes "...")
  # * &apos;
  #
  # A literal "<br/>" that appears only after decoding is removed as well.
  #
  # @param [String] html text to be filtered (nil is treated as "")
  # @return [String] original string with HTML tags filtered out and entities replaced
  def self.strip_tags(html)
    replacements = {
      'nbsp'  => ' ',
      'amp'   => '&',
      'quot'  => '"',
      'lt'    => '<',
      'gt'    => '>',
      'ellip' => '...',
      'apos'  => "'"
    }
    decoder = HTMLEntities.new

    # [^>]+ (unlike .+?) also matches tags that span a line break.
    text = html.to_s.gsub(/<[^>]+>/, '')

    # One pass over every "&name;" / "&#nn;" token: the result of one
    # substitution is never scanned again, which is what prevents double
    # unescaping.
    decoded = text.gsub(/&(#?\w+);/) { |entity|
      replacements.fetch($1) { decoder.decode(entity) }
    }

    decoded.gsub(/<br *\/>/, '').strip
  end
  48. class Base
  # Get the scoreboard games for the given sport. Includes recently completed,
  # live and upcoming games.
  #
  # Source: http://sports.yahoo.com/<sport>
  #
  # Game struct has the following keys:
  #   game.date     # date of game; includes time if preview
  #   game.team1    # visiting team
  #   game.team2    # home team
  #   game.score1   # team1's score, if live or final
  #   game.score2   # team2's score, if live or final
  #   game.state    # live, final or preview
  #   game.tv       # TV station showing the game, if preview and available
  #
  # Example:
  #   #<OpenStruct state="final", score1="34", date=Thu Nov 26 00:00:00 -0500 2009, score2="12", team1="Green Bay", team2="Detroit">
  #
  # Both arguments are matched case-insensitively and are never modified.
  #
  # @param [String] sport sport to list, can be one of ["mlb", "nba", "nfl", "nhl"]
  # @param [String] state Optionally filter for the given state ("live", "final", or "preview")
  # @return [Array<OpenStruct>] list of games
  # @raise [RuntimeError] if sport or state is not an accepted value, or if
  #   the scoreboard page could not be fetched
  def self.get_homepage_games(sport, state = "")
    # Lowercase copies: downcase! would modify the caller's string and raise
    # on a frozen one.
    sport = sport.to_s.downcase
    state = state.to_s.downcase

    # \A and \z anchor the whole string; ^ and $ would accept "nfl\nanything".
    if sport !~ /\A(nba|nhl|nfl|mlb)\z/
      raise sprintf("Invalid param for 'sport' = '%s'", sport)
    end
    if !state.empty? && state !~ /\A(live|final|preview)\z/
      raise sprintf("Invalid param for 'state' = '%s'", state)
    end

    html = YahooSports.fetchurl("http://sports.yahoo.com/#{sport}/proxy/html/scorethin")
    raise 'Error fetching url' if not html

    sports_game = Scraper.define do
      array :teams
      array :scores

      process "li.odd, li.even, li.live", :date_src => "@id"
      process "li.odd, li.even, li.live", :class_src => "@class"
      process "li.link-box", :extra_src => :text
      process "td.team>a", :teams => :text
      process "td.score", :scores => :text
      process "li.status>a", :status => :text

      result :date_src, :teams, :scores, :status, :class_src, :extra_src
    end

    sports = Scraper.define do
      array :games
      process "ul.game-list>li", :games => sports_game
      result :games
    end

    games = []
    games_temp = sports.scrape(html)
    return games if games_temp.nil?

    games_temp.each do |g|
      # An item with no class or id yields neither a state nor a date; skip
      # it instead of aborting the whole listing with a NoMethodError.
      next if g.class_src.nil? || g.date_src.nil?

      teams  = g.teams  || []
      scores = g.scores || []

      gm = OpenStruct.new
      gm.team1  = teams[0].strip  if teams[0]
      gm.team2  = teams[1].strip  if teams[1]
      gm.score1 = scores[0].strip if scores[0]
      gm.score2 = scores[1].strip if scores[1]

      # The state is whatever follows the first space of the class attribute,
      # or the whole attribute when it contains no space.
      head, space, rest = g.class_src.partition(' ')
      gm.state = (space.empty? ? head : rest).strip

      gm.tv = $1 if g.extra_src =~ /TV: (.*)/

      # Only previews append the status text (presumably the start time) to
      # the date before parsing.
      status = g.status ? g.status.strip : nil
      time_str = (gm.state == "preview" ? " #{status}" : "")

      if sport == 'mlb'
        # MLB ids: the four characters after a two-character prefix are
        # appended to the current year.
        gm.date = Time.parse(Time.new.strftime('%Y') + g.date_src[2, 4] + time_str)
      else
        gm.date = Time.parse(g.date_src[0, 8] + time_str)
      end

      games << gm if state.empty? || state == gm.state
    end

    games
  end
  # Retrieves team information for the team in the given sport
  #
  # Source: http://sports.yahoo.com/<sport>/teams/<team>
  #
  # Team struct has the following keys:
  #   team.name       # full team name
  #   team.standing   # current standing
  #   team.position   # position in the conference
  #   team.last5      # previous games results
  #   team.next5      # upcoming scheduled games
  #   team.live       # struct describing in-progress game, if available
  #
  # Games in the last5 and next5 lists have the following keys:
  #   game.date       # date of game
  #   game.team       # full team name
  #   game.status     # score for completed games (e.g. "L 20 - 23") or "preview"
  #   game.away       # boolean value indicating an away game
  #
  # Both arguments are matched case-insensitively and are never modified.
  #
  # @param [String] sport sport to list, can be one of ["mlb", "nba", "nfl", "nhl"]
  # @param [String] str 3-letter team code or partial team name
  # @return [OpenStruct] team info
  # @raise [RuntimeError] if sport is not an accepted value or no page is
  #   found for the team
  def self.get_team_stats(sport, str)
    # Lowercase copies: downcase! would modify the caller's string and raise
    # on a frozen one.
    sport = sport.to_s.downcase

    # \A and \z anchor the whole string; ^ and $ would accept "nfl\nanything".
    if sport !~ /\A(nba|nhl|nfl|mlb)\z/
      raise sprintf("Invalid param for 'sport' = '%s'", sport)
    end

    str = str.to_s.downcase

    # find_team_page returns nil or [code, html]; only the html is used here.
    _code, html = find_team_page(sport, str)
    raise sprintf("Can't find team '%s'", str) if html.nil?

    info = get_team_info(html)
    last5, next5 = get_scores_and_schedule(html)
    live_game = get_live_game(info.name, html)

    OpenStruct.new({:name     => info.name,
                    :standing => info.standing,
                    :position => info.position,
                    :last5    => last5,
                    :next5    => next5,
                    :live     => live_game})
  end
  168. private
  # Scrapes the team header of a team page.
  #
  # @param [String] html contents of the team page
  # @return [OpenStruct] struct with name, standing (commas removed) and
  #   position; an empty struct if the scraper returns nothing
  def self.get_team_info(html)
    info_scraper = Scraper.define do
      process "div#team-header div.info h1", :name => :text
      process "div#team-header div.info div.stats li.score", :standing => :text
      process "div#team-header div.info div.stats li.position", :position => :text

      result :name, :standing, :position
    end

    info = OpenStruct.new
    info_temp = info_scraper.scrape(html)
    return info if info_temp.nil?

    info.name = info_temp.name
    # The standing may be missing from the scrape result; calling gsub! on
    # nil used to raise NoMethodError and lose the name and position too.
    info.standing = info_temp.standing.gsub(/,/, '') if info_temp.standing
    info.position = info_temp.position
    info
  end
  # Scrapes the schedule table of a team page into completed and upcoming
  # games.
  #
  # Each game is an OpenStruct with date, team, status and away. A bye week
  # is an OpenStruct whose only field is bye = true, placed immediately
  # before the game that follows the bye row.
  #
  # @param [String] html contents of the team page
  # @return [Array(Array<OpenStruct>, Array<OpenStruct>)] completed games
  #   first, then upcoming games
  def self.get_scores_and_schedule(html)
    last5 = []
    next5 = []

    games_scraper = Scraper.define do
      array :games
      array :teams

      process "div#team-schedule-list div.bd table tbody tr", :games => :text
      process "div#team-schedule-list div.bd table tbody tr td.title span", :teams => :text

      result :games, :teams
    end

    games_temp = games_scraper.scrape(html)
    return [last5, next5] if games_temp.nil?

    rows  = games_temp.games || []
    teams = games_temp.teams || []

    # Bye rows have no entry in teams, so every bye seen so far shifts the
    # team index back by one. A counter (rather than a flag) keeps the index
    # right when a schedule contains more than one bye.
    byes_seen = 0
    # Set by a bye row, cleared once its marker has been placed in whichever
    # list the next game belongs to.
    bye_pending = false

    rows.each_with_index do |row, i|
      # Lines 1..3 of the row text: date, opponent line, result or start time.
      info = row.split("\n").slice(1, 3)

      if info[0] == "Bye"
        byes_seen += 1
        bye_pending = true
        next
      end

      team = teams[i - byes_seen].strip
      record = (info[1] =~ /(\([\d-]+\))/ ? $1 : nil)
      status = info[2]

      # Anything that does not start with W or L is treated as a start time.
      preview = (status !~ /^(W|L)/)
      date_str = (preview ? "#{info[0]} #{status}" : info[0])

      gm = OpenStruct.new
      gm.date = Time.parse(date_str)
      gm.team = "#{team} #{record}".strip
      gm.status = (preview ? "preview" : status)
      gm.away = (info[1] =~ / at / ? true : false)

      target = (preview ? next5 : last5)
      if bye_pending
        target << OpenStruct.new(:bye => true)
        bye_pending = false
      end
      target << gm
    end

    [last5, next5]
  end
  # Scrapes the in-progress game box of a team page, if there is one.
  #
  # @param [String] team full name of the team the page belongs to
  # @param [String] html contents of the team page
  # @return [OpenStruct, nil] struct with inning, home (true when +team+ is
  #   the second listed side), away_team and home_team (each with name, runs,
  #   hits and errors); nil when the page has no "In Progress Game" marker or
  #   the box does not yield two teams
  def self.get_live_game(team, html)
    return nil if html !~ /In Progress Game/

    team_scraper = Scraper.define do
      process_first "td:nth-child(2)", :name => :text
      process_first "td:nth-child(4)", :runs => :text
      process_first "td:nth-child(5)", :hits => :text
      process_first "td:nth-child(6)", :errors => :text

      result :name, :runs, :hits, :errors
    end

    live_scraper = Scraper.define do
      array :teams
      process_first "td.yspscores", :inning => :text
      process "tr.ysptblclbg5", :teams => team_scraper

      result :inning, :teams
    end

    scraped = live_scraper.scrape(html)
    return nil if scraped.nil?

    game = struct_to_ostruct(scraped)
    # Without both sides there is nothing to report; indexing into a missing
    # or short list used to raise NoMethodError.
    return nil if game.teams.nil? || game.teams.size < 2

    game.inning = game.inning.strip if game.inning

    away_side = game.teams[0]
    home_side = game.teams[1]

    # The second listed side is the home team. Compare on the last word of
    # its name when it has several words, otherwise on the whole name.
    home_name = home_side.name.to_s
    words = home_name.split
    key = (words.size > 1 ? words[-1] : home_name)
    game.home = team.to_s.include?(key)

    # helpers
    game.away_team = away_side
    game.home_team = home_side
    game.delete_field('teams')

    game
  end
  # Locates and downloads the page for a team.
  #
  # +str+ is first tried as a team code. If the response is the generic
  # placeholder page, the sport's team index is fetched and searched for the
  # first team whose name contains +str+.
  #
  # Fetch errors are printed and reported as nil. Neither argument is
  # modified.
  #
  # @param [String] sport sport whose teams are searched
  # @param [String] str team code or partial team name (case-insensitive)
  # @return [Array(String, String), nil] team code and page HTML, or nil when
  #   a fetch fails or no team matches
  def self.find_team_page(sport, str)
    # Lowercase copies: downcase! would modify the caller's string and raise
    # on a frozen one.
    sport = sport.to_s.downcase
    str = str.to_s.downcase

    # Best-effort fetch shared by the three requests below.
    fetch = lambda do |url|
      begin
        YahooSports.fetchurl(url)
      rescue => ex
        puts ex
        nil
      end
    end

    html = fetch.call("http://sports.yahoo.com/#{sport}/teams/" + str)
    return nil if html.nil?

    if html !~ %r{<title><MapleRegion id ="page_title_generic"/></title>}
      # got the right page
      return [str, html]
    end

    # look for it
    html = fetch.call("http://sports.yahoo.com/#{sport}/teams")
    return nil if html.nil?

    team_scraper = Scraper.define do
      array :teams, :links

      process "table.yspcontent tr.ysprow1", :teams => :text
      process "table.yspcontent tr.ysprow1 a", :links => "@href"

      process "table.yspcontent tr.ysprow2", :teams => :text
      process "table.yspcontent tr.ysprow2 a", :links => "@href"

      result :teams, :links
    end

    ret = team_scraper.scrape(html)
    return nil if ret.nil?

    names = ret.teams || []
    links = ret.links || []

    names.each_with_index do |raw_name, i|
      # A name without a link at the same index cannot be followed.
      next if links[i].nil?

      name = YahooSports.strip_tags(raw_name).strip
      next unless name.downcase.include?(str)

      # found a matching team
      link = links[i].strip.gsub(%r{/$}, "") # strip trailing slash for nfl
      page = fetch.call("http://sports.yahoo.com#{link}")
      return nil if page.nil?

      # The team code is the tail of the link path. The pattern used to be
      # matched against the display name, which never starts with "/", so the
      # returned code was always nil.
      code = link[%r{\A/[a-z]+/teams/(.+)\z}, 1]
      return [code, page]
    end

    nil
  end
  # Recursively converts a Struct into an OpenStruct.
  #
  # Struct members, and Struct elements of Array members, are converted as
  # well; every other value is carried over unchanged. The input struct and
  # its arrays are left unmodified.
  #
  # @param [Struct] struct value to convert (anything responding to each_pair)
  # @return [OpenStruct] struct with the same member names
  def self.struct_to_ostruct(struct)
    hash = {}
    struct.each_pair do |key, val|
      hash[key] =
        case val
        when Struct
          struct_to_ostruct(val)
        when Array
          # map (not map!) so the caller's array is not rewritten in place.
          # The type check replaces a test on to_s, which also matched plain
          # strings that merely contain "struct".
          val.map { |v| v.kind_of?(Struct) ? struct_to_ostruct(v) : v }
        else
          val
        end
    end
    OpenStruct.new(hash)
  end
  335. end
  336. end