PageRenderTime 142ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/recommender/metacritic.py

https://bitbucket.org/alex_fish/vgr
Python | 273 lines | 242 code | 28 blank | 3 comment | 34 complexity | ba33a4773ce123231381e0d844b1171e MD5 | raw file
  1. #!/usr/bin/python
  2. # author: BirdAPI
  3. # source: https://github.com/BirdAPI/Metacritic-Scraper-API/blob/master/metacritic.py
  4. from datetime import datetime
  5. from BeautifulSoup import BeautifulSoup
  6. from pprint import pprint
  7. import urllib2
  8. import sqlite3
  9. import sys
  10. import re
  11. import os
  12. TYPES = [ 'all', 'movie', 'game', 'album', 'tv', 'person', 'video', 'company' ]
  13. class MetacriticInfo:
  14. def __init__(self):
  15. self.id = None
  16. self.title = None
  17. self.type = None
  18. self.link = None
  19. self.boxart = None
  20. self.system = None
  21. self.publisher = None
  22. self.publisher_link = None
  23. self.release_date = None
  24. self.metascore = None
  25. self.metascore_count = None
  26. self.metascore_desc = None
  27. self.user_score = None
  28. self.user_count = None
  29. self.user_score_desc = None
  30. self.summary = None
  31. self.esrb = None
  32. self.official_site = None
  33. self.developer = None
  34. self.genres = None
  35. self.num_players = None
  36. self.esrb_reason = None
  37. self.sound = None
  38. self.connectivity = None
  39. self.resolution = None
  40. self.num_online = None
  41. self.customization = None
  42. class SearchResult:
  43. def __init__(self):
  44. self.id = None
  45. self.title = None
  46. self.type = None
  47. self.link = None
  48. self.system = None
  49. self.metascore = None
  50. self.release_date = None
  51. self.esrb = None
  52. self.publisher = None
  53. self.index = None
  54. self.page = None
  55. self.summary = None
  56. self.user_score = None
  57. self.runtime = None
  58. class Metacritic:
  59. @staticmethod
  60. def search(query, type="all"):
  61. url = get_search_url(query, type)
  62. html = get_html(url)
  63. if not html:
  64. return None
  65. soup = BeautifulSoup(html)
  66. i = 0
  67. page = 0
  68. allresults = []
  69. results = soup.findAll("li", "result")
  70. for result in results:
  71. res = SearchResult()
  72. result_type = result.find("div", "result_type")
  73. if result_type:
  74. strong = result_type.find("strong")
  75. if strong:
  76. res.type = strong.text.strip()
  77. span = result.find("span", "platform")
  78. if span:
  79. res.system = span.text.strip()
  80. product_title = result.find("h3", "product_title")
  81. if product_title:
  82. a = product_title.find("a")
  83. if a:
  84. res.link = "http://www.metacritic.com" + a["href"]
  85. res.id = a["href"][1:].replace("/", "_")
  86. res.title = a.text.strip()
  87. metascore = result.find("span", "metascore")
  88. if metascore:
  89. res.metascore = metascore.text.strip()
  90. res.release_date = get_li_span_data(result, "release_date")
  91. res.esrb = get_li_span_data(result, "maturity_rating")
  92. res.publisher = get_li_span_data(result, "publisher")
  93. deck = result.find("p", "deck")
  94. if deck:
  95. res.summary = deck.text.strip()
  96. res.user_score = get_li_span_data(result, "product_avguserscore")
  97. res.runtime = get_li_span_data(result, "runtime")
  98. res.index = i
  99. res.page = page
  100. allresults.append(res)
  101. i = i + 1
  102. return allresults
  103. @staticmethod
  104. def get_info(id):
  105. url = get_details_url(id)
  106. html = get_html(url)
  107. if not html:
  108. return None
  109. soup = BeautifulSoup(html)
  110. prod = MetacriticInfo()
  111. prod.id = id
  112. og_type = soup.find("meta", attrs={"name":"og:type"})
  113. if og_type:
  114. prod.type = og_type["content"].strip()
  115. og_image = soup.find("meta", attrs={"name":"og:image"})
  116. if og_image:
  117. prod.boxart = og_image["content"].strip()
  118. product_title = soup.find("div", "product_title")
  119. if product_title:
  120. a = product_title.find("a")
  121. if a:
  122. prod.link = "http://www.metacritic.com" + a["href"]
  123. prod.title = a.text.strip()
  124. platform = soup.find("span", "platform")
  125. if platform:
  126. a = platform.find("a")
  127. if a:
  128. prod.system = a.text.strip()
  129. publisher = soup.find("li", "publisher")
  130. if publisher:
  131. a = publisher.find("a")
  132. if a:
  133. prod.publisher = a.text.strip()
  134. prod.publisher_link = "http://www.metacritic.com" + a["href"]
  135. prod.release_date = get_li_span_data(soup, "release_data")
  136. metascore = soup.find("div", "feature_metascore")
  137. if metascore:
  138. score_value = metascore.find("span", "score_value")
  139. if score_value:
  140. prod.metascore = score_value.text.strip()
  141. count = metascore.find("span", "count")
  142. if count:
  143. a = count.find("a")
  144. if a:
  145. span = a.find("span")
  146. if span:
  147. prod.metascore_count = span.text.strip()
  148. desc = metascore.find("span", "desc")
  149. if desc:
  150. prod.metascore_desc = desc.text.strip()
  151. avguserscore = soup.find("div", "feature_userscore")
  152. if avguserscore:
  153. score_value = avguserscore.find("span", "score_value")
  154. if score_value:
  155. prod.user_score = score_value.text.strip()
  156. count = avguserscore.find("span", "count")
  157. if count:
  158. a = count.find("a")
  159. if a:
  160. prod.user_count = a.text[:a.text.find(" ")]
  161. desc = avguserscore.find("span", "desc")
  162. if desc:
  163. prod.user_score_desc = desc.text.strip()
  164. product_summary = soup.find("div", "product_summary")
  165. if product_summary:
  166. data = product_summary.find("span", "data")
  167. if data:
  168. prod.summary = data.text.strip()
  169. product_details = soup.findAll("div", "product_details")
  170. for pd in product_details:
  171. table = pd.find("table")
  172. if table:
  173. trs = table.findAll("tr")
  174. for tr in trs:
  175. process_tr(prod, tr)
  176. return prod
  177. def process_tr(prod, tr):
  178. th = tr.find("th")
  179. td = tr.find("td")
  180. th_val = th.text.replace(":", "").strip()
  181. td_val = td.text.strip()
  182. if th_val == "Rating":
  183. prod.esrb = td_val
  184. elif th_val == "Official Site":
  185. prod.official_site = td_val
  186. elif th_val == "Developer":
  187. prod.developer = td_val
  188. elif th_val == "Genre(s)":
  189. prod.genres = td_val
  190. elif th_val == "Number of Players":
  191. prod.num_players = td_val
  192. elif th_val == "ESRB Descriptors":
  193. prod.esrb_reason = td_val
  194. elif th_val == "Sound":
  195. prod.sound = td_val
  196. elif th_val == "Connectivity":
  197. prod.connectivity = td_val
  198. elif th_val == "Resolution":
  199. prod.resolution = td_val
  200. elif th_val == "Number of Online Players":
  201. prod.num_online = td_val
  202. elif th_val == "Customization":
  203. prod.customization = td_val
  204. def get_li_span_data(node, data_name):
  205. li = node.find("li", data_name)
  206. if li:
  207. data = li.find("span", "data")
  208. if data:
  209. return data.text.strip()
  210. return None
  211. def get_search_url(query, type="all"):
  212. return "http://www.metacritic.com/search/%s/%s/results?sort=relevancy" % (type, query.replace(":", "").replace("-", "").replace("_", "").replace(" ", "+"))
  213. def get_details_url(id):
  214. return "http://www.metacritic.com/%s/details" % id.replace("_", "/")
  215. def get_html(url):
  216. try:
  217. request = urllib2.Request(url)
  218. request.add_header("User-Agent", "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
  219. html = urllib2.urlopen(request).read()
  220. return html
  221. except:
  222. print "Error accessing:", url
  223. return None
  224. def main():
  225. if len(sys.argv) == 2:
  226. results = Metacritic.search(sys.argv[1])
  227. elif len(sys.argv) == 3:
  228. results = Metacritic.search(sys.argv[1], sys.argv[2])
  229. else:
  230. return
  231. for result in results:
  232. pprint(vars(result))
  233. print ""
  234. pprint(vars(Metacritic.get_info(result.id)))
  235. print ""
  236. if __name__ == "__main__":
  237. main()