
/hyperleech.py

https://bitbucket.org/devinjames/hyperleech
# encoding=utf-8
"""
PSEUDO:
1. Open the user's page 1 feed and start downloading everything that was loved
2. Log each download as a success or failure
3. The leecher cross-references the website content with what was previously
   downloaded, so nothing is repeated regardless of whether it has been removed
   from the target download directory

TODO:
- retry skipped entries!
- cross-platform testing
- initial config by user input or config file

Track strings come gzipped with this request:
hypem.com/?ax=1&ts=1330439454
"""
try:
    import shutil
    import os
    import sys
    import json
    import platform
    from bs4 import BeautifulSoup
    import requests
except ImportError:
    raise

try:
    from msvcrt import getch
except ImportError:
    import tty, termios

    def getch():
        """POSIX fallback for msvcrt.getch: read one raw keypress and restore the tty."""
        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)
        try:
            tty.setraw(fd)
            ch = sys.stdin.read(1)
        finally:
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
        return ch
username = "rampagejames"
USERAGENT = r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21"
system = platform.system()
scriptpath = os.path.dirname(os.path.realpath(__file__))

# def firstrun():
#     print "It looks like this is your first run, please enter some information:"
#     username = raw_input("Username: ")
#     print "Where would you like to store your mp3s?"
#     dest_dir = raw_input("Path (default: %s)" % "")
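# A minimal working sketch of the commented-out firstrun() above, per the TODO
# item "initial config by user input or config file". It is not called anywhere;
# the default download path is an assumption, not from the original script.
def firstrun():
    """Prompt for a username and a download directory; return (username, dest_dir)."""
    name = raw_input("Username: ")
    default = os.path.join(scriptpath, "downloads")  # assumed default location
    print "Where would you like to store your mp3s?"
    path = raw_input("Path (default: %s): " % default) or default
    if not os.path.isdir(path):
        os.makedirs(path)
    return name, path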
if os.getenv('COMPUTERNAME') == 'HEISENBERG':
    dest_dir = r"E:\mp3\singles\hyperleech"
    forcepath = ''
else:
    print "I see you're at work, using custom path for log file/downloads dir"
    dest_dir = r"C:\Users\devin.sawatzky\Music\Hypemachine"
    forcepath = r"C:\Users\devin.sawatzky\Music\Hypemachine"
print "Saving files to %s" % dest_dir

cookies = {}
class Track():
    """
    Stores the following keys for a hypem entry:
    artist, title, id, key, mp3url, state, errormsg
    """
    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails, so missing
        # attributes fall back to a placeholder or None.
        if name in ('artist', 'song'):
            return "Unknown %s" % name
        return None

    def __init__(self):
        self.destination = dest_dir

    def __str__(self):
        if 'artist' in self.__dict__ and 'song' in self.__dict__:
            return "%s - %s" % (self.artist, self.song)
        return "Unknown Track"

    def __setattr__(self, name, value):
        if name in ('artist', 'song'):  # clean the artist and song as they are set
            self.__dict__[name] = self._clean(value)
        else:
            self.__dict__[name] = value

    def attrs(self):
        # Work on a copy so the instance's own __dict__ isn't mutated.
        r = dict(self.__dict__)
        r.pop('destination', None)
        r.pop('downloading', None)
        return r

    def _clean(self, what):
        import re
        disallowed = "[<>:\"/\\\\|?*']"  # characters not allowed in filenames
        return re.sub(disallowed, "", what)

    def _build_filename(self):
        self.filename = "%s - %s.mp3" % (self.artist.decode('utf-8'), self.song.decode('utf-8'))
        return self.filename
    def getmp3url(self, cookies):
        if self.id is None or self.key is None:
            print "Key or id is uninitialized"
            return None
        url = r"http://hypem.com/serve/source/%s/%s" % (self.id, self.key)
        headers = {'Referer': r"http://hypem.com/%s" % username, 'User-Agent': USERAGENT}
        req = requests.get(url, headers=headers, cookies=cookies)
        if req.status_code != 200:
            response = None
        else:
            response = json.loads(req.content).get('url')
        return response
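    # Judging from the .get('url') above, a successful serve/source response is
    # a JSON object along the lines of (value is illustrative, not real):
    #   {"url": "http://some.cdn.example/track.mp3"}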
    def download(self, cookies):
        import threading
        self.url = self.getmp3url(cookies)
        retcode = False
        if self.url is not None:
            self.downloading = True
            threading.Thread(target=self._spinner).start()
            r = requests.get(self.url)
            if r.status_code == 200:
                # os.path.join instead of a hard-coded backslash (cross-platform TODO)
                with open(os.path.join(self.destination, self._build_filename()), 'wb') as f:
                    f.write(r.content)
                retcode = True
        self.downloading = False
        if retcode is False:
            self.skipped = True
        return retcode
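    # A hedged note: for big files it would be lighter on memory to stream the
    # response rather than buffer it whole, e.g. (sketch, same requests API):
    #   r = requests.get(self.url, stream=True)
    #   with open(path, 'wb') as f:
    #       for chunk in r.iter_content(8192):
    #           f.write(chunk)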
    def _spinner(self):
        from time import sleep
        #import threading
        #condition = threading.Condition
        l = ('-', '\\', '|', '/')
        while self.downloading:
            for i in range(4):
                if not self.downloading:
                    return
                sys.stdout.write("[%s]\r" % l[i])
                sys.stdout.flush()  # force the spinner frame out immediately
                sleep(.1)
    def tag(self):
        """ Replace id3 comments with 'hypem', make the album 'hypem' (for iPod shit)?
        Set track artist and song. Returns True if it succeeded, other stuff if not.
        """
        try:
            import eyeD3
        except ImportError:
            #sys.stdout.write(", Not tagged, ImportError")
            return ImportError
        else:
            tag = eyeD3.Tag()
            try:
                # self.filename already ends in .mp3, so don't append it again
                tag.link(os.path.join(self.destination, self.filename))
            except IOError:
                #sys.stdout.write(", Not tagged, IOError")
                return IOError
            else:
                tag.removeComments()
                try:
                    tag.addComment('hypem')
                    tag.update()
                except Exception:
                    #sys.stdout.write(", Not tagged, TagException")
                    return False
        # try to write album artist tags if possible
        # this doesn't work because the stupid eyeD3 module isn't capable of writing id3 v2.2
        """
        try:
            tag.setArtist(trackdata.artist)
            tag.setTitle(trackdata.song)
            tag.update()
        except eyeD3.tag.TagException:
            print "Failed to write artist/track id3 tags"
            pass
        """
        #sys.stdout.write(", Tagged")
        return True

    #def __index__(self, item):
    #    return 1
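    # A hedged alternative to tag() using the mutagen library, which can write
    # newer id3 revisions than the old eyeD3 noted above. This method name and
    # body are my own sketch, not part of the original script, and nothing
    # calls it.
    def tag_with_mutagen(self):
        try:
            from mutagen.id3 import ID3, COMM, TPE1, TIT2
        except ImportError:
            return ImportError
        tag = ID3(os.path.join(self.destination, self.filename))
        tag.delall('COMM')  # drop existing comments, as tag() does
        tag.add(COMM(encoding=3, lang='eng', desc='', text=[u'hypem']))
        tag.add(TPE1(encoding=3, text=[self.artist.decode('utf-8')]))  # artist
        tag.add(TIT2(encoding=3, text=[self.song.decode('utf-8')]))    # title
        tag.save()
        return True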
class Page():
    def __init__(self, index=1):
        self.index = str(index)

    def _buildcat(self, html):
        """ Returns a list of Track objects parsed out of the page's HTML """
        if html is None:
            return []
        import re
        #re_id_key = r"\sid:'([^\n]+)'[^}]+key:\s'([^\n]+)'[^}]+artist:\s?'([^\n]+)'[^}]+song:\s?'([^\n]+)'"
        re_id_key = r'{"type":"[^"]+","id":"([^"]+)"[^}]*"key":"([^"]+)"[^}]*"artist":"([^"]+)"[^}]*"song":"([^"]+)"[^}]*}'
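        # Shape of a track string this pattern is after, inferred from the
        # regex itself (values are illustrative, not real):
        #   {"type":"normal","id":"1a2b3","key":"0f9e8d7c","artist":"Some Artist","song":"Some Song"}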
        match = re.findall(re_id_key, html, re.MULTILINE)
        a = []
        for i in match:
            t = Track()
            (t.id, t.key, t.artist, t.song) = i
            a.append(t)
        if len(a) == 0:
            sys.stdout.write("It looks like the regex failed ... that ain't good\n")
            sys.exit(1)
        return a
    def read(self):
        """ Read the hypem page and return the catalog as a list of Track objects """
        url = r"%s%s/%s?ax=1" % ("http://hypem.com/", username, self.index)
        headers = {'Accept-Encoding': 'identity;q=1, *;q=0', 'User-Agent': USERAGENT}
        req = requests.get(url, headers=headers)
        self.cookies = req.cookies.get_dict()
        if req.status_code != 200:
            print "The hypemachine url could not be read."
            response = None
        else:
            response = self._buildcat(req.content)
        self.catalog = response
        return response
class Log():
    def __init__(self, path=None, *args, **kwargs):
        if path:
            self.path = path
        else:
            self.path = os.path.dirname(os.path.realpath(__file__))
        self.read()

    def read(self):
        """ Read the log from the file.
        self.soup - points to the root xml node
        self.xml - points to the hyperleech node
        self.username - points to the user node
        """
        self.logfile = os.path.join(self.path, "log.xml")
        # Restore from backup if the log is missing or empty
        if (os.path.isfile(self.logfile + ".bak") and not os.path.isfile(self.logfile)) or \
           (os.path.isfile(self.logfile) and os.path.getsize(self.logfile) == 0):
            shutil.copy(self.logfile + '.bak', self.logfile)
            print "Restoring log file from backup"
        if os.path.isfile(self.logfile):
            print "The log file exists, reading it."
            loghandle = open(self.logfile, "r")
            self.soup = BeautifulSoup(loghandle.read())  # , 'xml') # the original soup, never changes
            self.username = self.soup.find('user')
            self.xml = self.soup.find('hyperleech')  # self.xml holds the hyperleech subset of nodes
            loghandle.close()
            shutil.copy(self.logfile, self.logfile + '.bak')
        else:
            print "No log file, creating new"
            self.xml = BeautifulSoup('<hyperleech>')  # , 'xml')
            self.soup = self.xml
        #if self.username is None:
        #    self.username = str(raw_input("No username found in log file, please specify: "))
        #    username = self.username
        return self.soup
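    # Judging from new_tag('item', **track.attrs()) in scrape() below, the log
    # on disk looks roughly like this (attribute values are illustrative):
    #   <hyperleech>
    #     <item id="1a2b3" key="0f9e8d7c" artist="Some Artist" song="Some Song" skipped="True"/>
    #   </hyperleech>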
    def find(self, key, **kwargs):
        self.result = self.xml.find(key, **kwargs)
        return self.result

    def open(self):
        self.loghandle = open(self.logfile, 'w')

    def close(self):
        if getattr(self, 'loghandle', None) is not None:
            self.loghandle.close()

    def write(self):
        if getattr(self, 'loghandle', None) is None or not isinstance(self.soup, BeautifulSoup):
            print "Something with the log handle isn't initialized"
            return
        self.loghandle.truncate(0)
        self.loghandle.seek(0)
        self.loghandle.write(self.soup.prettify())
        self.loghandle.flush()
def main(argv):
    if len(argv) > 1 and argv[1].isdigit():
        index = argv[1]
    else:
        index = raw_input("What page? (blank for 1, ! to rebuild xml log): ")
        if index == "":
            index = '1'
        elif index == '!':
            print "Sorry, I don't rebuild anymore"
            index = '1'
    #print "Grabbing page # %s" % index
    scrape(index)
    return
def scrape(index, skipped=None):
    if forcepath:
        log = Log(forcepath)  # read the log from the forced path instead of the script root
    else:
        log = Log()
    print "Reading hypemachine feed page #%s for %s" % (index, username)
    hypem_page = Page(index)  # set up a Page object for the desired page
    if not hypem_page.read():  # grab the html, and if we can't read the page then GTFO
        getch()
        sys.exit()
    (dl_count, skip_count, current) = (0, 0, 0)  # init session counts
    log.open()  # open the log file for our re-write/write cycle
    for track in hypem_page.catalog:  # catalog is a list of Track objects at this point
        trackname = "%s - %s" % (track.artist, track.song)
        sys.stdout.write("[ ] (ID=%5s) [%2d/%d] %s\r" % (str(track.id), current + 1, len(hypem_page.catalog), trackname[:56]))
        if log.find('item', id=track.id, skipped=None):  # already logged and not marked skipped
            sym = " "  # nothing to do, already downloaded
        else:
            #if skipped is True and log.xml.find('item', id=track.id, skipped=True):  # if we're processing skipped and it's not found in the log GTFO
            #    continue
            if track.download(hypem_page.cookies):  # True if successfully downloaded
                track.tag()
                dl_count += 1
                sym = "+"
            else:  # here if the download failed
                skip_count += 1
                sym = "!"
            log.find('item', id=track.id, skipped=True)  # if it's already logged as skipped, overwrite that entry with no skipped attr
            if log.result:
                del log.result['skipped']
            else:
                new_tag = log.soup.new_tag('item', **track.attrs())  # if the download failed, 'skipped' will be in the track attrs
                log.xml.append(new_tag)
        sys.stdout.write("[%s] (ID=%s) [%2d/%d] %s\n" % (sym, track.id, current + 1, len(hypem_page.catalog), trackname[:56]))
        current += 1
    log.write()  # write the updated log to file
    log.close()  # close the log handle
    print "%d new mp3s downloaded, %d failed!" % (dl_count, skip_count)
    print "Press something, e=explorer, s=process skipped, enter=quit, n=next page: "
    e = getch()
    if e.lower() == "e":
        os.system(r'C:\windows\explorer.exe "%s"' % dest_dir)
    elif e.lower() == "n":
        print "Continuing to next page...."
        scrape(str(int(index) + 1))
    elif e.lower() == 's':
        #scrape(index, True)  # retry skipped entries: still a TODO
        pass
if __name__ == "__main__":
    main(sys.argv)
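# Usage, inferred from main() above:
#   python hyperleech.py [page_number]
# With no argument the script prompts for the page interactively.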