/hyperleech.py
https://bitbucket.org/devinjames/hyperleech · Python
# encoding=utf-8
"""
PSEUDO:
1. Open the user's page-1 feed and start downloading everything that was loved.
2. Log each download as a success or failure.
3. The leecher cross-references the site content with what was previously
   downloaded, so nothing is repeated regardless of whether it has been
   removed from the target download directory.

TODO:
- retry skipped entries!
- cross-platform testing
- initial config by user input or config file

Track strings come gzipped with this request:
    hypem.com/?ax=1&ts=1330439454
"""
# Standard library imports.
import shutil
import os
import re
import sys
import json
import platform
import threading
from time import sleep

# Third-party dependencies: pip install beautifulsoup4 requests
from bs4 import BeautifulSoup
import requests
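# Per the docstring note, the track strings can also be fetched straight from
# the ax endpoint; requests transparently decompresses the gzipped response.
# A minimal sketch (untested; the timestamp is just the one quoted above):
#
#     r = requests.get("http://hypem.com/?ax=1&ts=1330439454",
#                      headers={'User-Agent': USERAGENT})
#     blobs = r.content  # holds the {"id":...,"key":...} track strings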
try:
    from msvcrt import getch
except ImportError:
    # POSIX fallback: read one raw keypress, restoring the terminal
    # settings afterwards, and return the character.
    import tty, termios

    def getch():
        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)
        try:
            tty.setraw(fd)
            ch = sys.stdin.read(1)
        finally:
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
        return ch
username = "rampagejames"
USERAGENT = r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21"
system = platform.system()
scriptpath = os.path.dirname(os.path.realpath(__file__))

# def firstrun():
#     print "It looks like this is your first run, please enter some information:"
#     username = raw_input("Username: ")
#     print "Where would you like to store your mp3s?"
#     dest_dir = raw_input("Path (default: %s)" % "")

if os.getenv('COMPUTERNAME') == 'HEISENBERG':
    dest_dir = r"E:\mp3\singles\hyperleech"
    forcepath = ''
else:
    print "I see you're at work; using a custom path for the log file/downloads dir"
    dest_dir = r"C:\Users\devin.sawatzky\Music\Hypemachine"
    forcepath = r"C:\Users\devin.sawatzky\Music\Hypemachine"
print "Saving files to %s" % dest_dir
cookies = {}
class Track():
    """
    Stores the following keys for a hypem entry:
    artist, song, id, key, mp3url, state, errormsg
    """
    def __getattr__(self, name):
        # Only called for attributes missing from __dict__: give
        # artist/song a readable placeholder, everything else None.
        if name in ('artist', 'song'):
            return "Unknown %s" % name
        return None

    def __init__(self):
        self.destination = dest_dir

    def __str__(self):
        if 'artist' in self.__dict__ and 'song' in self.__dict__:
            return "%s - %s" % (self.artist, self.song)
        return "Unknown Track"

    def __setattr__(self, name, value):
        if name in ('artist', 'song'):  # clean the artist and song as they are set
            self.__dict__[name] = self._clean(value)
        else:
            self.__dict__[name] = value

    def attrs(self):
        # Work on a copy so the bookkeeping keys aren't deleted from the
        # instance itself, and tolerate 'downloading' never being set.
        r = dict(self.__dict__)
        r.pop('destination', None)
        r.pop('downloading', None)
        return r

    def _clean(self, what):
        # Strip characters that are not allowed in Windows filenames.
        disallowed = "[<>:\"/\\\|?*']"
        return re.sub(disallowed, "", what)
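    # Illustrative example of _clean()'s sanitizing (hypothetical input):
    #     Track()._clean('AC/DC: "Back In Black"?')  ->  'ACDC Back In Black'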
    def _build_filename(self):
        # Filenames are "<artist> - <song>.mp3"; decode so utf-8 bytes from
        # the page survive the formatting.
        self.filename = "%s - %s.mp3" % (self.artist.decode('utf-8'), self.song.decode('utf-8'))
        return self.filename
    def getmp3url(self, cookies):
        if self.id is None or self.key is None:
            print "Key or id is uninitialized"
            return None
        url = r"http://hypem.com/serve/source/%s/%s" % (self.id, self.key)
        headers = {'Referer': r"http://hypem.com/%s" % username, 'User-Agent': USERAGENT}
        req = requests.get(url, headers=headers, cookies=cookies)
        if req.status_code != 200:
            return None
        return json.loads(req.content).get('url')
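    # The serve/source endpoint answers with JSON carrying the mp3 location;
    # illustrative shape only, not a captured response:
    #     {"url": "http://somecdn.example/track.mp3"}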
    def download(self, cookies):
        self.url = self.getmp3url(cookies)
        retcode = False
        if self.url is not None:
            self.downloading = True
            # Spin a progress glyph on stdout while the GET runs.
            threading.Thread(target=self._spinner).start()
            r = requests.get(self.url)
            if r.status_code == 200:
                with open(os.path.join(self.destination, self._build_filename()), 'wb') as f:
                    f.write(r.content)
                retcode = True
        self.downloading = False
        if retcode is False:
            self.skipped = True
        return retcode
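    # Note: download() buffers the whole mp3 in memory via r.content. With a
    # requests version that supports stream=True, a streamed variant (a
    # sketch, untested here) would be:
    #     r = requests.get(self.url, stream=True)
    #     with open(path, 'wb') as f:
    #         for chunk in r.iter_content(chunk_size=64 * 1024):
    #             f.write(chunk)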
    def _spinner(self):
        # Runs in its own thread: redraw the "[ ]" status box with a rotating
        # glyph until download() clears the flag.
        glyphs = ('-', '\\', '|', '/')
        while self.downloading:
            for g in glyphs:
                if not self.downloading:
                    return
                sys.stdout.write("[%s\r" % g)
                sleep(.1)
    def tag(self):
        """ Replace the id3 comments with 'hypem' (handy for iPod playlists).
        Returns True on success, an exception class or False if not.
        """
        try:
            import eyeD3  # optional dependency, imported lazily
        except ImportError:
            return ImportError
        tag = eyeD3.Tag()
        try:
            tag.link(os.path.join(self.destination, self.filename))
        except IOError:
            return IOError
        tag.removeComments()
        try:
            tag.addComment('hypem')
            tag.update()
        except Exception:
            return False
        # Writing artist/title frames is skipped on purpose: eyeD3 can't
        # write id3 v2.2, so setArtist()/setTitle() + update() raise
        # eyeD3.tag.TagException on those files.
        return True
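# A minimal Track usage sketch (hypothetical id/key values), mirroring what
# scrape() does below:
#
#     t = Track()
#     (t.id, t.key, t.artist, t.song) = ('2abcd', 'f00ba4', 'Artist', 'Song')
#     if t.download(cookies):  # resolves the mp3 url, then fetches the file
#         t.tag()              # stamps the id3 comment with 'hypem'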
class Page():
    def __init__(self, index=1):
        self.index = str(index)

    def _buildcat(self, html):
        """ Return a list of Track objects parsed out of the page HTML. """
        if html is None:
            return []
        # Each track sits in the page as a JSON-ish blob; capture its id,
        # key, artist and song fields.
        re_id_key = r'{"type":"[^"]+","id":"([^"]+)"[^}]*"key":"([^"]+)"[^}]*"artist":"([^"]+)"[^}]*"song":"([^"]+)"[^}]*}'
        matches = re.findall(re_id_key, html, re.MULTILINE)
        a = []
        for i in matches:
            t = Track()
            (t.id, t.key, t.artist, t.song) = i
            a.append(t)
        if len(a) == 0:
            sys.stdout.write("It looks like the regex failed ... that ain't good\n")
            sys.exit(1)
        return a
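    # Illustrative shape of one blob the regex above matches (values invented):
    #     {"type":"loved","id":"1a2b3","key":"abc123","artist":"Foo","song":"Bar"}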
    def read(self):
        """ Read the hypem page and return the catalog as a list of Track objects. """
        url = r"http://hypem.com/%s/%s?ax=1" % (username, self.index)
        # Ask for an uncompressed body so the regex sees plain text.
        headers = {'Accept-Encoding': 'identity;q=1, *;q=0', 'User-Agent': USERAGENT}
        req = requests.get(url, headers=headers)
        self.cookies = req.cookies.get_dict()  # needed later by Track.getmp3url()
        if req.status_code != 200:
            print "The hypemachine url could not be read."
            response = None
        else:
            response = self._buildcat(req.content)
        self.catalog = response
        return response
class Log():
    def __init__(self, path=None, *kargs, **kwargs):
        if path:
            self.path = path
        else:
            self.path = os.path.dirname(os.path.realpath(__file__))
        self.read()

    def read(self):
        """ Read the log from the file.
        self.soup     - points to the root xml node
        self.xml      - points to the hyperleech node
        self.username - points to the user node
        """
        self.logfile = os.path.join(self.path, "log.xml")
        # Restore from backup if the log is missing or was truncated to zero bytes.
        if (os.path.isfile(self.logfile + ".bak") and not os.path.isfile(self.logfile)) \
                or (os.path.isfile(self.logfile) and os.path.getsize(self.logfile) == 0):
            shutil.copy(self.logfile + '.bak', self.logfile)
            print "Restoring log file from backup"
        if os.path.isfile(self.logfile):
            print "The log file exists, reading it."
            with open(self.logfile, "r") as loghandle:
                self.soup = BeautifulSoup(loghandle.read())  # the original soup; never changes
            self.username = self.soup.find('user')
            self.xml = self.soup.find('hyperleech')  # the hyperleech subset of nodes
            shutil.copy(self.logfile, self.logfile + '.bak')
        else:
            print "No log file, creating new"
            self.xml = BeautifulSoup('<hyperleech>')
            self.soup = self.xml
        return self.soup
    def find(self, key, **kwargs):
        self.result = self.xml.find(key, **kwargs)
        return self.result

    def open(self):
        self.loghandle = open(self.logfile, 'w')

    def close(self):
        if getattr(self, 'loghandle', None) is not None:
            self.loghandle.close()

    def write(self):
        if getattr(self, 'loghandle', None) is None or not isinstance(self.soup, BeautifulSoup):
            print "Something with the log handle isn't initialized"
            return
        self.loghandle.truncate(0)
        self.loghandle.seek(0)
        self.loghandle.write(self.soup.prettify())
        self.loghandle.flush()
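# A minimal Log lifecycle sketch, mirroring how scrape() drives it below
# (the id value is hypothetical):
#
#     log = Log()                  # parses log.xml from next to the script
#     if not log.find('item', id='1a2b3'):
#         log.xml.append(log.soup.new_tag('item', id='1a2b3'))
#     log.open()
#     log.write()
#     log.close()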
def main(argv):
    if len(argv) > 1 and argv[1].isdigit():
        index = argv[1]
    else:
        index = raw_input("What page? (blank for 1, ! to rebuild xml log): ")
        if index == "":
            index = '1'
        elif index == '!':
            print "Sorry, I don't rebuild anymore"
            index = '1'
    scrape(index)
def scrape(index, skipped=None):
    if forcepath:
        log = Log(forcepath)  # read the log from the forced path
    else:
        log = Log()           # read the log from the script root
    print "Reading hypemachine feed page #%s for %s" % (index, username)
    hypem_page = Page(index)  # set up a Page object for the desired page
    if not hypem_page.read():  # grab the html; if we can't read the page, bail
        getch()
        exit()
    (dl_count, skip_count, current) = (0, 0, 0)  # init session counts
    log.open()  # open the log file for our rewrite/write cycle
    for track in hypem_page.catalog:  # catalog is a list of Track objects at this point
        trackname = "%s - %s" % (track.artist, track.song)
        sys.stdout.write("[ ] (ID=%5s) [%2d/%d] %s\r" % (str(track.id), current + 1, len(hypem_page.catalog), trackname[:56]))
        if log.find('item', id=track.id, skipped=None):  # already logged and not marked skipped
            sym = " "  # skip it; it was downloaded on an earlier run
        else:
            if track.download(hypem_page.cookies):  # True if successfully downloaded
                track.tag()
                dl_count += 1
                sym = "+"
            else:  # here if the download failed
                skip_count += 1
                sym = "!"
            log.find('item', id=track.id, skipped=True)  # if the log marks it skipped, clear that attr
            if log.result:
                del log.result['skipped']
            else:
                new_tag = log.soup.new_tag('item', **track.attrs())  # a failed download carries skipped=True in its attrs
                log.xml.append(new_tag)
        sys.stdout.write("[%s] (ID=%s) [%2d/%d] %s\n" % (sym, track.id, current + 1, len(hypem_page.catalog), trackname[:56]))
        current += 1
    log.write()  # write the updated log to file
    log.close()  # close the log handle
    print "%d new mp3s downloaded, %d failed!" % (dl_count, skip_count)
    print "Press something: e=explorer, s=process skipped, enter=quit, n=next page: "
    e = getch()
    if e.lower() == "e":
        os.system(r'C:\windows\explorer.exe "%s"' % dest_dir)
    elif e.lower() == "n":
        print "Continuing to next page...."
        scrape(str(int(index) + 1))
    elif e.lower() == 's':
        pass  # TODO: retry skipped entries (see the TODO at the top)
if __name__ == "__main__":
    main(sys.argv)