/hyperleech.py
https://bitbucket.org/devinjames/hyperleech · Python
# encoding=utf-8
"""
PSEUDO:
1. Open the user's page-1 feed and start downloading everything that was loved.
2. Log each download as a success or failure.
3. The leecher cross-references the site content with what was previously
   downloaded, so nothing is repeated regardless of whether it has been
   removed from the target download directory.

TODO:
- retry skipped entries!
- cross-platform testing
- initial config by user input or config file

Track strings come gzipped with this request:
    hypem.com/?ax=1&ts=1330439454
"""
# Standard library imports.
import shutil
import os
import re
import sys
import json
import platform
import threading
from time import sleep

# Third-party dependencies: pip install beautifulsoup4 requests
from bs4 import BeautifulSoup
import requests
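# Per the docstring note, the track strings can also be fetched straight from
# the ax endpoint; requests transparently decompresses the gzipped response.
# A minimal sketch (untested; the timestamp is just the one quoted above):
#
#     r = requests.get("http://hypem.com/?ax=1&ts=1330439454",
#                      headers={'User-Agent': USERAGENT})
#     blobs = r.content  # holds the {"id":...,"key":...} track strings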
try:
    from msvcrt import getch
except ImportError:
    # POSIX fallback: read one raw keypress, restoring the terminal
    # settings afterwards, and return the character.
    import tty, termios

    def getch():
        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)
        try:
            tty.setraw(fd)
            ch = sys.stdin.read(1)
        finally:
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
        return ch
username = "rampagejames"
USERAGENT = r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21"
system = platform.system()
scriptpath = os.path.dirname(os.path.realpath(__file__))

# def firstrun():
#     print "It looks like this is your first run, please enter some information:"
#     username = raw_input("Username: ")
#     print "Where would you like to store your mp3s?"
#     dest_dir = raw_input("Path (default: %s)" % "")

if os.getenv('COMPUTERNAME') == 'HEISENBERG':
    dest_dir = r"E:\mp3\singles\hyperleech"
    forcepath = ''
else:
    print "I see you're at work; using a custom path for the log file/downloads dir"
    dest_dir = r"C:\Users\devin.sawatzky\Music\Hypemachine"
    forcepath = r"C:\Users\devin.sawatzky\Music\Hypemachine"
print "Saving files to %s" % dest_dir
cookies = {}
class Track():
    """
    Stores the following keys for a hypem entry:
    artist, song, id, key, mp3url, state, errormsg
    """
    def __getattr__(self, name):
        # Only called for attributes missing from __dict__: give
        # artist/song a readable placeholder, everything else None.
        if name in ('artist', 'song'):
            return "Unknown %s" % name
        return None

    def __init__(self):
        self.destination = dest_dir

    def __str__(self):
        if 'artist' in self.__dict__ and 'song' in self.__dict__:
            return "%s - %s" % (self.artist, self.song)
        return "Unknown Track"

    def __setattr__(self, name, value):
        if name in ('artist', 'song'):  # clean the artist and song as they are set
            self.__dict__[name] = self._clean(value)
        else:
            self.__dict__[name] = value

    def attrs(self):
        # Work on a copy so the bookkeeping keys aren't deleted from the
        # instance itself, and tolerate 'downloading' never being set.
        r = dict(self.__dict__)
        r.pop('destination', None)
        r.pop('downloading', None)
        return r

    def _clean(self, what):
        # Strip characters that are not allowed in Windows filenames.
        disallowed = "[<>:\"/\\\|?*']"
        return re.sub(disallowed, "", what)
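    # Illustrative example of _clean()'s sanitizing (hypothetical input):
    #     Track()._clean('AC/DC: "Back In Black"?')  ->  'ACDC Back In Black'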
    def _build_filename(self):
        # Filenames are "<artist> - <song>.mp3"; decode so utf-8 bytes from
        # the page survive the formatting.
        self.filename = "%s - %s.mp3" % (self.artist.decode('utf-8'), self.song.decode('utf-8'))
        return self.filename
    def getmp3url(self, cookies):
        if self.id is None or self.key is None:
            print "Key or id is uninitialized"
            return None
        url = r"http://hypem.com/serve/source/%s/%s" % (self.id, self.key)
        headers = {'Referer': r"http://hypem.com/%s" % username, 'User-Agent': USERAGENT}
        req = requests.get(url, headers=headers, cookies=cookies)
        if req.status_code != 200:
            return None
        return json.loads(req.content).get('url')
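    # The serve/source endpoint answers with JSON carrying the mp3 location;
    # illustrative shape only, not a captured response:
    #     {"url": "http://somecdn.example/track.mp3"}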
    def download(self, cookies):
        self.url = self.getmp3url(cookies)
        retcode = False
        if self.url is not None:
            self.downloading = True
            # Spin a progress glyph on stdout while the GET runs.
            threading.Thread(target=self._spinner).start()
            r = requests.get(self.url)
            if r.status_code == 200:
                with open(os.path.join(self.destination, self._build_filename()), 'wb') as f:
                    f.write(r.content)
                retcode = True
        self.downloading = False
        if retcode is False:
            self.skipped = True
        return retcode
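    # Note: download() buffers the whole mp3 in memory via r.content. With a
    # requests version that supports stream=True, a streamed variant (a
    # sketch, untested here) would be:
    #     r = requests.get(self.url, stream=True)
    #     with open(path, 'wb') as f:
    #         for chunk in r.iter_content(chunk_size=64 * 1024):
    #             f.write(chunk)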
    def _spinner(self):
        # Runs in its own thread: redraw the "[ ]" status box with a rotating
        # glyph until download() clears the flag.
        glyphs = ('-', '\\', '|', '/')
        while self.downloading:
            for g in glyphs:
                if not self.downloading:
                    return
                sys.stdout.write("[%s\r" % g)
                sleep(.1)
    def tag(self):
        """ Replace the id3 comments with 'hypem' (handy for iPod playlists).
        Returns True on success, an exception class or False if not.
        """
        try:
            import eyeD3  # optional dependency, imported lazily
        except ImportError:
            return ImportError
        tag = eyeD3.Tag()
        try:
            tag.link(os.path.join(self.destination, self.filename))
        except IOError:
            return IOError
        tag.removeComments()
        try:
            tag.addComment('hypem')
            tag.update()
        except Exception:
            return False
        # Writing artist/title frames is skipped on purpose: eyeD3 can't
        # write id3 v2.2, so setArtist()/setTitle() + update() raise
        # eyeD3.tag.TagException on those files.
        return True
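# A minimal Track usage sketch (hypothetical id/key values), mirroring what
# scrape() does below:
#
#     t = Track()
#     (t.id, t.key, t.artist, t.song) = ('2abcd', 'f00ba4', 'Artist', 'Song')
#     if t.download(cookies):  # resolves the mp3 url, then fetches the file
#         t.tag()              # stamps the id3 comment with 'hypem'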
class Page():
    def __init__(self, index=1):
        self.index = str(index)

    def _buildcat(self, html):
        """ Return a list of Track objects parsed out of the page HTML. """
        if html is None:
            return []
        # Each track sits in the page as a JSON-ish blob; capture its id,
        # key, artist and song fields.
        re_id_key = r'{"type":"[^"]+","id":"([^"]+)"[^}]*"key":"([^"]+)"[^}]*"artist":"([^"]+)"[^}]*"song":"([^"]+)"[^}]*}'
        matches = re.findall(re_id_key, html, re.MULTILINE)
        a = []
        for i in matches:
            t = Track()
            (t.id, t.key, t.artist, t.song) = i
            a.append(t)
        if len(a) == 0:
            sys.stdout.write("It looks like the regex failed ... that ain't good\n")
            sys.exit(1)
        return a
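    # Illustrative shape of one blob the regex above matches (values invented):
    #     {"type":"loved","id":"1a2b3","key":"abc123","artist":"Foo","song":"Bar"}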
    def read(self):
        """ Read the hypem page and return the catalog as a list of Track objects. """
        url = r"http://hypem.com/%s/%s?ax=1" % (username, self.index)
        # Ask for an uncompressed body so the regex sees plain text.
        headers = {'Accept-Encoding': 'identity;q=1, *;q=0', 'User-Agent': USERAGENT}
        req = requests.get(url, headers=headers)
        self.cookies = req.cookies.get_dict()  # needed later by Track.getmp3url()
        if req.status_code != 200:
            print "The hypemachine url could not be read."
            response = None
        else:
            response = self._buildcat(req.content)
        self.catalog = response
        return response
class Log():
    def __init__(self, path=None, *kargs, **kwargs):
        if path:
            self.path = path
        else:
            self.path = os.path.dirname(os.path.realpath(__file__))
        self.read()

    def read(self):
        """ Read the log from the file.
        self.soup     - points to the root xml node
        self.xml      - points to the hyperleech node
        self.username - points to the user node
        """
        self.logfile = os.path.join(self.path, "log.xml")
        # Restore from backup if the log is missing or was truncated to zero bytes.
        if (os.path.isfile(self.logfile + ".bak") and not os.path.isfile(self.logfile)) \
                or (os.path.isfile(self.logfile) and os.path.getsize(self.logfile) == 0):
            shutil.copy(self.logfile + '.bak', self.logfile)
            print "Restoring log file from backup"
        if os.path.isfile(self.logfile):
            print "The log file exists, reading it."
            with open(self.logfile, "r") as loghandle:
                self.soup = BeautifulSoup(loghandle.read())  # the original soup; never changes
            self.username = self.soup.find('user')
            self.xml = self.soup.find('hyperleech')  # the hyperleech subset of nodes
            shutil.copy(self.logfile, self.logfile + '.bak')
        else:
            print "No log file, creating new"
            self.xml = BeautifulSoup('<hyperleech>')
            self.soup = self.xml
        return self.soup
    def find(self, key, **kwargs):
        self.result = self.xml.find(key, **kwargs)
        return self.result

    def open(self):
        self.loghandle = open(self.logfile, 'w')

    def close(self):
        if getattr(self, 'loghandle', None) is not None:
            self.loghandle.close()

    def write(self):
        if getattr(self, 'loghandle', None) is None or not isinstance(self.soup, BeautifulSoup):
            print "Something with the log handle isn't initialized"
            return
        self.loghandle.truncate(0)
        self.loghandle.seek(0)
        self.loghandle.write(self.soup.prettify())
        self.loghandle.flush()
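# A minimal Log lifecycle sketch, mirroring how scrape() drives it below
# (the id value is hypothetical):
#
#     log = Log()                  # parses log.xml from next to the script
#     if not log.find('item', id='1a2b3'):
#         log.xml.append(log.soup.new_tag('item', id='1a2b3'))
#     log.open()
#     log.write()
#     log.close()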
def main(argv):
    if len(argv) > 1 and argv[1].isdigit():
        index = argv[1]
    else:
        index = raw_input("What page? (blank for 1, ! to rebuild xml log): ")
        if index == "":
            index = '1'
        elif index == '!':
            print "Sorry, I don't rebuild anymore"
            index = '1'
    scrape(index)
def scrape(index, skipped=None):
    if forcepath:
        log = Log(forcepath)  # read the log from the forced path
    else:
        log = Log()           # read the log from the script root
    print "Reading hypemachine feed page #%s for %s" % (index, username)
    hypem_page = Page(index)  # set up a Page object for the desired page
    if not hypem_page.read():  # grab the html; if we can't read the page, bail
        getch()
        exit()
    (dl_count, skip_count, current) = (0, 0, 0)  # init session counts
    log.open()  # open the log file for our rewrite/write cycle
    for track in hypem_page.catalog:  # catalog is a list of Track objects at this point
        trackname = "%s - %s" % (track.artist, track.song)
        sys.stdout.write("[ ] (ID=%5s) [%2d/%d] %s\r" % (str(track.id), current + 1, len(hypem_page.catalog), trackname[:56]))
        if log.find('item', id=track.id, skipped=None):  # already logged and not marked skipped
            sym = " "  # skip it; it was downloaded on an earlier run
        else:
            if track.download(hypem_page.cookies):  # True if successfully downloaded
                track.tag()
                dl_count += 1
                sym = "+"
            else:  # here if the download failed
                skip_count += 1
                sym = "!"
            log.find('item', id=track.id, skipped=True)  # if the log marks it skipped, clear that attr
            if log.result:
                del log.result['skipped']
            else:
                new_tag = log.soup.new_tag('item', **track.attrs())  # a failed download carries skipped=True in its attrs
                log.xml.append(new_tag)
        sys.stdout.write("[%s] (ID=%s) [%2d/%d] %s\n" % (sym, track.id, current + 1, len(hypem_page.catalog), trackname[:56]))
        current += 1
    log.write()  # write the updated log to file
    log.close()  # close the log handle
    print "%d new mp3s downloaded, %d failed!" % (dl_count, skip_count)
    print "Press something: e=explorer, s=process skipped, enter=quit, n=next page: "
    e = getch()
    if e.lower() == "e":
        os.system(r'C:\windows\explorer.exe "%s"' % dest_dir)
    elif e.lower() == "n":
        print "Continuing to next page...."
        scrape(str(int(index) + 1))
    elif e.lower() == 's':
        pass  # TODO: retry skipped entries (see the TODO at the top)
if __name__ == "__main__":
    main(sys.argv)