
/python/get_manmankan_images.py

https://github.com/mitnk/stuff
##################################################
## Get Images from manmankan
## Author: whgking@gmail.com
##################################################
import os
import os.path
import pickle
import re
import urllib
import urllib2
import time

from BeautifulSoup import BeautifulSoup
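
# NOTE: this is Python 2 code. It depends on the legacy BeautifulSoup 3
# package (the PyPI package is literally named "BeautifulSoup"); on Python 3
# the rough equivalents would be urllib.request and bs4, but everything
# below is written against the 2.x APIs.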

def save_image_list_to_cache(dir_name, image_list):
    if not image_list or not dir_name:
        return
    output = open('%s/image_list.pkl' % dir_name, 'wb')
    pickle.dump(image_list, output)
    output.close()

def get_image_list_from_cache(dir_name):
    try:
        pkl_file = open('%s/image_list.pkl' % dir_name, 'rb')
    except IOError:
        return None
    image_list = pickle.load(pkl_file)
    pkl_file.close()
    return image_list
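
# The two helpers above cache the scraped image list as a plain pickle at
# <chapter_dir>/image_list.pkl. The pickled object is just the list of image
# paths, e.g. ['/images/0001.jpg', '/images/0002.jpg'] (paths here are made
# up for illustration).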

def get_chapter_list(url):
    page = urllib2.urlopen(url)
    # The site is served in a Chinese encoding; gb18030 is a superset of
    # GB2312/GBK, so it decodes the page safely.
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    print u"Reading information of %s ..." % soup.findAll("h1")[0].string
    tag = soup.find("table", {"class": "list_bg_table"})
    if not tag:
        print "Chapter table not found."
        return
    chapter_list = tag.findAll("td")
    manman_list = []
    for chapter in chapter_list:
        a_tag = chapter.find('a')
        if not a_tag:  # Blank td tag
            continue
        chapter_name = a_tag.string
        chapter_url = a_tag["href"]
        manman_list.append([chapter_name.strip(), chapter_url])
    # Chapters appear on the page newest first; reverse so they are
    # downloaded oldest first.
    manman_list.reverse()
    for chapter_name, chapter_url in manman_list:
        download_all_images(chapter_url, chapter_name)

def get_image_list(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    javascripts = soup.findAll(text=lambda text: text.parent.name == "script")
    image_script = ""
    for js in javascripts:
        if "new Array" in js:
            image_script = js
            break
    if not image_script:
        print "Javascript of image src not found."
        return []
    result = re.search(r'new Array\(([^;]+)\);', image_script)
    if not result:
        print "Image SRC not found."
        return []
    return result.group(1).replace('"', '').split(',')
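
# The regex above expects an inline script of roughly this shape (the
# variable name and paths are made up for illustration):
#     var picArr = new Array("/images/ch01/0001.jpg", "/images/ch01/0002.jpg");
# group(1) captures everything between the parentheses; stripping the quotes
# and splitting on commas yields the list of image paths.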

def download_all_images(url, dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print u"Created directory: %s" % dir_name
    # Cache the image list so we don't need to fetch it from the website
    # every time.
    image_list = get_image_list_from_cache(dir_name)
    if not image_list:
        image_list = get_image_list(url)
        save_image_list_to_cache(dir_name, image_list)
    if not image_list:
        return
    # Subtract 1 for the image_list.pkl file in the same directory.
    image_count_existing = len(os.listdir(dir_name)) - 1
    if len(image_list) == image_count_existing:
        print u"%s seems already finished, skipping this chapter." % dir_name
        return
    number_from = 0
    if image_count_existing != 0:
        # Re-fetch the last existing image, in case it wasn't fully downloaded.
        number_from = image_count_existing - 1
    print "Begin to download %s ..." % dir_name
    print "Total images to download: %s" % (len(image_list) - number_from)
    host = "http://54.manmankan.com"
    bad_images = 0
    count_downloaded = number_from + 1
    first_image_is_invalid = False
    time_list = []
    for image_src in image_list[number_from:]:
        num = "%03d" % count_downloaded
        image_ext = image_src.split(".")[-1]
        local_file_name = "%s/%s.%s" % (dir_name, num, image_ext)
        url = host + image_src
        t = time.time()
        urllib.urlretrieve(url, local_file_name)
        time_list.append(time.time() - t)
        time.sleep(2)  # Sleep 2 seconds to download gently
        # Images smaller than 10K are usually bad. But some valid images are
        # also very small, so only flag them once the first image of this run
        # turns out to be invalid as well.
        if os.path.getsize(local_file_name) < 10000:
            if count_downloaded == number_from + 1:
                first_image_is_invalid = True
            if first_image_is_invalid:
                print "Invalid image (too small in size) found:", image_src
                print "Saved to %s\r\n" % local_file_name
                bad_images += 1
            if bad_images >= 3 and first_image_is_invalid:
                error_info = u"Bad images found while fetching %s. Skipping this chapter\r\n" % dir_name
                print error_info
                f = open("%s/errors.log" % dir_name, "w")
                f.write(error_info.encode('utf-8'))
                f.close()
                return
        count_downloaded += 1
        if count_downloaded % 10 == 1:
            print "Downloaded: %s, Speed: %.3fs per image" % (local_file_name, sum(time_list) / len(time_list))
            time_list = []

if __name__ == "__main__":
    get_chapter_list("http://www.manmankan.com/html/13/")
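
# Running the script (with a Python 2 interpreter) downloads every chapter
# listed at the hardcoded index URL into a subdirectory named after the
# chapter:
#     $ python get_manmankan_images.py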