/python/get_manmankan_images.py
##################################################
## Get Images from manmankan
## Author: whgking@gmail.com
##################################################
import os
import os.path
import pickle
import re
import time
import urllib
import urllib2

from BeautifulSoup import BeautifulSoup

def save_image_list_to_cache(dir_name, image_list):
    if not image_list or not dir_name:
        return
    output = open('%s/image_list.pkl' % dir_name, 'wb')
    pickle.dump(image_list, output)
    output.close()


def get_image_list_from_cache(dir_name):
    try:
        pkl_file = open('%s/image_list.pkl' % dir_name, 'rb')
    except IOError:
        return None
    image_list = pickle.load(pkl_file)
    pkl_file.close()
    return image_list
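
# Illustrative round trip for the two cache helpers above (the directory and
# image paths are hypothetical, shown only to document the expected shapes):
#   save_image_list_to_cache("Chapter 01", ["/img/001.jpg", "/img/002.jpg"])
#   get_image_list_from_cache("Chapter 01")  # -> ["/img/001.jpg", "/img/002.jpg"]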

def get_chapter_list(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    print u"Reading information of %s ..." % soup.findAll("h1")[0].string
    tag = soup.find("table", {"class": "list_bg_table"})
    if not tag:
        print "Chapter table not found."
        return
    chapter_list = tag.findAll("td")
    manman_list = []
    for chapter in chapter_list:
        a_tag = chapter.find('a')
        if not a_tag:  # Blank td tag
            continue
        chapter_name = a_tag.string
        url = a_tag["href"]
        manman_list.append([chapter_name.strip(), url])
    # Reverse so chapters are downloaded in reading order
    manman_list.reverse()
    for chapter_name, url in manman_list:
        download_all_images(url, chapter_name)

def get_image_list(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    javascripts = soup.findAll(text=lambda text: text.parent.name == "script")
    image_script = ""
    for js in javascripts:
        if "new Array" in js:
            image_script = js
            break
    if not image_script:
        print "JavaScript with image src not found."
        return None
    result = re.search(r'new Array\(([^;]+)\);', image_script)
    if not result:
        print "Image src not found."
        return None
    return result.group(1).replace('"', '').split(',')
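
# Illustrative input for get_image_list (not captured from the site, so the
# variable name and paths below are assumptions): the chapter page is
# expected to embed the image paths in a script such as
#   var pic = new Array("/2013/a/001.jpg","/2013/a/002.jpg");
# in which case the function returns ['/2013/a/001.jpg', '/2013/a/002.jpg'].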

def download_all_images(url, dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print u"Created directory: %s" % dir_name
    # Cache image_list so it is not re-fetched from the website every time
    image_list = get_image_list_from_cache(dir_name)
    if not image_list:
        image_list = get_image_list(url)
        if not image_list:
            return
        save_image_list_to_cache(dir_name, image_list)
    # -1 for the image_list.pkl file in the same directory
    image_count_existing = len(os.listdir(dir_name)) - 1
    if len(image_list) == image_count_existing:
        print u"%s seems already finished, skipping this chapter." % dir_name
        return
    number_from = 0
    if image_count_existing != 0:
        # Re-fetch the last existing image in case it was only partially downloaded
        number_from = image_count_existing - 1
    print "Beginning to download %s ..." % dir_name
    print "Total images to download: %s" % (len(image_list) - number_from)
    host = "http://54.manmankan.com"
    bad_images = 0
    count_downloaded = number_from + 1
    first_image_is_invalid = False
    time_list = []
    for image_src in image_list[number_from:]:
        num = "%03d" % count_downloaded  # Zero-padded file name, e.g. 001.jpg
        image_ext = image_src.split(".")[-1]
        local_file_name = "%s/%s.%s" % (dir_name, num, image_ext)
        url = host + image_src
        t = time.time()
        urllib.urlretrieve(url, local_file_name)
        time_list.append(time.time() - t)
        time.sleep(2)  # Sleep 2 seconds to download gently

        # A file smaller than 10K probably means a bad image. Valid images
        # can also be very small, so small files are treated as errors only
        # when the first image of the chapter is itself invalid.
        if os.path.getsize(local_file_name) < 10000:
            if count_downloaded == 1:  # The counter starts at 1, not 0
                first_image_is_invalid = True
            if first_image_is_invalid:
                print "Invalid image (too small in size) found:", image_src
                print "Saved to %s\r\n" % local_file_name
                bad_images += 1
            if bad_images >= 3 and first_image_is_invalid:
                error_info = u"Error images found while fetching %s. Skipped this chapter.\r\n" % dir_name
                print error_info
                f = open("%s/errors.log" % dir_name, "w")
                f.write(error_info.encode('utf-8'))
                f.close()
                return
        count_downloaded += 1
        if count_downloaded % 10 == 1:
            print "Downloaded: %s, Speed: %.3fs per image" % (local_file_name, sum(time_list) / len(time_list))
            time_list = []

if __name__ == "__main__":
    get_chapter_list("http://www.manmankan.com/html/13/")