
/python/get_manmankan_images.py

https://github.com/mitnk/stuff
##################################################
## Get Images from manmankan
## Author: whgking@gmail.com
##################################################
import os
import os.path
import pickle
import re
import urllib
import urllib2
import time

from BeautifulSoup import BeautifulSoup
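
# NOTE: this script targets Python 2 (urllib/urllib2, print statements)
# and BeautifulSoup 3 (`from BeautifulSoup import BeautifulSoup`); it
# will not run unchanged under Python 3 / bs4.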
def save_image_list_to_cache(dir_name, image_list):
    if not image_list or not dir_name:
        return
    output = open('%s/image_list.pkl' % dir_name, 'wb')
    pickle.dump(image_list, output)
    output.close()

def get_image_list_from_cache(dir_name):
    try:
        pkl_file = open('%s/image_list.pkl' % dir_name, 'rb')
    except IOError:
        return None
    image_list = pickle.load(pkl_file)
    pkl_file.close()
    return image_list
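
# Illustrative cache round trip (hypothetical data, not from the site):
#   save_image_list_to_cache("ch01", ["/comics/a.jpg", "/comics/b.jpg"])
#   get_image_list_from_cache("ch01")   # -> ["/comics/a.jpg", "/comics/b.jpg"]
# get_image_list_from_cache() returns None when no image_list.pkl exists
# yet; download_all_images() below uses that to decide whether to fetch
# the list from the website.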
def get_chapter_list(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page, fromEncoding="gb18030")
    print u"Reading information of %s ..." % soup.findAll("h1")[0].string
    tag = soup.find("table", {"class": "list_bg_table"})
    if not tag:
        print "Chapter table not found."
        return  # bail out instead of crashing on tag.findAll() below
    chapter_list = tag.findAll("td")
    manman_list = []
    for chapter in chapter_list:
        a_tag = chapter.find('a')
        if not a_tag:  # blank td tag
            continue
        chapter_name = a_tag.string
        url = a_tag["href"]
        manman_list.append([chapter_name.strip(), url])
    manman_list.reverse()
    for chapter_name, url in manman_list:
        download_all_images(url, chapter_name)
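
# manman_list ends up as [chapter name, chapter URL] pairs, e.g.
# (hypothetical values): [[u"Chapter 1", "/html/13/1.html"], ...].
# The reverse() presumably puts chapters into reading order, since the
# scraped table would otherwise be walked in page order.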
def get_image_list(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    javascripts = soup.findAll(text=lambda text: text.parent.name == "script")
    image_script = ""
    for js in javascripts:
        if "new Array" in js:
            image_script = js
            break
    if not image_script:
        print "Javascript of image src not found."
        return None
    result = re.search(r'new Array\(([^;]+)\);', image_script)
    if not result:
        print "Image SRC not found."
        return None
    return result.group(1).replace('"', '').split(',')
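
# Illustrative input (hypothetical markup): given a <script> body such as
#   var pic = new Array("/images/001.jpg","/images/002.jpg");
# the regex captures '"/images/001.jpg","/images/002.jpg"', and the
# replace('"', '') / split(',') pair yields
#   ['/images/001.jpg', '/images/002.jpg']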
def download_all_images(url, dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print u"Created directory: %s" % dir_name
    # Cache image_list on disk, so there is no need to fetch it from the
    # website every time
    image_list = get_image_list_from_cache(dir_name)
    if not image_list:
        image_list = get_image_list(url)
        if not image_list:
            return  # the image list could not be extracted for this chapter
        save_image_list_to_cache(dir_name, image_list)
    # -1 for the image_list.pkl file in the same directory
    image_count_existing = len(os.listdir(dir_name)) - 1
    if len(image_list) == image_count_existing:
        print u"%s seems already finished, skipping this chapter." % dir_name
        return
    number_from = 0
    if image_count_existing != 0:
        # Re-fetch starting from the last image, in case it was not fully
        # downloaded
        number_from = image_count_existing - 1
    print "Beginning to download %s ..." % dir_name
    print "Total images to download: %s" % (len(image_list) - number_from)
    host = "http://54.manmankan.com"
    bad_images = 0
    count_downloaded = number_from + 1
    first_image_is_invalid = False
    time_list = []
    for image_src in image_list[number_from:]:
        num = "%03d" % count_downloaded  # zero-pad the file name, e.g. 7 -> "007"
        image_ext = image_src.split(".")[-1]
        local_file_name = "%s/%s.%s" % (dir_name, num, image_ext)
        url = host + image_src
        t = time.time()
        urllib.urlretrieve(url, local_file_name)
        time_list.append(time.time() - t)
        time.sleep(2)  # sleep 2 seconds to download gently
        # An image smaller than 10K usually means a bad image, but some
        # valid images are also very small, so small files only count as
        # errors when the first image fetched is invalid too.
        if os.path.getsize(local_file_name) < 10000:
            if count_downloaded == number_from + 1:  # the first image fetched
                first_image_is_invalid = True
            if first_image_is_invalid:
                print "Invalid image (too small in size) found:", image_src
                print "Saved to %s\r\n" % local_file_name
                bad_images += 1
            if bad_images >= 3 and first_image_is_invalid:
                error_info = u"Error images found while fetching %s. Skipped this chapter.\r\n" % dir_name
                print error_info
                f = open("%s/errors.log" % dir_name, "w")
                f.write(error_info.encode('utf-8'))
                f.close()
                return
        count_downloaded += 1
        if count_downloaded % 10 == 1:
            print "Downloaded: %s, Speed: %.3fs per image" % (local_file_name, sum(time_list) / len(time_list))
            time_list = []
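
# Resume arithmetic, as a worked example: with 12 entries already in
# dir_name (11 images plus image_list.pkl), image_count_existing is 11
# and number_from is 10, so the loop restarts at image "011" and the
# possibly-truncated last file is fetched again.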
if __name__ == "__main__":
    get_chapter_list("http://www.manmankan.com/html/13/")
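
# Usage: run the script directly; it walks every chapter linked from the
# hard-coded index page and saves images into one directory per chapter,
# created under the current working directory:
#   $ python get_manmankan_images.py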