PageRenderTime 951ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/google_image_scraper.py

https://gitlab.com/Quantza/DeepClassificationBot
Python | 245 lines | 237 code | 4 blank | 4 comment | 2 complexity | eb35e5d72d5194703c03c9eec87efe46 MD5 | raw file
  1. '''
  2. Google Image Scraper found at https://github.com/shuvronewscred/google-search-image-downloader,
  3. which we adapted for our project. Special thanks to shuvronewscred for his project.
  4. '''
  5. from __future__ import absolute_import
  6. from __future__ import print_function
  7. import os
  8. import time
  9. import re
  10. import socket
  11. from selenium import webdriver
  12. from pattern.web import URL, DOM
  13. class GoogleImageExtractor(object):
  14. def __init__(self, search_key=''):
  15. """ Google image search class
  16. Args:
  17. search_key to be entered.
  18. """
  19. if type(search_key) == str:
  20. # convert to list even for one search keyword to standalize the pulling.
  21. self.g_search_key_list = [search_key]
  22. elif type(search_key) == list:
  23. self.g_search_key_list = search_key
  24. else:
  25. print('google_search_keyword not of type str or list')
  26. raise
  27. self.g_search_key = ''
  28. # user options
  29. self.image_dl_per_search = 200
  30. # url construct string text
  31. self.prefix_of_search_url = "https://www.google.com.sg/search?q="
  32. self.postfix_of_search_url = '&source=lnms&tbm=isch&sa=X&ei=0eZEVbj3IJG5uATalICQAQ&ved=0CAcQ_AUoAQ&biw=939&bih=591' # non changable text
  33. self.target_url_str = ''
  34. # storage
  35. self.pic_url_list = []
  36. self.pic_info_list = []
  37. # file and folder path
  38. self.folder_main_dir_prefix = 'downloaded_images/'
  39. def reformat_search_for_spaces(self):
  40. """
  41. Method call immediately at the initialization stages
  42. get rid of the spaces and replace by the "+"
  43. Use in search term. Eg: "Cookie fast" to "Cookie+fast"
  44. steps:
  45. strip any lagging spaces if present
  46. replace the self.g_search_key
  47. """
  48. self.g_search_key = self.g_search_key.rstrip().replace(' ', '+')
  49. def set_num_image_to_dl(self, num_image):
  50. """ Set the number of image to download. Set to self.image_dl_per_search.
  51. Args:
  52. num_image (int): num of image to download.
  53. """
  54. self.image_dl_per_search = num_image
  55. def get_searchlist_fr_file(self, filename):
  56. """Get search list from filename. Ability to add in a lot of phrases.
  57. Will replace the self.g_search_key_list
  58. Args:
  59. filename (str): full file path
  60. """
  61. with open(filename, 'r') as f:
  62. self.g_search_key_list = f.readlines()
  63. def set_searchlist(self, search_list):
  64. """Get search list from filename. Ability to add in a lot of phrases.
  65. Will replace the self.g_search_key_list
  66. Args:
  67. filename (str): full file path
  68. """
  69. self.g_search_key_list = search_list
  70. def formed_search_url(self):
  71. ''' Form the url either one selected key phrases or multiple search items.
  72. Get the url from the self.g_search_key_list
  73. Set to self.sp_search_url_list
  74. '''
  75. self.reformat_search_for_spaces()
  76. self.target_url_str = self.prefix_of_search_url + self.g_search_key + \
  77. self.postfix_of_search_url
  78. def retrieve_source_fr_html(self, driver):
  79. """ Make use of selenium. Retrieve from html table using pandas table.
  80. """
  81. try:
  82. driver.get(self.target_url_str)
  83. except:
  84. print("Connection refused")
  85. # wait for log in then get the page source.
  86. try:
  87. driver.execute_script("window.scrollTo(0, 30000)")
  88. time.sleep(2)
  89. self.temp_page_source = driver.page_source
  90. # driver.find_element_by_css_selector('ksb _kvc').click() # cant find the class
  91. driver.find_element_by_id('smb').click() # ok
  92. time.sleep(2)
  93. driver.execute_script("window.scrollTo(0, 60000)")
  94. time.sleep(2)
  95. driver.execute_script("window.scrollTo(0, 60000)")
  96. except:
  97. print('not able to find')
  98. # driver.quit()
  99. try:
  100. self.page_source = driver.page_source
  101. except socket.error:
  102. print("Socket broke")
  103. def extract_pic_url(self):
  104. """ extract all the raw pic url in list
  105. """
  106. dom = DOM(self.page_source)
  107. tag_list = dom('a.rg_l')
  108. for tag in tag_list[:self.image_dl_per_search]:
  109. tar_str = re.search('imgurl=(.*)&imgrefurl', tag.attributes['href'])
  110. try:
  111. self.pic_url_list.append(tar_str.group(1))
  112. except:
  113. print('error parsing', tag)
  114. def multi_search_download(self):
  115. """ Mutli search download"""
  116. driver = webdriver.Firefox()
  117. for indiv_search in self.g_search_key_list:
  118. self.pic_url_list = []
  119. self.pic_info_list = []
  120. self.g_search_key = indiv_search
  121. self.formed_search_url()
  122. self.retrieve_source_fr_html(driver)
  123. self.extract_pic_url()
  124. self.downloading_all_photos() # some download might not be jpg?? use selnium to download??
  125. self.save_infolist_to_file()
  126. driver.close()
  127. def downloading_all_photos(self):
  128. """ download all photos to particular folder
  129. """
  130. self.create_folder()
  131. pic_counter = 1
  132. for url_link in self.pic_url_list:
  133. print(pic_counter)
  134. pic_prefix_str = self.g_search_key + "/" + self.g_search_key + str(pic_counter)
  135. self.download_single_image(url_link.encode(), pic_prefix_str)
  136. pic_counter = pic_counter + 1
  137. def download_single_image(self, url_link, pic_prefix_str):
  138. """ Download data according to the url link given.
  139. Args:
  140. url_link (str): url str.
  141. pic_prefix_str (str): pic_prefix_str for unique label the pic
  142. """
  143. self.download_fault = 0
  144. file_ext = os.path.splitext(url_link)[1] # use for checking valid pic ext
  145. temp_filename = pic_prefix_str + file_ext
  146. temp_filename_full_path = os.path.join(self.gs_raw_dirpath, temp_filename)
  147. temp_filename_full_path = temp_filename_full_path.replace("+", " ")
  148. folder_name = temp_filename_full_path.split("/")
  149. if not os.path.exists(temp_filename_full_path.replace(folder_name[-1], "")):
  150. os.makedirs(temp_filename_full_path.replace(folder_name[-1], ""))
  151. valid_image_ext_list = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'] # not comprehensive
  152. url = URL(url_link.replace("%2F", "/").replace("%3A", ":"))
  153. try:
  154. if url.redirect:
  155. return # if there is re-direct, return
  156. if file_ext not in valid_image_ext_list:
  157. return # return if not valid image extension
  158. f = open(temp_filename_full_path, 'wb') # save as test.gif
  159. print(url_link)
  160. self.pic_info_list.append(pic_prefix_str + ': ' + url_link)
  161. image = url.download()
  162. # import matplotlib.pyplot as p
  163. # p.imshow(image)
  164. # p.show(image)
  165. f.write(image) # if have problem skip
  166. # if self.__print_download_fault:
  167. print('Problem with processing this data: ', url_link)
  168. self.download_fault = 1
  169. f.close()
  170. except:
  171. pass
  172. def create_folder(self):
  173. """
  174. Create a folder to put the log data segregate by date
  175. """
  176. self.gs_raw_dirpath = os.path.join(self.folder_main_dir_prefix)
  177. if not os.path.exists(self.gs_raw_dirpath):
  178. os.makedirs(self.gs_raw_dirpath)
  179. def save_infolist_to_file(self):
  180. """ Save the info list to file.
  181. """
  182. pass
  183. # temp_filename_full_path = os.path.join(self.gs_raw_dirpath, self.g_search_key + '_info.txt')
  184. # with open(temp_filename_full_path, 'w') as f:
  185. # for n in self.pic_info_list:
  186. # f.write(n)
  187. # f.write('\n')
  188. if __name__ == '__main__':
  189. import argparse
  190. from backports import csv
  191. import codecs
  192. parser = argparse.ArgumentParser()
  193. parser.add_argument('csv', nargs='?', type=argparse.FileType('rb'))
  194. parser.add_argument('-n', type=int, default=350)
  195. parser.add_argument('--dry-run', action='store_true', default=False)
  196. args = parser.parse_args()
  197. csv_input = codecs.getreader('utf8')(args.csv)
  198. queries = [' '.join(row.values()) for row in csv.DictReader(csv_input)]
  199. w = GoogleImageExtractor('') # leave blanks if get the search list from file
  200. w.set_num_image_to_dl(args.n)
  201. w.set_searchlist(queries) # replace the searclist
  202. if not args.dry_run:
  203. w.multi_search_download()