PageRenderTime 73ms CodeModel.GetById 28ms RepoModel.GetById 1ms app.codeStats 0ms

/registerbot.py

https://gitlab.com/Desikan/botebc
Python | 296 lines | 257 code | 20 blank | 19 comment | 4 complexity | efca5608f40f8ba7c2fc000db2dbca34 MD5 | raw file
  1. '''
  2. This file contains the API class that implements the scraping
  3. of product details from the www.ebcwebstore.com site.
  4. Author: Bharawaj Desikan
  5. Created: 20-07-2013
  6. '''
  7. from PIL import ImageFont
  8. from PIL import Image
  9. from PIL import ImageDraw
  10. from bs4 import BeautifulSoup
  11. from traceback import format_exc
  12. import mechanize
  13. import cookielib
  14. import logging
  15. import PIL
  16. import re
  17. import email
  18. import StringIO
  19. import time
  20. import os
  21. import bs4
  22. import csv
  23. import sys
# Below is the code for creating the logger functionality for this module.
# Create a logger named 'RegisterBot'.
logger = logging.getLogger('RegisterBot')
# Constants for configuring the bot are added here.
# TODO: Later move the Configuration Params to Config.py
# COUNT is the threshold that scrapDataFromPage compares self.count against.
COUNT = 200
# CONFIG_FLAG selects how getPageSource reacts to a captcha page:
# 'Manual' asks the user to solve it in a browser, 'Partial' saves the
# captcha image to captcha.jpg and prompts for its value.
CONFIG_FLAG = 'Partial'
  31. class CountLimitExceeded(Exception):
  32. pass
  33. class RegisterBot(object):
  34. '''RegisterBot class.'''
  35. def __init__(self):
  36. '''Initialize the mechanize and cookielib modules that has to be used
  37. by the bot.'''
  38. self.url_list = []
  39. self.subCategories = {}
  40. self.subjectCategories = {}
  41. self.prodList = {}
  42. self.count = 0
  43. self.response = None
  44. #self.url_list = fetchConfigUrl()
  45. self.url = '---not disclosed for confidentiality---'
  46. #Mechanize Browser object
  47. self.browser = mechanize.Browser()
  48. # Cookie Jar
  49. self.cookiejar = cookielib.LWPCookieJar()
  50. self.browser.set_cookiejar(self.cookiejar)
  51. # Browser options
  52. self.browser.set_handle_equiv(True)
  53. self.browser.set_handle_gzip(True)
  54. self.browser.set_handle_redirect(True)
  55. self.browser.set_handle_referer(True)
  56. self.browser.set_handle_robots(False)
  57. # Follows refresh 0 but not hangs on refresh > 0
  58. self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
  59. max_time=1)
  60. self.browser.addheaders = [('User-agent',
  61. 'Mozilla/5.0 (X11; U; Linux i686;'
  62. ' en-US; rv:1.9.0.1) Gecko/'
  63. '2008071615 Fedora/3.0.1-1.fc9'
  64. ' Firefox/3.0.1')]
  65. self.ofile = open('scrpout.csv', "wb")
  66. self.writer = csv.writer(self.ofile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_ALL, dialect='excel')
  67. logger.info('creating an instance of RegisterBot object')
  68. def getPageSource(self, url='http://www.ebcwebstore.com/'):
  69. os.environ['http_proxy']=''
  70. self.response = self.browser.open(url)
  71. time.sleep(4)
  72. raw_html = self.browser.response().read()
  73. soup = BeautifulSoup(raw_html)
  74. if 'To continue, please type the characters below' in soup.get_text():
  75. if CONFIG_FLAG == 'Manual':
  76. print 'To Procced Please do the Following: 1.Goto URL %s in your Browser \n 2.Then Fix Captcha Manually \n 3. Come to this Prompt again and type YES.',url
  77. inp_sec = raw_input('Enter YES/NO :')
  78. if inp_sec == 'YES':
  79. os.environ['http_proxy']=''
  80. self.response = self.browser.open(url)
  81. time.sleep(4)
  82. raw_html = self.browser.response().read()
  83. return
  84. else:
  85. logger.error('Error in manual fixing of Captcha..')
  86. return
  87. elif CONFIG_FLAG == 'Partial':
  88. try:
  89. img = soup.find('img', src=re.compile('php_captcha.php'))
  90. image_response = self.browser.open_novisit(img['src'])
  91. output = open("captcha.jpg","wb")
  92. output.write(image_response.read())
  93. output.close()
  94. self.browser.form = list(self.browser.forms())[1]
  95. captcha_field = self.browser.form.find_control('number')
  96. print 'To Procced Please do the Following: \n 1.Goto file captcha.jpg in you source folder. \n 2.Enter this value in the prompt below.'
  97. captcha_value = raw_input('Enter the value you can see in the image file:')
  98. captcha_field.value = captcha_value
  99. self.request = self.browser.form.click()
  100. self.browser.open(self.request)
  101. raw_html = self.browser.response().read()
  102. except mechanize.ControlNotFoundError:
  103. pass
  104. return raw_html
  105. def startBot(self):
  106. logger.info('-----------------------------Starting Bot Scrapping---------------------------------')
  107. self.fetchSubjectList(self.getPageSource())
  108. #print self.subjectCategories
  109. for item in self.subjectCategories.keys():
  110. try:
  111. self.fetchMainCategory(self.getPageSource(self.subjectCategories[item]))
  112. except AttributeError:
  113. self.fetchProductList(self.getPageSource(self.subjectCategories[item]))
  114. except CountLimitExceeded:
  115. return
  116. except Exception as e:
  117. logger.info('Issue in top Url : %s,%s',self.subjectCategories[item], str(type(e)))
  118. logger.info('---------------------------------Ending Scrapping---------------------------------')
  119. def findFormNumber(self, form_id=''):
  120. '''Returns the form index on the page to be used by select_form on
  121. mechanize object.'''
  122. soup = BeautifulSoup(self.response)
  123. formcount=0
  124. stat=False
  125. form_list = soup.find_all("form")
  126. for frm in form_list:
  127. if str(frm["id"]) == form_id:
  128. stat=True
  129. break
  130. formcount=formcount+1
  131. if stat==False:
  132. formcount=-1
  133. return formcount
  134. def findFormNumberByClass(self, form_class=''):
  135. '''Returns the form index on the page to be used by select_form on
  136. mechanize object.'''
  137. soup = BeautifulSoup(self.response)
  138. formcount=0
  139. stat=False
  140. form_list = soup.find_all("form")
  141. for frm in form_list:
  142. if str(frm["class"][0]) == form_class:
  143. stat=True
  144. break
  145. formcount=formcount+1
  146. if stat==False:
  147. formcount=-1
  148. return formcount
  149. def fetchSubjectList(self, raw_html):
  150. ''' Displays all the links from the current browser page.'''
  151. soup = BeautifulSoup(raw_html)
  152. categories_table_obj = soup.find("table", attrs={"id":'my_table'})
  153. if categories_table_obj:
  154. logger.info('The Category Table available')
  155. #logger.debug('The table Object: %s',categories_table_obj)
  156. else:
  157. logger.error('The Category Table available')
  158. return
  159. subject_category = categories_table_obj.find_all("span", attrs={"class":re.compile('\\bcat\\b')})
  160. for item in subject_category:
  161. try:
  162. local_key = item.font.contents[0]
  163. local_value = item.parent["href"]
  164. self.subjectCategories[local_key] = local_value
  165. except IndexError:
  166. logger.info('Blank element with class cat')
  167. def fetchMainCategory(self, raw_html):
  168. ''' Displays all the links from the current browser page.'''
  169. soup = BeautifulSoup(raw_html)
  170. sub_categories_table_obj = soup.find("table", attrs={"class":re.compile('\\binfoBoxContents\\b')})
  171. sub_category = sub_categories_table_obj.find_all("td", attrs={"class":re.compile('\\bsmallText\\b')})
  172. for item in sub_category:
  173. try:
  174. local_key = item.a.img["title"]
  175. local_value = item.a["href"]
  176. self.subCategories[local_key] = local_value
  177. except IndexError:
  178. logger.info('Blank element with class cat')
  179. for item in self.subCategories.keys():
  180. try:
  181. self.fetchProductList(self.getPageSource(self.subCategories[item]))
  182. except Exception as e:
  183. logger.info('Issue in sub category Url : %s,%s',self.subCategories[item], str(type(e)))
  184. def fetchProductList(self, raw_html):
  185. soup = BeautifulSoup(raw_html)
  186. #print self.subCategories
  187. prod_list_obj = soup.find("table", attrs={"class":re.compile('\\bproductListing\\b')})
  188. prod_in_category = prod_list_obj.find_all("td", attrs={"class":re.compile('\\bproductListing-data\\b'),"align":None})
  189. for item in prod_in_category:
  190. try:
  191. name = item.a.contents[0]
  192. page_Url = item.a["href"]
  193. self.prodList[name] = page_Url
  194. except IndexError:
  195. logger.info('Blank element with class cat')
  196. for item in self.prodList.keys():
  197. try:
  198. self.scrapDataFromPage(self.getPageSource(self.prodList[item]))
  199. except Exception as e:
  200. logger.info('Issue in Data Page : %s,%s',self.prodList[item], str(type(e)))
  201. def scrapDataFromPage(self, raw_html):
  202. soup = BeautifulSoup(raw_html)
  203. name=''
  204. author=''
  205. price=''
  206. product_details=''
  207. book_details=''
  208. try:
  209. name_obj = soup.find("td", attrs={"class":re.compile('\\bconfparaboldblk2\\b')})
  210. name = name_obj.contents[0].encode('utf-8')
  211. try:
  212. author_obj = soup.find("td", attrs={"class":re.compile('\\bauther11\\b')})
  213. author = author_obj.a.span.contents[0].encode('utf-8')
  214. except AttributeError:
  215. logger.info('No Author field Available..')
  216. try:
  217. price_obj = soup.find("td", attrs={"class":re.compile('\\brsp11\\b')})
  218. price = price_obj.get_text(strip=True)
  219. price = price.encode('utf-8')
  220. except AttributeError:
  221. logger.info('No Price field Available..')
  222. other_obj = soup.find_all("td", attrs={"class":re.compile('\\bproseptrtop\\b')})
  223. for item in other_obj:
  224. heading_obj = item.find("span", attrs={"class":re.compile('\\bsectionheading\\b')})
  225. if heading_obj and ('Product Details' in heading_obj.contents[0]):
  226. product_details = item.parent.parent.get_text(strip=True)
  227. product_details = product_details.encode('utf-8')
  228. elif heading_obj and ('About the Book' in heading_obj.contents[0]):
  229. book_details = item.parent.parent.get_text(strip=True)
  230. book_details = book_details.encode('utf-8')
  231. row = (name, author, price, product_details, book_details)
  232. self.writer.writerow(row)
  233. self.ofile.close
  234. self.count += 1
  235. except UnicodeEncodeError:
  236. logger.info('Unicode encode error ...')
  237. if (self.count >= COUNT):
  238. self.count=0