# registerbot.py
'''
This file contains the API class that implements scraping of product
details from the www.ebcwebstore.com site.
Author: Bharawaj Desikan
Created: 20-07-2013
'''
from PIL import ImageFont
from PIL import Image
from PIL import ImageDraw
from bs4 import BeautifulSoup
from traceback import format_exc
import mechanize
import cookielib
import logging
import PIL
import re
import email
import StringIO
import time
import os
import bs4
import csv
import sys
# Logger setup for this module.
# Create a logger named 'RegisterBot'.
logger = logging.getLogger('RegisterBot')
# Configuration constants.
# TODO: Later move the configuration params to config.py.
COUNT = 200
# CONFIG_FLAG selects how captchas are handled in getPageSource:
#   'Manual'  -- the operator solves the captcha in a real browser, then
#                confirms at the prompt.
#   'Partial' -- the captcha image is saved to captcha.jpg and the operator
#                types its value at the prompt.
CONFIG_FLAG = 'Partial'


class CountLimitExceeded(Exception):
    '''Raised once COUNT products have been scraped, to stop the bot.'''
    pass

class RegisterBot(object):
    '''RegisterBot class.'''

    def __init__(self):
        '''Initialize the mechanize browser and cookielib cookie jar used
        by the bot.'''
        self.url_list = []
        self.subCategories = {}
        self.subjectCategories = {}
        self.prodList = {}
        self.count = 0
        self.response = None
        #self.url_list = fetchConfigUrl()
        self.url = '---not disclosed for confidentiality---'
        # Mechanize browser object.
        self.browser = mechanize.Browser()
        # Cookie jar.
        self.cookiejar = cookielib.LWPCookieJar()
        self.browser.set_cookiejar(self.cookiejar)
        # Browser options.
        self.browser.set_handle_equiv(True)
        self.browser.set_handle_gzip(True)
        self.browser.set_handle_redirect(True)
        self.browser.set_handle_referer(True)
        self.browser.set_handle_robots(False)
        # Follows refresh 0 but does not hang on refresh > 0.
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                                        max_time=1)
        self.browser.addheaders = [('User-agent',
                                    'Mozilla/5.0 (X11; U; Linux i686;'
                                    ' en-US; rv:1.9.0.1) Gecko/'
                                    '2008071615 Fedora/3.0.1-1.fc9'
                                    ' Firefox/3.0.1')]
        # Output CSV; 'wb' mode because the Python 2 csv module expects a
        # binary file.
        self.ofile = open('scrpout.csv', 'wb')
        self.writer = csv.writer(self.ofile, delimiter='\t', quotechar='"',
                                 quoting=csv.QUOTE_ALL, dialect='excel')
        logger.info('creating an instance of RegisterBot object')

    def getPageSource(self, url='http://www.ebcwebstore.com/'):
        '''Open url in the mechanize browser and return its raw HTML,
        prompting the operator to solve a captcha when one appears.'''
        os.environ['http_proxy'] = ''
        self.response = self.browser.open(url)
        time.sleep(4)
        raw_html = self.browser.response().read()
        soup = BeautifulSoup(raw_html)
        if 'To continue, please type the characters below' in soup.get_text():
            if CONFIG_FLAG == 'Manual':
                print ('To proceed, please do the following:\n'
                       ' 1. Go to URL %s in your browser.\n'
                       ' 2. Fix the captcha manually.\n'
                       ' 3. Come back to this prompt and type YES.' % url)
                inp_sec = raw_input('Enter YES/NO: ')
                if inp_sec == 'YES':
                    os.environ['http_proxy'] = ''
                    self.response = self.browser.open(url)
                    time.sleep(4)
                    raw_html = self.browser.response().read()
                    return raw_html
                else:
                    logger.error('Error in manual fixing of captcha..')
                    return None
            elif CONFIG_FLAG == 'Partial':
                try:
                    # Save the captcha image locally so the operator can
                    # read it, then submit its value through the captcha
                    # form.
                    img = soup.find('img', src=re.compile('php_captcha.php'))
                    image_response = self.browser.open_novisit(img['src'])
                    output = open('captcha.jpg', 'wb')
                    output.write(image_response.read())
                    output.close()
                    # The captcha form is assumed to be the second form on
                    # the page.
                    self.browser.form = list(self.browser.forms())[1]
                    captcha_field = self.browser.form.find_control('number')
                    print ('To proceed, please do the following:\n'
                           ' 1. Open the file captcha.jpg in your source'
                           ' folder.\n'
                           ' 2. Enter its value at the prompt below.')
                    captcha_value = raw_input(
                        'Enter the value you can see in the image file: ')
                    captcha_field.value = captcha_value
                    self.request = self.browser.form.click()
                    self.browser.open(self.request)
                    raw_html = self.browser.response().read()
                except mechanize.ControlNotFoundError:
                    pass

        return raw_html
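
    # Shape of the captcha page assumed by the 'Partial' branch above,
    # reconstructed from the selectors in this file (an assumption, not
    # verified against the live site):
    #     <img src="...php_captcha.php...">        <- saved to captcha.jpg
    #     <form>...</form>                         <- forms()[0]
    #     <form><input name="number">...</form>    <- forms()[1], captcha form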

    def startBot(self):
        logger.info('-----------------------------Starting bot scraping'
                    '---------------------------------')
        self.fetchSubjectList(self.getPageSource())
        #print self.subjectCategories
        for item in self.subjectCategories.keys():
            try:
                self.fetchMainCategory(
                    self.getPageSource(self.subjectCategories[item]))
            except AttributeError:
                # No sub-category table on the page: treat the subject page
                # as a product listing instead.
                self.fetchProductList(
                    self.getPageSource(self.subjectCategories[item]))
            except CountLimitExceeded:
                return
            except Exception as e:
                logger.info('Issue in top Url : %s,%s',
                            self.subjectCategories[item], str(type(e)))
        logger.info('---------------------------------Ending scraping'
                    '---------------------------------')

    def findFormNumber(self, form_id=''):
        '''Returns the form index on the page, for use with select_form on
        the mechanize browser; -1 if no form has the given id.'''
        # self.response has already been read by getPageSource; mechanize
        # responses are seekable, so rewind before parsing again.
        self.response.seek(0)
        soup = BeautifulSoup(self.response.read())
        formcount = 0
        stat = False
        form_list = soup.find_all('form')
        for frm in form_list:
            # get() avoids a KeyError on forms with no id attribute.
            if str(frm.get('id')) == form_id:
                stat = True
                break
            formcount = formcount + 1
        if not stat:
            formcount = -1
        return formcount
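
    # Illustrative pairing with mechanize.select_form (the form id
    # 'create_account' is a hypothetical example, not taken from the site;
    # findFormNumberByClass below works the same way for a class name):
    #     idx = bot.findFormNumber('create_account')
    #     if idx >= 0:
    #         bot.browser.select_form(nr=idx)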

    def findFormNumberByClass(self, form_class=''):
        '''Returns the form index on the page, for use with select_form on
        the mechanize browser; -1 if no form has the given class.'''
        # Rewind the already-consumed response before parsing again.
        self.response.seek(0)
        soup = BeautifulSoup(self.response.read())
        formcount = 0
        stat = False
        form_list = soup.find_all('form')
        for frm in form_list:
            # get() avoids a KeyError on forms with no class attribute.
            if frm.get('class') and str(frm['class'][0]) == form_class:
                stat = True
                break
            formcount = formcount + 1
        if not stat:
            formcount = -1
        return formcount

    def fetchSubjectList(self, raw_html):
        '''Collects the subject-category links from the home page.'''
        soup = BeautifulSoup(raw_html)
        categories_table_obj = soup.find('table', attrs={'id': 'my_table'})
        if categories_table_obj:
            logger.info('The category table is available')
            #logger.debug('The table Object: %s',categories_table_obj)
        else:
            logger.error('The category table is not available')
            return
        subject_category = categories_table_obj.find_all(
            'span', attrs={'class': re.compile('\\bcat\\b')})

        for item in subject_category:
            try:
                local_key = item.font.contents[0]
                local_value = item.parent['href']
                self.subjectCategories[local_key] = local_value
            except IndexError:
                logger.info('Blank element with class cat')
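
    # Markup shape assumed by fetchSubjectList, reconstructed from its
    # selectors (an assumption, not verified against the live site):
    #     <table id="my_table">
    #       <a href="/subject-url">
    #         <span class="cat"><font>Subject name</font></span>
    #       </a>
    #     </table>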

    def fetchMainCategory(self, raw_html):
        '''Collects the sub-category links from a subject page, then scrapes
        each one.'''
        soup = BeautifulSoup(raw_html)
        # If there is no sub-category table, find() returns None and the
        # find_all below raises AttributeError, which startBot uses as the
        # signal to treat the page as a product listing.
        sub_categories_table_obj = soup.find(
            'table', attrs={'class': re.compile('\\binfoBoxContents\\b')})
        sub_category = sub_categories_table_obj.find_all(
            'td', attrs={'class': re.compile('\\bsmallText\\b')})
        for item in sub_category:
            try:
                local_key = item.a.img['title']
                local_value = item.a['href']
                self.subCategories[local_key] = local_value
            except IndexError:
                logger.info('Blank element with class smallText')
        for item in self.subCategories.keys():
            try:
                self.fetchProductList(
                    self.getPageSource(self.subCategories[item]))
            except CountLimitExceeded:
                # Let the stop signal propagate up to startBot.
                raise
            except Exception as e:
                logger.info('Issue in sub category Url : %s,%s',
                            self.subCategories[item], str(type(e)))

    def fetchProductList(self, raw_html):
        '''Collects the product links from a listing page, then scrapes each
        product page.'''
        soup = BeautifulSoup(raw_html)
        #print self.subCategories
        prod_list_obj = soup.find(
            'table', attrs={'class': re.compile('\\bproductListing\\b')})
        prod_in_category = prod_list_obj.find_all(
            'td', attrs={'class': re.compile('\\bproductListing-data\\b'),
                         'align': None})
        for item in prod_in_category:
            try:
                name = item.a.contents[0]
                page_Url = item.a['href']
                self.prodList[name] = page_Url
            except IndexError:
                logger.info('Blank element with class productListing-data')
        for item in self.prodList.keys():
            try:
                self.scrapDataFromPage(self.getPageSource(self.prodList[item]))
            except CountLimitExceeded:
                # Let the stop signal propagate up to startBot.
                raise
            except Exception as e:
                logger.info('Issue in Data Page : %s,%s',
                            self.prodList[item], str(type(e)))
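
    # Markup shape assumed by fetchProductList, reconstructed from its
    # selectors (an assumption, not verified against the live site):
    #     <table class="productListing">
    #       <td class="productListing-data">
    #         <a href="/product-url">Product name</a>
    #       </td>
    #     </table>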

    def scrapDataFromPage(self, raw_html):
        '''Scrapes one product page and appends a row to the output CSV.'''
        soup = BeautifulSoup(raw_html)
        name = ''
        author = ''
        price = ''
        product_details = ''
        book_details = ''
        try:
            name_obj = soup.find(
                'td', attrs={'class': re.compile('\\bconfparaboldblk2\\b')})
            name = name_obj.contents[0].encode('utf-8')
            try:
                author_obj = soup.find(
                    'td', attrs={'class': re.compile('\\bauther11\\b')})
                author = author_obj.a.span.contents[0].encode('utf-8')
            except AttributeError:
                logger.info('No author field available..')
            try:
                price_obj = soup.find(
                    'td', attrs={'class': re.compile('\\brsp11\\b')})
                price = price_obj.get_text(strip=True)
                price = price.encode('utf-8')
            except AttributeError:
                logger.info('No price field available..')
            other_obj = soup.find_all(
                'td', attrs={'class': re.compile('\\bproseptrtop\\b')})
            for item in other_obj:
                heading_obj = item.find(
                    'span',
                    attrs={'class': re.compile('\\bsectionheading\\b')})
                if heading_obj and ('Product Details' in heading_obj.contents[0]):
                    product_details = item.parent.parent.get_text(strip=True)
                    product_details = product_details.encode('utf-8')
                elif heading_obj and ('About the Book' in heading_obj.contents[0]):
                    book_details = item.parent.parent.get_text(strip=True)
                    book_details = book_details.encode('utf-8')
            row = (name, author, price, product_details, book_details)
            self.writer.writerow(row)
            # Flush so the row reaches disk; do not close the file mid-run.
            self.ofile.flush()
            self.count += 1
        except UnicodeEncodeError:
            logger.info('Unicode encode error ...')
        if self.count >= COUNT:
            self.count = 0
            # Stop after COUNT products; startBot catches this and returns.
            raise CountLimitExceeded()
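
# A minimal entry-point sketch (an assumption; the original listing does not
# show one). It wires the module logger to stderr and runs the bot end to
# end, closing the output CSV when the run finishes or fails.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    bot = RegisterBot()
    try:
        bot.startBot()
    finally:
        bot.ofile.close()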