registerbot.py | searchcode

/registerbot.py

https://gitlab.com/Desikan/botebc
Python | 296 lines | 257 code | 20 blank | 19 comment | 4 complexity | efca5608f40f8ba7c2fc000db2dbca34 MD5 | raw file

'''
	This file contains the API class that implements the scraping
	of product details from the www.ebcwebstore.com site.

	Author: Bharawaj Desikan
	Created: 20-07-2013
'''

from PIL import ImageFont
from PIL import Image
from PIL import ImageDraw
from bs4 import BeautifulSoup 


from traceback import format_exc

import mechanize
import cookielib
import logging 
import PIL
import re 
import email
import StringIO
import time
import os
import bs4
import csv
import sys 

# Module-level logger used by everything in this file.
# It is obtained under the name 'RegisterBot'; no handlers or levels are set here.
logger = logging.getLogger('RegisterBot')

# Configuration constants for the bot are defined here.
# TODO: Later move the Configuration Params to Config.py

# Threshold that scrapDataFromPage compares self.count against.
COUNT = 200
# Captcha handling mode read by getPageSource; it checks for 'Manual' and 'Partial'.
CONFIG_FLAG = 'Partial'

class CountLimitExceeded(Exception):
    '''Stop signal for the crawl; RegisterBot.startBot catches it and returns.'''


class RegisterBot(object):
	'''RegisterBot class.'''

	def __init__(self):
		'''Set up crawl state, the mechanize browser and the CSV output.

		The browser is given an LWP cookie jar, has robots handling switched
		off, uses mechanize's HTTPRefreshProcessor with max_time=1 and sends
		a Firefox User-agent header. Scraped rows are written through
		self.writer to scrpout.csv, which is opened in "wb" mode here.
		'''
		# Crawl bookkeeping.
		self.url_list = []
		self.subCategories = {}
		self.subjectCategories = {}
		self.prodList = {}
		self.count = 0
		self.response = None
		#self.url_list = fetchConfigUrl()
		self.url = '---not disclosed for confidentiality---'

		# Mechanize browser and its cookie jar.
		self.browser = mechanize.Browser()
		self.cookiejar = cookielib.LWPCookieJar()
		browser = self.browser
		browser.set_cookiejar(self.cookiejar)

		# Handlers that are switched on, applied in this order.
		enabled_handlers = (
			browser.set_handle_equiv,
			browser.set_handle_gzip,
			browser.set_handle_redirect,
			browser.set_handle_referer,
		)
		for enable_handler in enabled_handlers:
			enable_handler(True)
		browser.set_handle_robots(False)

		# Follows refresh 0 but does not hang on refresh > 0.
		refresh_processor = mechanize._http.HTTPRefreshProcessor()
		browser.set_handle_refresh(refresh_processor, max_time=1)

		user_agent = ('Mozilla/5.0 (X11; U; Linux i686;'
			' en-US; rv:1.9.0.1) Gecko/'
			'2008071615 Fedora/3.0.1-1.fc9'
			' Firefox/3.0.1')
		browser.addheaders = [('User-agent', user_agent)]

		# Tab-delimited, fully quoted CSV output.
		self.ofile = open('scrpout.csv', "wb")
		self.writer = csv.writer(
			self.ofile,
			delimiter='\t',
			quotechar='"',
			quoting=csv.QUOTE_ALL,
			dialect='excel')

		logger.info('creating an instance of RegisterBot object')


	def getPageSource(self, url='http://www.ebcwebstore.com/'):
		'''Open url with the bot's browser and return the raw HTML read.

		If the page text contains the captcha prompt, the module-level
		CONFIG_FLAG decides what happens next:

		'Manual'  - the user is asked to solve the captcha in a real browser
		            and to type YES; the url is then opened again and that
		            page's HTML is returned.
		'Partial' - the captcha image is saved to captcha.jpg, the user types
		            the value shown, and the second form on the page is
		            submitted with it; the HTML of the response is returned.

		For any other CONFIG_FLAG value the captcha page itself is returned.

		Returns the HTML of the last page read, or None when the user does
		not answer YES in 'Manual' mode.
		'''
		os.environ['http_proxy']=''
		self.response = self.browser.open(url)
		time.sleep(4)
		raw_html = self.browser.response().read()
		soup = BeautifulSoup(raw_html)

		# Normal page: nothing else to do.
		if 'To continue, please type the characters below' not in soup.get_text():
			return raw_html

		if CONFIG_FLAG == 'Manual':
			# The url is interpolated with %; the original passed it as a second
			# print argument, so the literal "%s" was shown to the user.
			print('To Procced Please do the Following: 1.Goto URL %s in your Browser \n 2.Then Fix Captcha Manually \n 3. Come to this Prompt again and type YES.' % url)
			inp_sec = raw_input('Enter YES/NO :')
			if inp_sec != 'YES':
				logger.error('Error in manual fixing of Captcha..')
				return None
			os.environ['http_proxy']=''
			self.response = self.browser.open(url)
			time.sleep(4)
			# Fall through so the re-fetched page is returned, not None.
			raw_html = self.browser.response().read()
		elif CONFIG_FLAG == 'Partial':
			try:
				img = soup.find('img', src=re.compile('php_captcha.php'))
				image_response = self.browser.open_novisit(img['src'])
				# The context manager closes the file even if the write fails.
				with open("captcha.jpg", "wb") as output:
					output.write(image_response.read())
				self.browser.form = list(self.browser.forms())[1]
				captcha_field = self.browser.form.find_control('number')
				print('To Procced Please do the Following: \n 1.Goto file captcha.jpg in you source folder. \n 2.Enter this value in the prompt below.')
				captcha_value = raw_input('Enter the value you can see in the image file:')
				captcha_field.value = captcha_value
				self.request = self.browser.form.click()
				self.browser.open(self.request)
				raw_html = self.browser.response().read()
			except mechanize.ControlNotFoundError:
				# Still best-effort (the captcha page is returned), but no longer silent.
				logger.error('Captcha input control not found on page: %s', url)

		return raw_html

	def startBot(self):
		'''Crawl every subject category listed on the start page.

		The start page is fetched and parsed by fetchSubjectList, which fills
		self.subjectCategories. Each subject URL is first handled as a page
		of sub-categories (fetchMainCategory); if that raises AttributeError
		the same URL is fetched again and handled as a product listing
		(fetchProductList).

		CountLimitExceeded from either path ends the crawl immediately. Any
		other exception is logged and the next subject is tried.
		'''
		logger.info('-----------------------------Starting Bot Scrapping---------------------------------')

		self.fetchSubjectList(self.getPageSource())

		for subject_url in self.subjectCategories.values():
			try:
				try:
					self.fetchMainCategory(self.getPageSource(subject_url))
				except AttributeError:
					# Fallback: treat the page as a product listing. It sits inside
					# the outer try so its errors are handled like the main path's
					# instead of escaping the loop and aborting the whole crawl.
					self.fetchProductList(self.getPageSource(subject_url))
			except CountLimitExceeded:
				return
			except Exception as e:
				logger.info('Issue in top Url : %s,%s', subject_url, str(type(e)))

		logger.info('---------------------------------Ending Scrapping---------------------------------')

	def findFormNumber(self, form_id=''):
		'''Return the index of the form whose id equals form_id.

		The document parsed is self.response. The index is zero-based, counted
		over all <form> tags in document order, and is meant for selecting a
		form on the mechanize browser.

		Forms that have no id attribute are skipped. Returns -1 when no form
		matches.
		'''
		soup = BeautifulSoup(self.response)
		for index, form_tag in enumerate(soup.find_all("form")):
			# .get() returns None for a form without an id, where form_tag["id"]
			# would raise KeyError and abort the search.
			tag_id = form_tag.get("id")
			if tag_id is not None and str(tag_id) == form_id:
				return index
		return -1

	def findFormNumberByClass(self, form_class=''):
		'''Return the index of the form whose first class equals form_class.

		The document parsed is self.response. The index is zero-based, counted
		over all <form> tags in document order, and is meant for selecting a
		form on the mechanize browser. Only the first entry of a form's class
		list is compared.

		Forms that have no class attribute are skipped. Returns -1 when no
		form matches.
		'''
		soup = BeautifulSoup(self.response)
		for index, form_tag in enumerate(soup.find_all("form")):
			# .get() returns None for a form without a class, where
			# form_tag["class"] would raise KeyError and abort the search.
			classes = form_tag.get("class")
			if classes and str(classes[0]) == form_class:
				return index
		return -1
	
	def fetchSubjectList(self, raw_html):
		'''Collect subject-category links from raw_html.

		Looks for the table with id 'my_table' and, inside it, every <span>
		whose class contains the word 'cat'. For each span the first child of
		its <font> tag is used as the key and the href of the span's parent
		as the value in self.subjectCategories.

		If the table is missing an error is logged and nothing is collected.
		Spans without the expected <font> content or parent href are logged
		and skipped.
		'''
		soup = BeautifulSoup(raw_html)
		categories_table_obj = soup.find("table", attrs={"id":'my_table'})

		if not categories_table_obj:
			# The original logged "available" here, which contradicted the branch.
			logger.error('The Category Table not available')
			return
		logger.info('The Category Table available')

		subject_category = categories_table_obj.find_all("span", attrs={"class":re.compile('\\bcat\\b')})

		for item in subject_category:
			try:
				local_key = item.font.contents[0]
				local_value = item.parent["href"]
			except (AttributeError, IndexError, KeyError):
				# No <font> child, empty <font>, or parent without href:
				# skip this span instead of aborting the whole listing.
				logger.info('Blank element with class cat')
			else:
				self.subjectCategories[local_key] = local_value

	def fetchMainCategory(self, raw_html):
		'''Collect the sub-category links in raw_html and crawl each one.

		Inside the table whose class contains 'infoBoxContents', every <td>
		whose class contains 'smallText' supplies one entry: the title of the
		image inside its link is the key and the link's href the value. The
		entries are recorded in self.subCategories, and each sub-category
		found on THIS page is then fetched and passed to fetchProductList.

		Raises AttributeError when the table is missing or a cell has no
		link; startBot uses that to retry the page as a product listing.
		CountLimitExceeded from the product crawl is re-raised so startBot
		can stop. Other errors for a sub-category are logged and the next
		one is tried.
		'''
		soup = BeautifulSoup(raw_html)

		sub_categories_table_obj = soup.find("table", attrs={"class":re.compile('\\binfoBoxContents\\b')})

		# AttributeError is deliberately not caught here or in the loop below:
		# startBot relies on it to fall back to fetchProductList.
		sub_category = sub_categories_table_obj.find_all("td", attrs={"class":re.compile('\\bsmallText\\b')})

		# Links found on this page only. self.subCategories keeps growing across
		# calls, so iterating it would re-crawl every earlier sub-category.
		page_categories = {}
		for item in sub_category:
			try:
				local_key = item.a.img["title"]
				local_value = item.a["href"]
			except (IndexError, KeyError, TypeError):
				# Link without an image, or a missing title/href: skip the cell.
				logger.info('Blank element with class cat')
			else:
				self.subCategories[local_key] = local_value
				page_categories[local_key] = local_value

		for category_url in page_categories.values():
			try:
				self.fetchProductList(self.getPageSource(category_url))
			except CountLimitExceeded:
				# Stop signal for startBot; must not be logged and swallowed below.
				raise
			except Exception as e:
				logger.info('Issue in sub category Url : %s,%s', category_url, str(type(e)))

	def fetchProductList(self, raw_html):
		'''Collect the product links in raw_html and scrape each product page.

		Inside the table whose class contains 'productListing', every <td>
		whose class contains 'productListing-data' and that has no align
		attribute supplies one entry: the first child of its link is the
		name and the link's href the page URL. The entries are recorded in
		self.prodList, and each product found on THIS page is then fetched
		and passed to scrapDataFromPage.

		Raises AttributeError when the product table is missing.
		CountLimitExceeded from scraping is re-raised so the caller can stop.
		Other errors for a product page are logged and the next one is tried.
		'''
		soup = BeautifulSoup(raw_html)
		prod_list_obj = soup.find("table", attrs={"class":re.compile('\\bproductListing\\b')})

		prod_in_category = prod_list_obj.find_all("td", attrs={"class":re.compile('\\bproductListing-data\\b'),"align":None})

		# Products found on this page only. self.prodList keeps growing across
		# calls, so iterating it would re-scrape every earlier product.
		page_products = {}
		for item in prod_in_category:
			try:
				name = item.a.contents[0]
				page_Url = item.a["href"]
			except (AttributeError, IndexError, KeyError):
				# Cell without a link, empty link, or link without href: skip it.
				logger.info('Blank element with class cat')
			else:
				self.prodList[name] = page_Url
				page_products[name] = page_Url

		for product_url in page_products.values():
			try:
				self.scrapDataFromPage(self.getPageSource(product_url))
			except CountLimitExceeded:
				# Stop signal for startBot; must not be logged and swallowed below.
				raise
			except Exception as e:
				logger.info('Issue in Data Page : %s,%s', product_url, str(type(e)))
	
	def scrapDataFromPage(self, raw_html):
		soup = BeautifulSoup(raw_html)
		name=''
		author=''
		price=''
		product_details=''
		book_details=''

		try:

			name_obj = soup.find("td", attrs={"class":re.compile('\\bconfparaboldblk2\\b')})
			name = name_obj.contents[0].encode('utf-8')
			try:
				author_obj = soup.find("td", attrs={"class":re.compile('\\bauther11\\b')})
				author = author_obj.a.span.contents[0].encode('utf-8')
			except AttributeError:
				logger.info('No Author field Available..')
			try:
				price_obj = soup.find("td", attrs={"class":re.compile('\\brsp11\\b')})
				price = price_obj.get_text(strip=True)
				price = price.encode('utf-8')
			except AttributeError:
				logger.info('No Price field Available..')

			other_obj = soup.find_all("td", attrs={"class":re.compile('\\bproseptrtop\\b')})

			for item in other_obj:
				heading_obj = item.find("span", attrs={"class":re.compile('\\bsectionheading\\b')})
				if heading_obj and ('Product Details' in heading_obj.contents[0]):
					product_details = item.parent.parent.get_text(strip=True)
					product_details = product_details.encode('utf-8')
				elif heading_obj and ('About the Book' in heading_obj.contents[0]):
					book_details = item.parent.parent.get_text(strip=True)
					book_details = book_details.encode('utf-8')

			row = (name, author, price, product_details, book_details)

			self.writer.writerow(row)
			self.ofile.close

			self.count += 1
		except UnicodeEncodeError:
			logger.info('Unicode encode error ...')


		if (self.count >= COUNT):
			self.count=0