/pynav.py

https://bitbucket.org/sloft/pynav/ · Python

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 15 Nov 2009
@author: Sloft
Licence: GNU General Public License (GPL)
'''
from __future__ import with_statement  # for Python 2.5 compatibility

import os
import re
import time
import random
import socket
import urllib
import urllib2
import httplib
import urlparse
import cookielib

try:
    import cPickle as pickle
except ImportError:
    import pickle


class Pynav(object):
    """Programmatic web browser to fetch data and test web sites"""
    version = '0.6.5'
    verbose = False

    def __init__(self, timeout=None, proxy=None):
        """Constructor, many attributes can be used"""
        self.temps_min = 0
        self.temps_max = 0
        self.max_page_size = 500000
        self.max_history = 200
        self.verbose = False
        self._set_user_agents_list()
        # Add that as an __init__ argument and remove the 'user_agents_list'
        self.user_agent = self.user_agent_list['firefox']['windows']
        self._headers = {'User-Agent': self.user_agent}
        self._auto_referer = False
        self._cookie_jar = cookielib.CookieJar()
        self.proxy = proxy
        self._url_opener = urllib2.build_opener(*self._get_handlers())
        self.history = []
        self.current_page = -1
        self.page_document_type = None
        self.page_info = None
        self.real_url = None
        self.relative_url = None
        self.base_url = None
        self.response = None
        # Pass that to the download function
        self.download_path = os.getcwd()
        if timeout:
            socket.setdefaulttimeout(timeout)

    def _get_handlers(self):
        """Private method to get all handlers needed"""
        handlers = []
        handlers.append(urllib2.HTTPCookieProcessor(self._cookie_jar))
        if self.proxy:
            handlers.append(urllib2.ProxyHandler({'http': self.proxy}))
        return handlers

    def _set_user_agents_list(self):
        """Private method to set the user agents list"""
        self.user_agent_list = {}
        self.user_agent_list['firefox'] = \
            {'windows': 'Mozilla/5.0 (Windows; U; Windows NT 6; fr; rv:1.9.1.5) Gecko/Firefox/3.5.5'}
        self.user_agent_list['ie'] = {
            'windows': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Win64; x64; Trident/4.0)'}

    def set_http_auth(self, base_url, username, password):
        """Define parameters to set HTTP Basic Authentication"""
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, base_url, username, password)
        handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        self._url_opener.add_handler(handler)
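
    # Example sketch: authenticate against a site protected by Basic Auth
    # (the URL and credentials are placeholders, not real values):
    #
    #   nav = Pynav()
    #   nav.set_http_auth('http://example.com/', 'user', 'secret')
    #   nav.go('http://example.com/private/')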

    def _set_referer(self, referer):
        """Property setter to define the referer, the previous visited page"""
        self._headers['Referer'] = referer

    def _get_referer(self):
        """Property getter to get the referer, the previous visited page"""
        return self._headers.get('Referer')

    referer = property(_get_referer, _set_referer)

    def _set_auto_referer(self, auto_referer):
        """Property setter to set the status of the auto_referer attribute"""
        self._auto_referer = auto_referer
        if not auto_referer and 'Referer' in self._headers:
            self._headers.pop('Referer')

    def _get_auto_referer(self):
        """Property getter to get the status of the auto_referer attribute"""
        return self._auto_referer

    autoReferer = property(_get_auto_referer, _set_auto_referer)

    def save_history(self, file_name):
        """Save history in a file"""
        with open(file_name, 'wb') as f:
            pickle.dump(self.history, f)

    def load_history(self, file_name):
        """Load history from a file"""
        try:
            with open(file_name, 'rb') as f:
                self.history = pickle.load(f)
        except IOError:
            print "ERROR: file", file_name, "doesn't exist"

    def _init_go(self):
        """Private method to initialize some attributes"""
        sleep_time = random.randint(self.temps_min, self.temps_max)
        if self.verbose and sleep_time > 0:
            print 'waiting', sleep_time, 'secs'
        if sleep_time:
            time.sleep(sleep_time)
        if self._auto_referer and len(self.history) > 0:
            self.referer = self.history[self.current_page]['url']

    def go(self, url, values=None):
        """Visit a web page, POST values can be used"""
        self._init_go()
        if not re.search('://', url):
            url = 'http://' + url
        if url.count('/') < 3:
            url = url + '/'
        if values:
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data, self._headers)
        else:
            req = urllib2.Request(url, headers=self._headers)
        self.response = None
        handle = None
        try:
            handle = self._url_opener.open(req)
        except urllib2.HTTPError, exception:
            if exception.code == 404:
                print '(404) Page not found !'
            else:
                print 'HTTP request failed with error %d (%s)' % (
                    exception.code, exception.msg)
        except urllib2.URLError, exception:
            print 'Opening URL failed because:', exception.reason
        except httplib.BadStatusLine, exception:
            print exception.line  # prints nothing...
            print "BadStatusLine Error! Httplib issue, can't get this page, sorry..."
        if handle:
            # Maybe pack these attributes into the response object?
            self.response = handle.read(self.max_page_size)
            self.page_document_type = handle.info().getheader("Content-Type", "")
            self.page_info = handle.info()
            self.real_url = handle.geturl()
            if len(self.history) > self.max_history - 1:
                del self.history[0]
            self.current_page = self.current_page + 1
            self.history.append({'url': url, 'post': values, 'response': self.response})
            if self.current_page > len(self.history) - 1:
                self.current_page = len(self.history) - 1
            # Everything up to and including the last '/' of the real URL
            self.relative_url = self.real_url.rsplit('/', 1)[0] + '/'
            # Use urlparse so https and non-default ports are handled too
            parsed = urlparse.urlparse(self.real_url)
            self.base_url = parsed.scheme + '://' + parsed.netloc + '/'
            return self.response
        else:
            # I think this can be removed and simply return None for each
            # except above
            return None  # Exception ?
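
    # Example sketch of a GET followed by a form POST (the URL and the field
    # names 'login'/'password' are placeholders for a real form):
    #
    #   nav = Pynav()
    #   nav.go('http://example.com/login')
    #   nav.go('http://example.com/login', {'login': 'me', 'password': 'secret'})
    #   if nav.search('Welcome'):
    #       print 'logged in'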

    def replay(self, beginning=0, end=None, print_url=False, print_post=False, print_response=False):
        """Replay history, can be used after loading history from a file"""
        history, self.history = self.history, []
        if not end:
            end = len(history)
        for page in history[beginning:end]:
            self.go(page['url'], page['post'])
            if print_url:
                print page['url']
            if print_post:
                print page['post']
            if print_response:
                print page['response']

    def search(self, reg):
        """Search a regex in the page, returns a match object usable as a boolean"""
        return re.search(reg, self.response)

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def find(self, reg):
        """Return the result found by the regex"""
        res = re.findall(reg, self.response, re.S)
        if len(res) == 1:
            return res[0]
        else:
            return res

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def find_all(self, reg):
        """Return all results found by the regex"""
        return re.findall(reg, self.response, re.S)
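
    # Example sketch: extract pieces of the current page with a regex, in the
    # same style as the rest of the library (patterns are illustrations):
    #
    #   nav.go('http://example.com/')
    #   title = nav.find('<title>(.*?)</title>')
    #   paragraphs = nav.find_all('<p>(.*?)</p>')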

    def download(self, url, destination=None):
        """Download the file at a url to a file or destination"""
        if not destination:
            destination = self.download_path
        if os.path.isdir(destination):
            # Keep the file name from the URL when given a directory
            destination = os.path.join(destination, url.split('/')[-1])
        else:
            # os.path.join adds the missing separator for relative names
            # and leaves absolute paths untouched
            destination = os.path.join(self.download_path, destination)
        if self.verbose:
            print 'Downloading to:', destination
        return urllib.urlretrieve(url, destination)
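
    # Example sketch ('logo.png' and '/tmp/' are placeholders):
    #
    #   nav.download('http://example.com/logo.png')           # to download_path
    #   nav.download('http://example.com/logo.png', '/tmp/')  # to a directory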

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def save_response(self, destination):
        """Save the page to a file"""
        with open(destination, 'w') as f:
            f.write(self.response)

    def get_cookies(self, web_page=None):
        """This always returns the cookies the current visited URL holds,
        if web_page is specified this will return the cookies this web_page has
        within our browser instance."""
        if not web_page:
            if not self.base_url:
                return None
            else:
                netloc = urlparse.urlparse(self.base_url).netloc
        else:
            netloc = urlparse.urlparse(web_page).netloc
        # Reaches into cookielib's private store; raises KeyError if no
        # cookie was ever set for this host and path '/'
        return self._cookie_jar._cookies[netloc]['/']

    def cookie_exists(self, name='PHPSESSID'):
        """Test if a cookie exists. Kept for Pynav 0.6 compatibility"""
        return name in [cookie.name for cookie in self._cookie_jar]
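
    # Example sketch: inspect session cookies after visiting a page
    # ('example.com' is a placeholder):
    #
    #   nav.go('http://example.com/')
    #   if nav.cookie_exists('PHPSESSID'):
    #       print nav.get_cookies()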

    def add_path(self, url):
        """Correct a URL depending on the link, internal use"""
        if re.search('://', url):
            return url
        else:
            if url == '':
                return self.base_url
            if url[0] == '/':
                return self.base_url[:-1] + url
            else:
                return self.relative_url + url

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def get_all_links(self, reg=None):
        """Return a list of all links found, a regex can be used"""
        links = re.findall('href="(.*?)"', self.response)
        if reg:
            def match(link): return len(re.findall(reg, link)) > 0
            return [self.add_path(link) for link in links if match(link)]
        else:
            return [self.add_path(link) for link in links]

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def get_all_images(self, reg=None):
        """Return a list of all images found, a regex can be used"""
        images = re.findall('img.*?src="(.*?)"', self.response)
        if reg:
            def match(image): return len(re.findall(reg, image)) > 0
            return [self.add_path(image) for image in images if match(image)]
        else:
            return [self.add_path(image) for image in images]
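
    # Example sketch: list absolute URLs of all PNG images on the current
    # page (the regex is an illustration):
    #
    #   nav.go('http://example.com/')
    #   for src in nav.get_all_images(r'\.png$'):
    #       print src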

    # I think this doesn't need to be here, the user can handle that himself
    def set_page_delay(self, temps_min=0, temps_max=0):
        """Define the time to wait between pages, a random number of seconds
        between min and max"""
        self.temps_min = temps_min
        if temps_min > temps_max:
            self.temps_max = temps_min
        else:
            self.temps_max = temps_max
        if self.verbose:
            print 'temps_min:', self.temps_min, ', temps_max:', self.temps_max
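
    # Example sketch: wait between 2 and 5 seconds before each request; the
    # delay is applied by _init_go() on every go():
    #
    #   nav.set_page_delay(2, 5)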

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def strip_tags(self, html):
        """Strip all tags of an HTML string and return only texts"""
        intag = [False]

        def chk(c):
            if intag[0]:
                intag[0] = (c != '>')
                return False
            elif c == '<':
                intag[0] = True
                return False
            return True
        return ''.join(c for c in html if chk(c))
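

# Minimal end-to-end sketch, runnable under Python 2 (the URL is a
# placeholder and network access is assumed):
if __name__ == '__main__':
    nav = Pynav(timeout=10)
    nav.verbose = True
    html = nav.go('http://example.com/')
    if html:
        print nav.strip_tags(html)[:200]  # first 200 characters of text
        print nav.get_all_links()         # absolute URLs of all links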