
/pynav.py

https://bitbucket.org/sloft/pynav/
#!/usr/bin/python
#  -*- coding=utf-8 -*-
'''
Created on 15 Nov. 2009

@author: Sloft

License: GNU General Public License (GPL)
'''

from __future__ import with_statement  # for Python 2.5 compatibility
import os
import re
import time
import random
import socket
import urllib
import urllib2
import httplib
import urlparse
import cookielib
try:
    import cPickle as pickle
except ImportError:
    import pickle

class Pynav(object):
    """Programmatic web browser to fetch data and test web sites"""
    version = '0.6.5'
    verbose = False

    def __init__(self, timeout=None, proxy=None):
        """Constructor; most attributes can be customized after instantiation"""
        self.temps_min = 0
        self.temps_max = 0
        self.max_page_size = 500000
        self.max_history = 200
        self.verbose = False
        self._set_user_agents_list()
        # Add that as an __init__ argument and remove the 'user_agents_list'
        self.user_agent = self.user_agent_list['firefox']['windows']
        self._headers = {'User-Agent': self.user_agent}
        self._auto_referer = False
        self._cookie_jar = cookielib.CookieJar()
        self.proxy = proxy
        self._url_opener = urllib2.build_opener(*self._get_handlers())
        self.history = []
        self.current_page = -1
        self.page_document_type = None
        self.page_info = None
        self.real_url = None
        self.relative_url = None
        self.base_url = None
        self.response = None
        # Pass that to the download function
        self.download_path = os.getcwd()
        if timeout:
            socket.setdefaulttimeout(timeout)

    def _get_handlers(self):
        """Private method to get all handlers needed"""
        handlers = []
        handlers.append(urllib2.HTTPCookieProcessor(self._cookie_jar))
        if self.proxy:
            handlers.append(urllib2.ProxyHandler({'http': self.proxy}))
        return handlers

    def _set_user_agents_list(self):
        """Private method to set the user agents list"""
        self.user_agent_list = {}
        self.user_agent_list['firefox'] = \
        {'windows': 'Mozilla/5.0 (Windows; U; Windows NT 6; fr; rv:1.9.1.5) Gecko/Firefox/3.5.5'}
        self.user_agent_list['ie'] = {
            'windows': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Win64; x64; Trident/4.0)'}

    def set_http_auth(self, base_url, username, password):
        """Define parameters to set HTTP Basic Authentication"""
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, base_url, username, password)
        handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        self._url_opener.add_handler(handler)

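    # Usage sketch for HTTP Basic Auth (URL and credentials are hypothetical):
    #   nav = Pynav()
    #   nav.set_http_auth('http://example.com/', 'user', 's3cret')
    #   nav.go('http://example.com/protected/')
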
    def _set_referer(self, referer):
        """Property setter to define the referer, the previously visited page"""
        self._headers['Referer'] = referer

    def _get_referer(self):
        """Property getter to get the referer, the previously visited page"""
        return self._headers.get('Referer')

    referer = property(_get_referer, _set_referer)

    def _set_auto_referer(self, auto_referer):
        """Property setter to set the status of the auto_referer attribute"""
        self._auto_referer = auto_referer
        if not auto_referer and 'Referer' in self._headers:
            self._headers.pop('Referer')

    def _get_auto_referer(self):
        """Property getter to get the status of the auto_referer attribute"""
        return self._auto_referer

    autoReferer = property(_get_auto_referer, _set_auto_referer)

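    # Sketch: mimic a real browser by sending the previous page as Referer:
    #   nav.autoReferer = True
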
    def save_history(self, file_name):
        """Save history in a file"""
        with open(file_name, 'wb') as f:  # binary mode: pickle data is not plain text
            pickle.dump(self.history, f)

    def load_history(self, file_name):
        """Load history from a file"""
        try:
            with open(file_name, 'rb') as f:
                self.history = pickle.load(f)
        except IOError:
            print "ERROR: file", file_name, "doesn't exist"

    def _init_go(self):
        """Private method run before each request: random delay and auto-referer"""
        sleep_time = random.randint(self.temps_min, self.temps_max)
        if sleep_time > 0:
            if self.verbose:
                print 'waiting', sleep_time, 'secs'
            time.sleep(sleep_time)
        if self._auto_referer and len(self.history) > 0:
            self.referer = self.history[self.current_page]['url']

    def go(self, url, values=None):
        """Visit a web page; POST values can be passed as a dict"""
        self._init_go()
        if values is None:  # avoid a mutable default argument
            values = {}

        if not re.search('://', url):
            url = 'http://' + url

        if url.count('/') < 3:  # bare host: ensure a trailing slash
            url = url + '/'

        self.response = None

        if values:
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data, self._headers)
        else:
            req = urllib2.Request(url, headers=self._headers)

        handle = None
        try:
            handle = self._url_opener.open(req)
        except urllib2.HTTPError, exception:
            if exception.code == 404:
                print '(404) Page not found!'
            else:
                print 'HTTP request failed with error %d (%s)' % (
                    exception.code, exception.msg)
        except urllib2.URLError, exception:
            print 'Opening URL failed because:', exception.reason
        except httplib.BadStatusLine, exception:
            print exception.line  # usually empty
            print "BadStatusLine error: httplib could not parse the server's status line"

        if handle:
            # Maybe pack these attributes into the response object?
            self.response = handle.read(self.max_page_size)
            self.page_document_type = handle.info().getheader("Content-Type", "")
            self.page_info = handle.info()
            self.real_url = handle.geturl()

            if len(self.history) > self.max_history - 1:
                del self.history[0]
            self.current_page += 1
            self.history.append({'url': url, 'post': values, 'response': self.response})

            if self.current_page > len(self.history) - 1:
                self.current_page = len(self.history) - 1

            # URL of the current "directory" and of the site root
            self.relative_url = self.real_url.rsplit('/', 1)[0] + '/'
            parts = urlparse.urlsplit(self.real_url)
            self.base_url = parts.scheme + '://' + parts.netloc + '/'
            return self.response
        else:
            # I think this can be removed and simply return None for each
            # except above
            return None

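    # Usage sketch (example.com and the form fields are hypothetical):
    #   nav = Pynav()
    #   html = nav.go('http://example.com/')                    # plain GET
    #   html = nav.go('http://example.com/login',
    #                 {'user': 'me', 'password': 'secret'})     # POST
    #   if html is None:
    #       pass  # request failed, an error message was printed
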
    def replay(self, beginning=0, end=None, print_url=False, print_post=False, print_response=False):
        """Replay history, can be used after loading history from a file"""
        history, self.history = self.history, []
        if end is None:
            end = len(history)
        for page in history[beginning:end]:
            self.go(page['url'], page['post'])
            if print_url:
                print page['url']
            if print_post:
                print page['post']
            if print_response:
                print page['response']

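    # Sketch: record a session, save it, then replay it later (file name hypothetical):
    #   nav.save_history('session.pickle')
    #   nav2 = Pynav()
    #   nav2.load_history('session.pickle')
    #   nav2.replay(print_url=True)
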
    def search(self, reg):
        """Search a regex in the page; returns a match object, usable as a boolean"""
        return re.search(reg, self.response)

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def find(self, reg):
        """Return the result found by the regex: the result itself if unique, else a list"""
        res = re.findall(reg, self.response, re.S)
        if len(res) == 1:
            return res[0]
        else:
            return res

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def find_all(self, reg):
        """Return all results found by the regex"""
        return re.findall(reg, self.response, re.S)

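    # Sketch of the regex helpers (page and patterns hypothetical):
    #   nav.go('http://example.com/')
    #   title = nav.find('<title>(.*?)</title>')   # one result -> string
    #   cells = nav.find_all('<td>(.*?)</td>')     # always a list
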
    def download(self, url, destination=None):
        """Download the file at a URL to a destination file or directory"""
        if not destination:
            destination = self.download_path

        if os.path.isdir(destination):
            # Directory given: keep the file name from the URL
            destination = os.path.join(destination, url.split('/')[-1])
        else:
            # File name given: place it under the download path
            destination = os.path.join(self.download_path, destination)

        if self.verbose:
            print 'Downloading to:', destination
        return urllib.urlretrieve(url, destination)

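    # Usage sketch (URL hypothetical):
    #   nav.download('http://example.com/file.zip')            # into download_path
    #   nav.download('http://example.com/file.zip', '/tmp/')   # into a directory
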
    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def save_response(self, destination):
        """Save the page to a file"""
        with open(destination, 'w') as f:
            f.write(self.response)

    def get_cookies(self, web_page=None):
        """Return the cookies for the currently visited URL or, if web_page is
           given, the cookies that web_page holds in this browser instance"""
        if not web_page:
            if not self.base_url:
                return None
            netloc = urlparse.urlparse(self.base_url).netloc
        else:
            netloc = urlparse.urlparse(web_page).netloc
        # _cookies is a private CookieJar attribute: {domain: {path: {name: cookie}}}
        return self._cookie_jar._cookies.get(netloc, {}).get('/', {})

    def cookie_exists(self, name='PHPSESSID'):
        """Test if a cookie exists. Kept for Pynav 0.6 compatibility"""
        return name in [cookie.name for cookie in self._cookie_jar]

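    # Sketch (domain hypothetical):
    #   nav.go('http://example.com/')
    #   if nav.cookie_exists('PHPSESSID'):
    #       print nav.get_cookies()
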
    def add_path(self, url):
        """Resolve a possibly relative URL against the current page, internal use"""
        if re.search('://', url):
            return url  # already absolute
        else:
            if url == '':
                return self.base_url
            if url[0] == '/':
                return self.base_url[:-1] + url  # site-root-relative link
            else:
                return self.relative_url + url  # page-relative link

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def get_all_links(self, reg=None):
        """Return a list of all links found, a regex can be used"""
        links = re.findall('href="(.*?)"', self.response)
        if reg:
            def match(link): return len(re.findall(reg, link)) > 0
            return [self.add_path(link) for link in links if match(link)]
        else:
            return [self.add_path(link) for link in links]

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def get_all_images(self, reg=None):
        """Return a list of all images found, a regex can be used"""
        images = re.findall('img.*?src="(.*?)"', self.response)
        if reg:
            def match(image): return len(re.findall(reg, image)) > 0
            return [self.add_path(image) for image in images if match(image)]
        else:
            return [self.add_path(image) for image in images]

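    # Sketch (pattern hypothetical):
    #   pdfs = nav.get_all_links(r'\.pdf$')   # absolute URLs of links ending in .pdf
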
    # I think this doesn't need to be here, the user can handle that himself
    def set_page_delay(self, temps_min=0, temps_max=0):
        """Define a random delay between pages, in seconds, between min and max"""
        self.temps_min = temps_min
        self.temps_max = max(temps_min, temps_max)
        if self.verbose:
            print 'temps_min:', self.temps_min, ', temps_max:', self.temps_max

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def strip_tags(self, html):
        """Strip all tags of an HTML string and return only texts"""
        intag = [False]  # one-element list so the closure can mutate the flag (no 'nonlocal' in Python 2)
        def chk(c):
            if intag[0]:
                intag[0] = (c != '>')  # leave tag mode after '>'
                return False
            elif c == '<':
                intag[0] = True  # enter tag mode
                return False
            return True
        return ''.join(c for c in html if chk(c))
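
if __name__ == '__main__':
    # Minimal demonstration sketch; example.com is a hypothetical target and
    # the calls below only illustrate the API defined above.
    nav = Pynav(timeout=10)
    nav.set_page_delay(1, 3)  # be polite: wait 1-3 seconds between pages
    html = nav.go('http://example.com/')
    if html:
        print 'Fetched:', nav.real_url
        print 'Links found:', len(nav.get_all_links())
        print nav.strip_tags(html)[:200]  # first 200 chars of visible text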