/pynav.py

https://bitbucket.org/sloft/pynav/ · Python

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 15 Nov 2009
@author: Sloft
Licence: GNU General Public License (GPL)
'''
from __future__ import with_statement  # for Python 2.5 compatibility

import os
import re
import time
import random
import socket
import urllib
import urllib2
import httplib
import urlparse
import cookielib

try:
    import cPickle as pickle
except ImportError:
    import pickle


class Pynav(object):
    """Programmatic web browser to fetch data and test web sites"""
    version = '0.6.5'
    verbose = False

    def __init__(self, timeout=None, proxy=None):
        """Constructor, many attributes can be used"""
        self.temps_min = 0
        self.temps_max = 0
        self.max_page_size = 500000
        self.max_history = 200
        self.verbose = False
        self._set_user_agents_list()
        # Add that as an __init__ argument and remove the 'user_agents_list'
        self.user_agent = self.user_agent_list['firefox']['windows']
        self._headers = {'User-Agent': self.user_agent}
        self._auto_referer = False
        self._cookie_jar = cookielib.CookieJar()
        self.proxy = proxy
        self._url_opener = urllib2.build_opener(*self._get_handlers())
        self.history = []
        self.current_page = -1
        self.page_document_type = None
        self.page_info = None
        self.real_url = None
        self.relative_url = None
        self.base_url = None
        self.response = None
        # Pass that to the download function
        self.download_path = os.getcwd()
        if timeout:
            socket.setdefaulttimeout(timeout)

    def _get_handlers(self):
        """Private method to get all handlers needed"""
        handlers = []
        handlers.append(urllib2.HTTPCookieProcessor(self._cookie_jar))
        if self.proxy:
            handlers.append(urllib2.ProxyHandler({'http': self.proxy}))
        return handlers

    def _set_user_agents_list(self):
        """Private method to set the user agents list"""
        self.user_agent_list = {}
        self.user_agent_list['firefox'] = \
            {'windows': 'Mozilla/5.0 (Windows; U; Windows NT 6; fr; rv:1.9.1.5) Gecko/Firefox/3.5.5'}
        self.user_agent_list['ie'] = {
            'windows': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Win64; x64; Trident/4.0)'}

    def set_http_auth(self, base_url, username, password):
        """Define parameters to set HTTP Basic Authentication"""
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, base_url, username, password)
        handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        self._url_opener.add_handler(handler)
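
    # Example sketch: authenticate against a site protected by Basic Auth
    # (the URL and credentials are placeholders, not real values):
    #
    #   nav = Pynav()
    #   nav.set_http_auth('http://example.com/', 'user', 'secret')
    #   nav.go('http://example.com/private/')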

    def _set_referer(self, referer):
        """Property setter to define the referer, the previous visited page"""
        self._headers['Referer'] = referer

    def _get_referer(self):
        """Property getter to get the referer, the previous visited page"""
        return self._headers.get('Referer')

    referer = property(_get_referer, _set_referer)

    def _set_auto_referer(self, auto_referer):
        """Property setter to set the status of the auto_referer attribute"""
        self._auto_referer = auto_referer
        if not auto_referer and 'Referer' in self._headers:
            self._headers.pop('Referer')

    def _get_auto_referer(self):
        """Property getter to get the status of the auto_referer attribute"""
        return self._auto_referer

    autoReferer = property(_get_auto_referer, _set_auto_referer)

    def save_history(self, file_name):
        """Save history in a file"""
        with open(file_name, 'wb') as f:
            pickle.dump(self.history, f)

    def load_history(self, file_name):
        """Load history from a file"""
        try:
            with open(file_name, 'rb') as f:
                self.history = pickle.load(f)
        except IOError:
            print "ERROR: file", file_name, "doesn't exist"

    def _init_go(self):
        """Private method to initialize some attributes"""
        sleep_time = random.randint(self.temps_min, self.temps_max)
        if self.verbose and sleep_time > 0:
            print 'waiting', sleep_time, 'secs'
        if sleep_time:
            time.sleep(sleep_time)
        if self._auto_referer and len(self.history) > 0:
            self.referer = self.history[self.current_page]['url']

    def go(self, url, values=None):
        """Visit a web page, POST values can be used"""
        self._init_go()
        if not re.search('://', url):
            url = 'http://' + url
        if url.count('/') < 3:
            url = url + '/'
        if values:
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data, self._headers)
        else:
            req = urllib2.Request(url, headers=self._headers)
        self.response = None
        handle = None
        try:
            handle = self._url_opener.open(req)
        except urllib2.HTTPError, exception:
            if exception.code == 404:
                print '(404) Page not found !'
            else:
                print 'HTTP request failed with error %d (%s)' % (
                    exception.code, exception.msg)
        except urllib2.URLError, exception:
            print 'Opening URL failed because:', exception.reason
        except httplib.BadStatusLine, exception:
            print exception.line  # prints nothing...
            print "BadStatusLine Error! Httplib issue, can't get this page, sorry..."
        if handle:
            # Maybe pack these attributes into the response object?
            self.response = handle.read(self.max_page_size)
            self.page_document_type = handle.info().getheader("Content-Type", "")
            self.page_info = handle.info()
            self.real_url = handle.geturl()
            if len(self.history) > self.max_history - 1:
                del self.history[0]
            self.current_page = self.current_page + 1
            self.history.append({'url': url, 'post': values, 'response': self.response})
            if self.current_page > len(self.history) - 1:
                self.current_page = len(self.history) - 1
            # Everything up to and including the last '/' of the real URL
            self.relative_url = self.real_url.rsplit('/', 1)[0] + '/'
            # Use urlparse so https and non-default ports are handled too
            parsed = urlparse.urlparse(self.real_url)
            self.base_url = parsed.scheme + '://' + parsed.netloc + '/'
            return self.response
        else:
            # I think this can be removed and simply return None for each
            # except above
            return None  # Exception ?
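
    # Example sketch of a GET followed by a form POST (the URL and the field
    # names 'login'/'password' are placeholders for a real form):
    #
    #   nav = Pynav()
    #   nav.go('http://example.com/login')
    #   nav.go('http://example.com/login', {'login': 'me', 'password': 'secret'})
    #   if nav.search('Welcome'):
    #       print 'logged in'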

    def replay(self, beginning=0, end=None, print_url=False, print_post=False, print_response=False):
        """Replay history, can be used after loading history from a file"""
        history, self.history = self.history, []
        if not end:
            end = len(history)
        for page in history[beginning:end]:
            self.go(page['url'], page['post'])
            if print_url:
                print page['url']
            if print_post:
                print page['post']
            if print_response:
                print page['response']

    def search(self, reg):
        """Search a regex in the page, returns a match object usable as a boolean"""
        return re.search(reg, self.response)

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def find(self, reg):
        """Return the result found by the regex"""
        res = re.findall(reg, self.response, re.S)
        if len(res) == 1:
            return res[0]
        else:
            return res

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def find_all(self, reg):
        """Return all results found by the regex"""
        return re.findall(reg, self.response, re.S)
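
    # Example sketch: extract pieces of the current page with a regex, in the
    # same style as the rest of the library (patterns are illustrations):
    #
    #   nav.go('http://example.com/')
    #   title = nav.find('<title>(.*?)</title>')
    #   paragraphs = nav.find_all('<p>(.*?)</p>')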

    def download(self, url, destination=None):
        """Download the file at a url to a file or destination"""
        if not destination:
            destination = self.download_path
        if os.path.isdir(destination):
            # Keep the file name from the URL when given a directory
            destination = os.path.join(destination, url.split('/')[-1])
        else:
            # os.path.join adds the missing separator for relative names
            # and leaves absolute paths untouched
            destination = os.path.join(self.download_path, destination)
        if self.verbose:
            print 'Downloading to:', destination
        return urllib.urlretrieve(url, destination)
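
    # Example sketch ('logo.png' and '/tmp/' are placeholders):
    #
    #   nav.download('http://example.com/logo.png')           # to download_path
    #   nav.download('http://example.com/logo.png', '/tmp/')  # to a directory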

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def save_response(self, destination):
        """Save the page to a file"""
        with open(destination, 'w') as f:
            f.write(self.response)

    def get_cookies(self, web_page=None):
        """This always returns the cookies the current visited URL holds,
        if web_page is specified this will return the cookies this web_page has
        within our browser instance."""
        if not web_page:
            if not self.base_url:
                return None
            else:
                netloc = urlparse.urlparse(self.base_url).netloc
        else:
            netloc = urlparse.urlparse(web_page).netloc
        # Reaches into cookielib's private store; raises KeyError if no
        # cookie was ever set for this host and path '/'
        return self._cookie_jar._cookies[netloc]['/']

    def cookie_exists(self, name='PHPSESSID'):
        """Test if a cookie exists. Kept for Pynav 0.6 compatibility"""
        return name in [cookie.name for cookie in self._cookie_jar]
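
    # Example sketch: inspect session cookies after visiting a page
    # ('example.com' is a placeholder):
    #
    #   nav.go('http://example.com/')
    #   if nav.cookie_exists('PHPSESSID'):
    #       print nav.get_cookies()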

    def add_path(self, url):
        """Correct a URL depending on the link, internal use"""
        if re.search('://', url):
            return url
        else:
            if url == '':
                return self.base_url
            if url[0] == '/':
                return self.base_url[:-1] + url
            else:
                return self.relative_url + url

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def get_all_links(self, reg=None):
        """Return a list of all links found, a regex can be used"""
        links = re.findall('href="(.*?)"', self.response)
        if reg:
            def match(link): return len(re.findall(reg, link)) > 0
            return [self.add_path(link) for link in links if match(link)]
        else:
            return [self.add_path(link) for link in links]

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def get_all_images(self, reg=None):
        """Return a list of all images found, a regex can be used"""
        images = re.findall('img.*?src="(.*?)"', self.response)
        if reg:
            def match(image): return len(re.findall(reg, image)) > 0
            return [self.add_path(image) for image in images if match(image)]
        else:
            return [self.add_path(image) for image in images]
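
    # Example sketch: list absolute URLs of all PNG images on the current
    # page (the regex is an illustration):
    #
    #   nav.go('http://example.com/')
    #   for src in nav.get_all_images(r'\.png$'):
    #       print src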

    # I think this doesn't need to be here, the user can handle that himself
    def set_page_delay(self, temps_min=0, temps_max=0):
        """Define the time to wait between pages, a random number of seconds
        between min and max"""
        self.temps_min = temps_min
        if temps_min > temps_max:
            self.temps_max = temps_min
        else:
            self.temps_max = temps_max
        if self.verbose:
            print 'temps_min:', self.temps_min, ', temps_max:', self.temps_max
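
    # Example sketch: wait between 2 and 5 seconds before each request; the
    # delay is applied by _init_go() on every go():
    #
    #   nav.set_page_delay(2, 5)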

    # I think this doesn't need to be here, the user can handle that himself
    # by using the response
    def strip_tags(self, html):
        """Strip all tags of an HTML string and return only texts"""
        intag = [False]

        def chk(c):
            if intag[0]:
                intag[0] = (c != '>')
                return False
            elif c == '<':
                intag[0] = True
                return False
            return True
        return ''.join(c for c in html if chk(c))
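

# Minimal end-to-end sketch, runnable under Python 2 (the URL is a
# placeholder and network access is assumed):
if __name__ == '__main__':
    nav = Pynav(timeout=10)
    nav.verbose = True
    html = nav.go('http://example.com/')
    if html:
        print nav.strip_tags(html)[:200]  # first 200 characters of text
        print nav.get_all_links()         # absolute URLs of all links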