/src/pentest/grabber/spider.py
#!/usr/bin/env python
"""
Spider Module for Grabber v0.1
Copyright (C) 2006 - Romain Gaucher - http://rgaucher.info
"""
import os
import os.path
import re
import string
import sys
import time
import urllib
import urllib2
import cookielib
from BeautifulSoup import BeautifulSoup, SoupStrainer
from urllib2 import URLError, HTTPError

COOKIEFILE = 'cookies.lwp'    # path and filename used to save the cookies

urlopen = urllib2.urlopen
Request = urllib2.Request
cj = cookielib.LWPCookieJar() # subclass of FileCookieJar with useful load() and save() methods

txdata = None
refererUrl = "http://google.com/?q=you!"
txheaders = {'User-agent': 'Grabber/0.1 (X11; U; Linux i686; en-US; rv:1.7)',
             'Referer': refererUrl}

allowed = ['php','html','htm','xml','xhtml','xht','xhtm',
           'asp','aspx','msp','mspx','php3','php4','php5','txt','shtm',
           'shtml','phtm','phtml','jhtml','pl','jsp','cfm','cfml','do','py',
           'js','css']
database = {}
database_url = []
database_css = []
database_js = []
database_ext = []  # database of insecure external links
local_url = []
dumb_params = []   # if a URL carries no parameters, associate this list of "whatever looks like a parameter"
root = "http://localhost"


outSpiderFile = None

"""
database = {
    u"URL" : {'GET' : {'param1': value}, 'POST' : {'param2': value}},
    ...
}
"""
_urlEncode = {}
for i in range(256):
    _urlEncode[chr(i)] = '%%%02x' % i
for c in string.letters + string.digits + '_,.-/':
    _urlEncode[c] = c
_urlEncode[' '] = '+'


def urlEncode(s):
    """
    Return the encoded version of the given string, safe for use in a URL.
    """
    return string.join(map(lambda c: _urlEncode[c], list(s)), '')


def urlDecode(s):
    """
    Return the decoded version of the given string. Note that invalid URLs
    will raise exceptions, e.g. a URL whose % escaping is incorrect.
    """
    mychr = chr
    atoi = string.atoi
    parts = string.split(string.replace(s, '+', ' '), '%')
    for i in range(1, len(parts)):
        part = parts[i]
        parts[i] = mychr(atoi(part[:2], 16)) + part[2:]
    return string.join(parts, '')
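
# A minimal round-trip sketch (illustrative values only, not part of the module):
#   urlEncode('a b/c.txt')  -> 'a+b/c.txt'
#   urlDecode('a+b%2fc')    -> 'a b/c'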


def htmlencode(s):
    """
    Escape the HTML special characters.
    """
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace("\"", "&quot;")
    s = s.replace("'", "&#39;")
    return s


def htmldecode(s):
    """
    Unescape the HTML special characters.
    """
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&quot;", "\"")
    s = s.replace("&#39;", "'")
    s = s.replace("&amp;", "&")
    return s
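
# Illustrative round trip (assumes only the entity set handled above):
#   htmlencode('<a href="x">') -> '&lt;a href=&quot;x&quot;&gt;'
#   htmldecode('&lt;b&gt;')    -> '<b>'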


def getContentDirectURL_GET(url, query):
    """
    Get the content of the URL with a GET request;
    `query` is an already url-encoded query string.
    """
    ret = ""
    try:
        if len(query) > 0:
            url = url + "?" + query
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = Request(url, None, txheaders)  # create a request object
        ret = urlopen(req)                   # and open it to return a handle on the url
    except HTTPError:
        return
    except URLError:
        return
    except IOError:
        return
    return ret
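
# Hedged usage sketch (the host and parameter here are made up):
#   handle = getContentDirectURL_GET('http://localhost/index.php', 'id=1')
#   if handle:
#       html = handle.read()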


def scan(currentURL):
    """
    The Scanner is the first part of Grabber.
    It retrieves all the information from the HTML page.
    TODO:
        read every href='' element for CSS and src='' for JavaScript / images
    """
    try:
        archives_hDl = getContentDirectURL_GET(currentURL, '')
    except IOError:
        print "IOError @ %s" % currentURL
        return
    try:
        htmlContent = archives_hDl.read()
    except IOError, e:
        print "Cannot open the file,", e.strerror
        return
    except AttributeError:
        print "Grabber cannot retrieve the given url: %s" % currentURL
        return
    parseHtmlLinks(currentURL, htmlContent)
    parseHtmlParams(currentURL, htmlContent)


def allowedExtensions(url):
    """
    True if the URL contains one of the allowed file extensions.
    """
    for e in allowed:
        if '.' + e in url:
            return True
    return False


def makeRoot(urlLocal):
    """
    Strip the file name from the URL to get the root directory.
    """
    if allowedExtensions(urlLocal):
        return urlLocal[0:urlLocal.rfind('/')+1]
    return urlLocal


def giveGoodURL(href, urlLocal):
    """
    Return an absolute, HTML-decoded URL for the link;
    href is the value retrieved from the href attribute.
    """
    if 'javascript' in href:
        return htmldecode(urlLocal)
    if 'http://' in href or 'https://' in href:
        if urlLocal in href:
            return htmldecode(href)
        else:
            return urlLocal
    if len(href) < 1:
        return htmldecode(urlLocal)
    if href[0] == '?' and '?' not in urlLocal:
        # href is only a query string: append it to the local URL
        if allowedExtensions(urlLocal):
            return htmldecode(urlLocal + href)
        return htmldecode(urlLocal + '/' + href)
    else:
        # simple relative name
        if allowedExtensions(urlLocal) or '?' in urlLocal:
            return htmldecode(urlLocal[0:urlLocal.rfind('/')+1] + href)
        else:
            return htmldecode(urlLocal + '/' + href)
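
# How the resolver behaves (illustrative values):
#   giveGoodURL('page.php', 'http://localhost/dir/index.php')
#       -> 'http://localhost/dir/page.php'
#   giveGoodURL('?id=2', 'http://localhost/dir/index.php')
#       -> 'http://localhost/dir/index.php?id=2'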


def dl(fileAddress, destFile):
    """
    Download the file to destFile.
    """
    try:
        f = urllib.urlopen(fileAddress)
        g = f.read()
        f.close()
        out = open(os.path.join('./', destFile), "wb")
    except IOError:
        return False
    out.write(g)
    out.close()
    return True


def removeSESSID(urlssid):
    """
    Remove the PHPSESSID information from the URL; we don't care about it here.
    """
    k = urlssid.find('PHPSESSID')
    if k > 0:
        return urlssid[0:k-1]
    k = urlssid.find('sid')
    if k > 0:
        return urlssid[0:k-1]
    return urlssid
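
# Example (illustrative URL):
#   removeSESSID('http://localhost/a.php?PHPSESSID=deadbeef')
#       -> 'http://localhost/a.php'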


def parseHtmlLinks(currentURL, htmlContent):
    """
    Parse the HTML/XHTML code to get JS, CSS, links, etc.
    """
    global database_url, database_js, database_css
    links = SoupStrainer('a')
    # listAnchors = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=links)]
    listAnchors = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=links):
        try:
            tagStr = str(tag).lower()
            if tagStr.count("href") > 0:
                listAnchors.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue

    for a in listAnchors:
        goodA = giveGoodURL(a, currentURL)
        goodA = removeSESSID(goodA)
        if (root in goodA) and (goodA not in database_url):
            database_url.append(goodA)

    # parse the CSS and the JavaScript
    script = SoupStrainer('script')
    # listScripts = [tag['src'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=script)]
    listScripts = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=script):
        try:
            tagStr = str(tag).lower()
            if tagStr.count("src") > 0 and tagStr.count(".src") < 1:
                listScripts.append(tag['src'])
        except TypeError:
            continue
        except KeyError:
            continue

    for a in listScripts:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_js:
            database_js.append(sc)
        if sc == currentURL:
            # giveGoodURL collapsed an external script to the current URL:
            # record the original external reference
            database_ext.append(a)
    parseJavaScriptCalls()

    link = SoupStrainer('link')
    # listLinks = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=link)]
    listLinks = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=link):
        try:
            tagStr = str(tag).lower()
            if tagStr.count("href") > 0:
                listLinks.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue

    for a in listLinks:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_css:
            database_css.append(sc)
    return True

jsChars = ["'", '"']

def rfindFirstJSChars(s):
    """
    Position of the last JS string delimiter in s (-1 if none).
    """
    return max([s.rfind(k) for k in jsChars])

regDumbParam = re.compile(r'(\w+)')
regDumbParamNumber = re.compile(r'(\d+)')

jsParams = ["'", '"', '=', '+', '%', '\\', ')', '(', '^', '*', '-']

def cleanListDumbParams(listDumb):
    """
    Keep only the word-like, non-numeric candidates.
    """
    newDumbList = []
    for w in listDumb:
        w = w.replace(' ', '')
        w = w.replace('\n', '')
        # l = [c for c in jsParams if c in w]  # no jsParams
        if len(w) > 0 and regDumbParam.match(w) and not regDumbParamNumber.match(w):
            newDumbList.append(w)
    return newDumbList

def unique(L):
    """
    Remove duplicates while preserving order.
    """
    noDupli = []
    for i in L:
        if i not in noDupli:
            noDupli.append(i)
    return noDupli

def flatten(L):
    """
    Flatten an arbitrarily nested list.
    """
    if type(L) != type([]):
        return [L]
    if L == []:
        return L
    return reduce(lambda L1, L2: L1 + L2, map(flatten, L))
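
# Quick illustration of the two helpers:
#   flatten([['a', ['b']], 'c'])   -> ['a', 'b', 'c']
#   unique(['a', 'b', 'a'])        -> ['a', 'b']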


def parseJavaScriptContent(jsContent):
    """
    Parse the content of a JavaScript file, harvesting links and
    parameter-looking tokens.
    """
    global database_url, database_ext, dumb_params
    for l in jsContent.readlines():
        for e in allowed:
            if l.count('.' + e) > 0:
                # we found a call
                if l.count('http://') > 0 and l.count(root) < 1:
                    # external link
                    et = '.' + e
                    b1 = l.find('http://')
                    b2 = l.find(et) + len(et)
                    database_ext.append(l[b1:b2])
                else:
                    # internal link
                    et = '.' + e
                    b2 = l.find(et) + len(et)
                    b1 = rfindFirstJSChars(l[:b2]) + 1
                    database_url.append(giveGoodURL(l[b1:b2], root))
        # try to get a parameter
        k = l.find('?')
        if k > 0:
            results = l[k:].split('?')
            plop = []
            for a in results:
                plop.append(cleanListDumbParams(regDumbParam.split(a)))
            dumb_params.append(flatten(plop))
        k = l.find('&')
        if k > 0:
            results = l[k:].split('&')
            plop = []
            for a in results:
                plop.append(cleanListDumbParams(regDumbParam.split(a)))
            dumb_params.append(flatten(plop))
    dumb_params = unique(flatten(dumb_params))

def parseJavaScriptCalls():
    """
    Download the JavaScript files and parse them.
    """
    global database_js
    for j in database_js:
        jsName = j[j.rfind('/')+1:]
        if not os.path.exists('local/js/' + jsName):
            # first download the file
            dl(j, 'local/js/' + jsName)
        try:
            jsContent = open('local/js/' + jsName, 'r')
        except IOError:
            continue
        parseJavaScriptContent(jsContent)
        jsContent.close()

def splitQuery(query_string):
    """
    Split a query string like num=plop&truc=kikoo&o=42 into a dictionary.
    """
    try:
        d = dict([x.split('=') for x in query_string.split('&')])
    except ValueError:
        d = {}
    return d
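
# Example:
#   splitQuery('num=plop&truc=kikoo&o=42')
#       -> {'num': 'plop', 'truc': 'kikoo', 'o': '42'}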

def dict_add(d1, d2):
    """
    Merge two dictionaries (d2 wins on conflicts).
    """
    d = {}
    if len(d1):
        for s in d1.keys():
            d[s] = d1[s]
    if len(d2):
        for s in d2.keys():
            d[s] = d2[s]
    return d

def dict_add_list(d1, l1):
    """
    Merge a dictionary with a list of keys, each given the
    placeholder value 'bar'.
    """
    d = {}
    if len(d1):
        for s in d1.keys():
            d[s] = d1[s]
    if len(l1):
        for s in l1:
            d[s] = 'bar'
    return d
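
# Example:
#   dict_add({'a': 1}, {'b': 2})      -> {'a': 1, 'b': 2}
#   dict_add_list({'a': 1}, ['q'])    -> {'a': 1, 'q': 'bar'}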

def parseHtmlParams(currentURL, htmlContent):
    """
    Parse the HTML to get the GET/POST arguments.
    """
    global database, database_css, database_js
    for url in database_url:
        k = url.find('?')
        if k > 0:
            keyUrl = url[0:k]
            query = url[k+1:]
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lG = dict_add(lG, splitQuery(query))
            database[keyUrl]['GET'] = lG
        elif len(dumb_params) > 0:
            keyUrl = url
            # no params in the URL... let's assign the dumb_params
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lP = database[keyUrl]['POST']
            lG = dict_add_list(lG, dumb_params)
            lP = dict_add_list(lP, dumb_params)
            database[keyUrl]['GET'] = lG
            database[keyUrl]['POST'] = lP

    # then, parse the forms
    forms = SoupStrainer('form')
    inputs = SoupStrainer('input')
    listForm = [tag for tag in BeautifulSoup(htmlContent, parseOnlyThese=forms)]
    for f in listForm:
        method = (f.get('method') or f.get('METHOD') or 'GET').upper()
        if method not in ('GET', 'POST'):
            # guard against exotic methods
            method = 'GET'
        action = f.get('action') or f.get('ACTION') or currentURL
        keyUrl = giveGoodURL(action, currentURL)
        listInput = [tag for tag in BeautifulSoup(str(f), parseOnlyThese=inputs)]
        for i in listInput:
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            try:
                value = i['value']
            except KeyError:
                value = '42'
            try:
                name = i['name']
            except KeyError:
                # an input without a name cannot be submitted: skip it
                continue
            lGP = database[keyUrl][method]
            lGP = dict_add(lGP, {name: value})
            database[keyUrl][method] = lGP
    return True


def runSpiderScan(entryUrl, depth=0):
    """
    Scan entryUrl and recursively spider every discovered URL, up to depth.
    """
    global outSpiderFile
    print "runSpiderScan @", entryUrl, " | #", depth
    if outSpiderFile:
        outSpiderFile.write("\t\t<entryURL>%s</entryURL>\n" % entryUrl)
    scan(entryUrl)
    if depth > 0 and len(database_url) > 0:
        for a in database_url:
            runSpiderScan(a, depth - 1)
        return False
    return True


def spider(entryUrl, depth=0):
    """
    Retrieve every link, then write the spider results to disk.
    """
    global root, outSpiderFile
    if depth > 0:
        root = makeRoot(entryUrl)
    else:
        root = entryUrl

    # test if the spider has already been run on this website
    try:
        f = open("local/spiderSite.xml", 'r')
        firstLine = f.readline()
        f.close()
        alreadyScanned = firstLine.count(root) > 0
    except IOError:
        alreadyScanned = False

    print "Start scanning...", root
    if depth == 0:
        scan(root)
    else:
        if not alreadyScanned:
            outSpiderFile = open("local/spiderSite.xml", "w")
            outSpiderFile.write("<spider root='%s' depth='%d'>\n" % (root, depth))
            runSpiderScan(root, depth)
            if len(dumb_params) > 0:
                outSpiderFile.write("<dumb_parameters>\n")
                for d in dumb_params:
                    outSpiderFile.write("\t<dumb>%s</dumb>\n" % d)
                outSpiderFile.write("</dumb_parameters>\n")
            outSpiderFile.write("\n</spider>")
            outSpiderFile.close()
        else:
            print "Loading the previous spider results from 'local/spiderSite.xml'"
            # load the XML file
            regUrl = re.compile(r'(.*)<entryURL>(.*)</entryURL>(.*)', re.I)
            regDmb = re.compile(r'(.*)<dumb>(.*)</dumb>(.*)', re.I)

            f = open("local/spiderSite.xml", 'r')
            for l in f.readlines():
                if regUrl.match(l):
                    out = regUrl.search(l)
                    database_url.append(out.group(2))
                if regDmb.match(l):
                    out = regDmb.search(l)
                    dumb_params.append(out.group(2))
            f.close()

            # scan every url
            for currentURL in database_url:
                try:
                    archives_hDl = getContentDirectURL_GET(currentURL, '')
                except IOError:
                    print "IOError @ %s" % currentURL
                    continue
                try:
                    htmlContent = archives_hDl.read()
                except IOError:
                    continue
                except AttributeError:
                    continue
                parseHtmlParams(currentURL, htmlContent)

    outSpiderFile = open("results/touchFiles.xml", "w")
    outSpiderFile.write("<spider root='%s'>\n" % root)
    for i in database_url:
        outSpiderFile.write("\t<url type='anchor'>%s</url>\n" % i)
    for i in database_js:
        outSpiderFile.write("\t<url type='JavaScript'>%s</url>\n" % i)
    for i in database_css:
        outSpiderFile.write("\t<url type='MetaLink'>%s</url>\n" % i)
    outSpiderFile.write("</spider>")
    outSpiderFile.close()

    if len(database_ext) > 0:
        # alert on external calls
        outSpiderFile = open("results/externalCalls.xml", "w")
        outSpiderFile.write("<external>\n")
        for i in database_ext:
            outSpiderFile.write("\t<call severity='high'>%s</call>\n" % i)
        outSpiderFile.write("</external>")
        outSpiderFile.close()