/src/pentest/grabber/spider.py
#!/usr/bin/env python
"""
Spider Module for Grabber v0.1
Copyright (C) 2006 - Romain Gaucher - http://rgaucher.info
"""
import os
import os.path
import re
import string
import sys
import time
import urllib
import urllib2
import cookielib
from BeautifulSoup import BeautifulSoup, SoupStrainer
from urllib2 import URLError, HTTPError

COOKIEFILE = 'cookies.lwp'    # path and filename used to save the cookies

urlopen = urllib2.urlopen
Request = urllib2.Request
cj = cookielib.LWPCookieJar() # subclass of FileCookieJar with useful load() and save() methods

txdata = None
refererUrl = "http://google.com/?q=you!"
txheaders = {'User-agent': 'Grabber/0.1 (X11; U; Linux i686; en-US; rv:1.7)',
             'Referer': refererUrl}

allowed = ['php','html','htm','xml','xhtml','xht','xhtm',
           'asp','aspx','msp','mspx','php3','php4','php5','txt','shtm',
           'shtml','phtm','phtml','jhtml','pl','jsp','cfm','cfml','do','py',
           'js','css']
database = {}
database_url = []
database_css = []
database_js = []
database_ext = []  # database of insecure external links
local_url = []
dumb_params = []   # if a URL carries no parameters, associate this list of "whatever looks like a parameter"
root = "http://localhost"


outSpiderFile = None

"""
database = {
    u"URL" : {'GET' : {'param1': value}, 'POST' : {'param2': value}},
    ...
}
"""
_urlEncode = {}
for i in range(256):
    _urlEncode[chr(i)] = '%%%02x' % i
for c in string.letters + string.digits + '_,.-/':
    _urlEncode[c] = c
_urlEncode[' '] = '+'


def urlEncode(s):
    """
    Return the encoded version of the given string, safe for use in a URL.
    """
    return string.join(map(lambda c: _urlEncode[c], list(s)), '')


def urlDecode(s):
    """
    Return the decoded version of the given string. Note that invalid URLs
    will raise exceptions, e.g. a URL whose % escaping is incorrect.
    """
    mychr = chr
    atoi = string.atoi
    parts = string.split(string.replace(s, '+', ' '), '%')
    for i in range(1, len(parts)):
        part = parts[i]
        parts[i] = mychr(atoi(part[:2], 16)) + part[2:]
    return string.join(parts, '')
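
# A minimal round-trip sketch (illustrative values only, not part of the module):
#   urlEncode('a b/c.txt')  -> 'a+b/c.txt'
#   urlDecode('a+b%2fc')    -> 'a b/c'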


def htmlencode(s):
    """
    Escape the HTML special characters.
    """
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace("\"", "&quot;")
    s = s.replace("'", "&#39;")
    return s


def htmldecode(s):
    """
    Unescape the HTML special characters.
    """
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&quot;", "\"")
    s = s.replace("&#39;", "'")
    s = s.replace("&amp;", "&")
    return s
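
# Illustrative round trip (assumes only the entity set handled above):
#   htmlencode('<a href="x">') -> '&lt;a href=&quot;x&quot;&gt;'
#   htmldecode('&lt;b&gt;')    -> '<b>'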


def getContentDirectURL_GET(url, query):
    """
    Get the content of the URL with a GET request;
    `query` is an already url-encoded query string.
    """
    ret = ""
    try:
        if len(query) > 0:
            url = url + "?" + query
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = Request(url, None, txheaders)  # create a request object
        ret = urlopen(req)                   # and open it to return a handle on the url
    except HTTPError:
        return
    except URLError:
        return
    except IOError:
        return
    return ret
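
# Hedged usage sketch (the host and parameter here are made up):
#   handle = getContentDirectURL_GET('http://localhost/index.php', 'id=1')
#   if handle:
#       html = handle.read()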


def scan(currentURL):
    """
    The Scanner is the first part of Grabber.
    It retrieves all the information from the HTML page.
    TODO:
        read every href='' element for CSS and src='' for JavaScript / images
    """
    try:
        archives_hDl = getContentDirectURL_GET(currentURL, '')
    except IOError:
        print "IOError @ %s" % currentURL
        return
    try:
        htmlContent = archives_hDl.read()
    except IOError, e:
        print "Cannot open the file,", e.strerror
        return
    except AttributeError:
        print "Grabber cannot retrieve the given url: %s" % currentURL
        return
    parseHtmlLinks(currentURL, htmlContent)
    parseHtmlParams(currentURL, htmlContent)


def allowedExtensions(url):
    """
    True if the URL contains one of the allowed file extensions.
    """
    for e in allowed:
        if '.' + e in url:
            return True
    return False


def makeRoot(urlLocal):
    """
    Strip the file name from the URL to get the root directory.
    """
    if allowedExtensions(urlLocal):
        return urlLocal[0:urlLocal.rfind('/')+1]
    return urlLocal


def giveGoodURL(href, urlLocal):
    """
    Return an absolute, HTML-decoded URL for the link;
    href is the value retrieved from the href attribute.
    """
    if 'javascript' in href:
        return htmldecode(urlLocal)
    if 'http://' in href or 'https://' in href:
        if urlLocal in href:
            return htmldecode(href)
        else:
            return urlLocal
    if len(href) < 1:
        return htmldecode(urlLocal)
    if href[0] == '?' and '?' not in urlLocal:
        # href is only a query string: append it to the local URL
        if allowedExtensions(urlLocal):
            return htmldecode(urlLocal + href)
        return htmldecode(urlLocal + '/' + href)
    else:
        # simple relative name
        if allowedExtensions(urlLocal) or '?' in urlLocal:
            return htmldecode(urlLocal[0:urlLocal.rfind('/')+1] + href)
        else:
            return htmldecode(urlLocal + '/' + href)
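
# How the resolver behaves (illustrative values):
#   giveGoodURL('page.php', 'http://localhost/dir/index.php')
#       -> 'http://localhost/dir/page.php'
#   giveGoodURL('?id=2', 'http://localhost/dir/index.php')
#       -> 'http://localhost/dir/index.php?id=2'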


def dl(fileAddress, destFile):
    """
    Download the file to destFile.
    """
    try:
        f = urllib.urlopen(fileAddress)
        g = f.read()
        f.close()
        out = open(os.path.join('./', destFile), "wb")
    except IOError:
        return False
    out.write(g)
    out.close()
    return True


def removeSESSID(urlssid):
    """
    Remove the PHPSESSID information from the URL; we don't care about it here.
    """
    k = urlssid.find('PHPSESSID')
    if k > 0:
        return urlssid[0:k-1]
    k = urlssid.find('sid')
    if k > 0:
        return urlssid[0:k-1]
    return urlssid
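
# Example (illustrative URL):
#   removeSESSID('http://localhost/a.php?PHPSESSID=deadbeef')
#       -> 'http://localhost/a.php'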


def parseHtmlLinks(currentURL, htmlContent):
    """
    Parse the HTML/XHTML code to get JS, CSS, links, etc.
    """
    global database_url, database_js, database_css
    links = SoupStrainer('a')
    # listAnchors = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=links)]
    listAnchors = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=links):
        try:
            tagStr = str(tag).lower()
            if tagStr.count("href") > 0:
                listAnchors.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue

    for a in listAnchors:
        goodA = giveGoodURL(a, currentURL)
        goodA = removeSESSID(goodA)
        if (root in goodA) and (goodA not in database_url):
            database_url.append(goodA)

    # parse the CSS and the JavaScript
    script = SoupStrainer('script')
    # listScripts = [tag['src'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=script)]
    listScripts = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=script):
        try:
            tagStr = str(tag).lower()
            if tagStr.count("src") > 0 and tagStr.count(".src") < 1:
                listScripts.append(tag['src'])
        except TypeError:
            continue
        except KeyError:
            continue

    for a in listScripts:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_js:
            database_js.append(sc)
        if sc == currentURL:
            # giveGoodURL collapsed an external script to the current URL:
            # record the original external reference
            database_ext.append(a)
    parseJavaScriptCalls()

    link = SoupStrainer('link')
    # listLinks = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=link)]
    listLinks = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=link):
        try:
            tagStr = str(tag).lower()
            if tagStr.count("href") > 0:
                listLinks.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue

    for a in listLinks:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_css:
            database_css.append(sc)
    return True

jsChars = ["'", '"']

def rfindFirstJSChars(s):
    """
    Position of the last JS string delimiter in s (-1 if none).
    """
    return max([s.rfind(k) for k in jsChars])

regDumbParam = re.compile(r'(\w+)')
regDumbParamNumber = re.compile(r'(\d+)')

jsParams = ["'", '"', '=', '+', '%', '\\', ')', '(', '^', '*', '-']

def cleanListDumbParams(listDumb):
    """
    Keep only the word-like, non-numeric candidates.
    """
    newDumbList = []
    for w in listDumb:
        w = w.replace(' ', '')
        w = w.replace('\n', '')
        # l = [c for c in jsParams if c in w]  # no jsParams
        if len(w) > 0 and regDumbParam.match(w) and not regDumbParamNumber.match(w):
            newDumbList.append(w)
    return newDumbList

def unique(L):
    """
    Remove duplicates while preserving order.
    """
    noDupli = []
    for i in L:
        if i not in noDupli:
            noDupli.append(i)
    return noDupli

def flatten(L):
    """
    Flatten an arbitrarily nested list.
    """
    if type(L) != type([]):
        return [L]
    if L == []:
        return L
    return reduce(lambda L1, L2: L1 + L2, map(flatten, L))
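
# Quick illustration of the two helpers:
#   flatten([['a', ['b']], 'c'])   -> ['a', 'b', 'c']
#   unique(['a', 'b', 'a'])        -> ['a', 'b']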


def parseJavaScriptContent(jsContent):
    """
    Parse the content of a JavaScript file, harvesting links and
    parameter-looking tokens.
    """
    global database_url, database_ext, dumb_params
    for l in jsContent.readlines():
        for e in allowed:
            if l.count('.' + e) > 0:
                # we found a call
                if l.count('http://') > 0 and l.count(root) < 1:
                    # external link
                    et = '.' + e
                    b1 = l.find('http://')
                    b2 = l.find(et) + len(et)
                    database_ext.append(l[b1:b2])
                else:
                    # internal link
                    et = '.' + e
                    b2 = l.find(et) + len(et)
                    b1 = rfindFirstJSChars(l[:b2]) + 1
                    database_url.append(giveGoodURL(l[b1:b2], root))
        # try to get a parameter
        k = l.find('?')
        if k > 0:
            results = l[k:].split('?')
            plop = []
            for a in results:
                plop.append(cleanListDumbParams(regDumbParam.split(a)))
            dumb_params.append(flatten(plop))
        k = l.find('&')
        if k > 0:
            results = l[k:].split('&')
            plop = []
            for a in results:
                plop.append(cleanListDumbParams(regDumbParam.split(a)))
            dumb_params.append(flatten(plop))
    dumb_params = unique(flatten(dumb_params))

def parseJavaScriptCalls():
    """
    Download the JavaScript files and parse them.
    """
    global database_js
    for j in database_js:
        jsName = j[j.rfind('/')+1:]
        if not os.path.exists('local/js/' + jsName):
            # first download the file
            dl(j, 'local/js/' + jsName)
        try:
            jsContent = open('local/js/' + jsName, 'r')
        except IOError:
            continue
        parseJavaScriptContent(jsContent)
        jsContent.close()

def splitQuery(query_string):
    """
    Split a query string like num=plop&truc=kikoo&o=42 into a dictionary.
    """
    try:
        d = dict([x.split('=') for x in query_string.split('&')])
    except ValueError:
        d = {}
    return d
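
# Example:
#   splitQuery('num=plop&truc=kikoo&o=42')
#       -> {'num': 'plop', 'truc': 'kikoo', 'o': '42'}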

def dict_add(d1, d2):
    """
    Merge two dictionaries (d2 wins on conflicts).
    """
    d = {}
    if len(d1):
        for s in d1.keys():
            d[s] = d1[s]
    if len(d2):
        for s in d2.keys():
            d[s] = d2[s]
    return d

def dict_add_list(d1, l1):
    """
    Merge a dictionary with a list of keys, each given the
    placeholder value 'bar'.
    """
    d = {}
    if len(d1):
        for s in d1.keys():
            d[s] = d1[s]
    if len(l1):
        for s in l1:
            d[s] = 'bar'
    return d
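
# Example:
#   dict_add({'a': 1}, {'b': 2})      -> {'a': 1, 'b': 2}
#   dict_add_list({'a': 1}, ['q'])    -> {'a': 1, 'q': 'bar'}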

def parseHtmlParams(currentURL, htmlContent):
    """
    Parse the HTML to get the GET/POST arguments.
    """
    global database, database_css, database_js
    for url in database_url:
        k = url.find('?')
        if k > 0:
            keyUrl = url[0:k]
            query = url[k+1:]
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lG = dict_add(lG, splitQuery(query))
            database[keyUrl]['GET'] = lG
        elif len(dumb_params) > 0:
            keyUrl = url
            # no params in the URL... let's assign the dumb_params
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lP = database[keyUrl]['POST']
            lG = dict_add_list(lG, dumb_params)
            lP = dict_add_list(lP, dumb_params)
            database[keyUrl]['GET'] = lG
            database[keyUrl]['POST'] = lP

    # then, parse the forms
    forms = SoupStrainer('form')
    inputs = SoupStrainer('input')
    listForm = [tag for tag in BeautifulSoup(htmlContent, parseOnlyThese=forms)]
    for f in listForm:
        method = (f.get('method') or f.get('METHOD') or 'GET').upper()
        if method not in ('GET', 'POST'):
            # guard against exotic methods
            method = 'GET'
        action = f.get('action') or f.get('ACTION') or currentURL
        keyUrl = giveGoodURL(action, currentURL)
        listInput = [tag for tag in BeautifulSoup(str(f), parseOnlyThese=inputs)]
        for i in listInput:
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            try:
                value = i['value']
            except KeyError:
                value = '42'
            try:
                name = i['name']
            except KeyError:
                # an input without a name cannot be submitted: skip it
                continue
            lGP = database[keyUrl][method]
            lGP = dict_add(lGP, {name: value})
            database[keyUrl][method] = lGP
    return True


def runSpiderScan(entryUrl, depth=0):
    """
    Scan entryUrl and recursively spider every discovered URL, up to depth.
    """
    global outSpiderFile
    print "runSpiderScan @", entryUrl, " | #", depth
    if outSpiderFile:
        outSpiderFile.write("\t\t<entryURL>%s</entryURL>\n" % entryUrl)
    scan(entryUrl)
    if depth > 0 and len(database_url) > 0:
        for a in database_url:
            runSpiderScan(a, depth - 1)
        return False
    return True


def spider(entryUrl, depth=0):
    """
    Retrieve every link, then write the spider results to disk.
    """
    global root, outSpiderFile
    if depth > 0:
        root = makeRoot(entryUrl)
    else:
        root = entryUrl

    # test if the spider has already been run on this website
    try:
        f = open("local/spiderSite.xml", 'r')
        firstLine = f.readline()
        f.close()
        alreadyScanned = firstLine.count(root) > 0
    except IOError:
        alreadyScanned = False

    print "Start scanning...", root
    if depth == 0:
        scan(root)
    else:
        if not alreadyScanned:
            outSpiderFile = open("local/spiderSite.xml", "w")
            outSpiderFile.write("<spider root='%s' depth='%d'>\n" % (root, depth))
            runSpiderScan(root, depth)
            if len(dumb_params) > 0:
                outSpiderFile.write("<dumb_parameters>\n")
                for d in dumb_params:
                    outSpiderFile.write("\t<dumb>%s</dumb>\n" % d)
                outSpiderFile.write("</dumb_parameters>\n")
            outSpiderFile.write("\n</spider>")
            outSpiderFile.close()
        else:
            print "Loading the previous spider results from 'local/spiderSite.xml'"
            # load the XML file
            regUrl = re.compile(r'(.*)<entryURL>(.*)</entryURL>(.*)', re.I)
            regDmb = re.compile(r'(.*)<dumb>(.*)</dumb>(.*)', re.I)

            f = open("local/spiderSite.xml", 'r')
            for l in f.readlines():
                if regUrl.match(l):
                    out = regUrl.search(l)
                    database_url.append(out.group(2))
                if regDmb.match(l):
                    out = regDmb.search(l)
                    dumb_params.append(out.group(2))
            f.close()

            # scan every url
            for currentURL in database_url:
                try:
                    archives_hDl = getContentDirectURL_GET(currentURL, '')
                except IOError:
                    print "IOError @ %s" % currentURL
                    continue
                try:
                    htmlContent = archives_hDl.read()
                except IOError:
                    continue
                except AttributeError:
                    continue
                parseHtmlParams(currentURL, htmlContent)

    outSpiderFile = open("results/touchFiles.xml", "w")
    outSpiderFile.write("<spider root='%s'>\n" % root)
    for i in database_url:
        outSpiderFile.write("\t<url type='anchor'>%s</url>\n" % i)
    for i in database_js:
        outSpiderFile.write("\t<url type='JavaScript'>%s</url>\n" % i)
    for i in database_css:
        outSpiderFile.write("\t<url type='MetaLink'>%s</url>\n" % i)
    outSpiderFile.write("</spider>")
    outSpiderFile.close()

    if len(database_ext) > 0:
        # alert on external calls
        outSpiderFile = open("results/externalCalls.xml", "w")
        outSpiderFile.write("<external>\n")
        for i in database_ext:
            outSpiderFile.write("\t<call severity='high'>%s</call>\n" % i)
        outSpiderFile.write("</external>")
        outSpiderFile.close()