
/src/pentest/grabber/spider.py

https://github.com/sullivanmatt/Raspberry-Pwn
Possible License(s): BSD-3-Clause, AGPL-1.0, MPL-2.0-no-copyleft-exception, GPL-2.0, GPL-3.0
#!/usr/bin/env python
"""
Spider Module for Grabber v0.1
Copyright (C) 2006 - Romain Gaucher - http://rgaucher.info
"""
import urllib
import time
import re, sys, os, string
import os.path
import cookielib
import urllib2
from BeautifulSoup import BeautifulSoup, SoupStrainer
from urllib2 import URLError, HTTPError

COOKIEFILE = 'cookies.lwp'   # the path and filename used to save the cookies

urlopen = urllib2.urlopen
cj = cookielib.LWPCookieJar()   # a subclass of FileCookieJar that has useful load and save methods
Request = urllib2.Request

txdata = None
refererUrl = "http://google.com/?q=you!"
txheaders = {'User-agent': 'Grabber/0.1 (X11; U; Linux i686; en-US; rv:1.7)', 'Referer': refererUrl}

# file extensions the spider is allowed to follow
allowed = ['php','html','htm','xml','xhtml','xht','xhtm',
           'asp','aspx','msp','mspx','php3','php4','php5','txt','shtm',
           'shtml','phtm','phtml','jhtml','pl','jsp','cfm','cfml','do','py',
           'js', 'css']

database = {}
database_url = []
database_css = []
database_js = []
database_ext = []   # database of insecure external links
local_url = []
dumb_params = []    # if there are no parameters associated with a given URL, associate this list of "whatever looks like" parameters
root = "http://localhost"
outSpiderFile = None

"""
database = {
    u"URL" : {'GET' : {'param1':value}, 'POST' : { 'param2' : value }},
    u"URL" : {'GET' : {'param1':value}, 'POST' : { 'param2' : value }},
    u"URL" : {'GET' : {'param1':value}, 'POST' : { 'param2' : value }}
}
"""

# lookup table for urlEncode(): every byte maps to its %XX escape,
# except alphanumerics and a few safe characters
_urlEncode = {}
for i in range(256):
    _urlEncode[chr(i)] = '%%%02x' % i
for c in string.letters + string.digits + '_,.-/':
    _urlEncode[c] = c
_urlEncode[' '] = '+'


def urlEncode(s):
    """
    Returns the encoded version of the given string, safe for using as a URL.
    """
    return string.join(map(lambda c: _urlEncode[c], list(s)), '')


def urlDecode(s):
    """
    Returns the decoded version of the given string. Note that invalid URLs
    will throw exceptions, for example a URL whose % coding is incorrect.
    """
    mychr = chr
    atoi = string.atoi
    parts = string.split(string.replace(s, '+', ' '), '%')
    for i in range(1, len(parts)):
        part = parts[i]
        parts[i] = mychr(atoi(part[:2], 16)) + part[2:]
    return string.join(parts, '')
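# Illustrative round-trip based on the encoding table above (the example
# values are assumptions, not part of the original module):
#   urlEncode('a b/c?x=1')     -> 'a+b/c%3fx%3d1'
#   urlDecode('a+b/c%3fx%3d1') -> 'a b/c?x=1'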
def htmlencode(s):
    """
    Escape the HTML special characters
    """
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace("\"", "&quot;")
    s = s.replace("'", "&apos;")
    return s


def htmldecode(s):
    """
    Unescape the HTML special characters
    """
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&quot;", "\"")
    s = s.replace("&apos;", "'")
    s = s.replace("&amp;", "&")
    return s


def getContentDirectURL_GET(url, string):
    """
    Get the content of the url by the GET method
    """
    ret = ""
    try:
        if len(string) > 0:
            url = url + "?" + string
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = Request(url, None, txheaders)   # create a request object
        ret = urlopen(req)                    # and open it to return a handle on the url
    except HTTPError, e:
        return
    except URLError, e:
        return
    except IOError:
        return
    return ret


def scan(currentURL):
    """
    The Scanner is the first part of Grabber.
    It retrieves every piece of information from the HTML page.
    TODO:
        read every href='' element for CSS and src='' for JavaScript / images
    """
    try:
        archives_hDl = getContentDirectURL_GET(currentURL, '')
    except IOError:
        print "IOError @ %s" % currentURL
    try:
        htmlContent = archives_hDl.read()
    except IOError, e:
        print "Cannot open the file,", e.strerror
        return
    except AttributeError:
        print "Grabber cannot retrieve the given url: %s" % currentURL
        return
    parseHtmlLinks(currentURL, htmlContent)
    parseHtmlParams(currentURL, htmlContent)


def allowedExtensions(plop):
    for e in allowed:
        if '.' + e in plop:
            return True
    return False


def makeRoot(urlLocal):
    if allowedExtensions(urlLocal):
        return urlLocal[0:urlLocal.rfind('/') + 1]
    return urlLocal
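# Illustrative behaviour, assumed examples (not in the original source):
#   makeRoot('http://site/dir/index.php') -> 'http://site/dir/'
#   makeRoot('http://site/dir')           -> 'http://site/dir'   (no known extension, left as-is)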
def giveGoodURL(href, urlLocal):
    """
    It should return a good url...
    href = the value retrieved from the href attribute
    """
    if 'javascript' in href:
        return htmldecode(urlLocal)
    if 'http://' in href or 'https://' in href:
        if urlLocal in href:
            return htmldecode(href)
        else:
            return urlLocal
    if len(href) < 1:
        return htmldecode(urlLocal)
    if href[0] == '?' and '?' not in urlLocal and not allowedExtensions(urlLocal):
        for e in allowed:
            if '.' + e in urlLocal:
                return htmldecode(urlLocal + href)
        return htmldecode(urlLocal + '/' + href)
    else:
        # simple name
        if allowedExtensions(urlLocal) or '?' in urlLocal:
            return htmldecode(urlLocal[0:urlLocal.rfind('/') + 1] + href)
        else:
            return htmldecode(urlLocal + '/' + href)
    return htmldecode(href)
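# Illustrative resolutions, assumed examples (not in the original source):
#   giveGoodURL('page.php?id=1', 'http://site/dir/index.php') -> 'http://site/dir/page.php?id=1'
#   giveGoodURL('http://other.example/', 'http://site/dir/')  -> 'http://site/dir/'   (foreign host: keep the local URL)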
def dl(fileAdress, destFile):
    """
    Download the file
    """
    try:
        f = urllib.urlopen(fileAdress)
        g = f.read()
        file = open(os.path.join('./', destFile), "wb")
    except IOError:
        return False
    file.write(g)
    file.close()
    return True


def removeSESSID(urlssid):
    """
    Remove the PHPSESSID information... we don't care about it for now
    """
    k = urlssid.find('PHPSESSID')
    if k > 0:
        return urlssid[0:k-1]
    k = urlssid.find('sid')
    if k > 0:
        return urlssid[0:k-1]
    return urlssid
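# Illustrative, assumed example (not in the original source):
#   removeSESSID('index.php?PHPSESSID=abc') -> 'index.php'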
def parseHtmlLinks(currentURL, htmlContent):
    """
    Parse the HTML/XHTML code to get JS, CSS, links etc.
    """
    global database_url, database_js, database_css
    links = SoupStrainer('a')
    # listAnchors = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=links)]
    listAnchors = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=links):
        try:
            string = str(tag).lower()
            if string.count("href") > 0:
                listAnchors.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listAnchors:
        goodA = giveGoodURL(a, currentURL)
        goodA = removeSESSID(goodA)
        if (root in goodA) and (goodA not in database_url):
            database_url.append(goodA)
    # parse the CSS and the JavaScript
    script = SoupStrainer('script')
    # listScripts = [tag['src'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=script)]
    listScripts = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=script):
        try:
            string = str(tag).lower()
            if string.count("src") > 0 and string.count(".src") < 1:
                listScripts.append(tag['src'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listScripts:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_js:
            database_js.append(sc)
        if sc == currentURL:
            # remote script
            database_ext.append(sc)
    parseJavaScriptCalls()
    link = SoupStrainer('link')
    # listLinks = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=link)]
    listLinks = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=link):
        try:
            string = str(tag).lower()
            if string.count("href") > 0:
                listLinks.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listLinks:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_css:
            database_css.append(sc)
    return True


jsChars = ["'", '"']


def rfindFirstJSChars(string):
    b = [string.rfind(k) for k in jsChars]
    return max(b)


regDumbParam = re.compile(r'(\w+)')
regDumbParamNumber = re.compile(r'(\d+)')
jsParams = ["'", '"', '=', '+', '%', '\\', ')', '(', '^', '*', '-']


def cleanListDumbParams(listDumb):
    newDumbList = []
    for w in listDumb:
        w = w.replace(' ', '')
        w = w.replace('\n', '')
        # l = [c for c in jsParams if c in w] # no jsParams
        if len(w) > 0 and regDumbParam.match(w) and not regDumbParamNumber.match(w):
            newDumbList.append(w)
    return newDumbList


def unique(L):
    noDupli = []
    [noDupli.append(i) for i in L if not noDupli.count(i)]
    return noDupli


def flatten(L):
    if type(L) != type([]):
        return [L]
    if L == []:
        return L
    return reduce(lambda L1, L2: L1 + L2, map(flatten, L))
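# Illustrative, assumed examples (not in the original source):
#   unique([1, 2, 2, 3])        -> [1, 2, 3]
#   flatten([[1, [2]], [3], 4]) -> [1, 2, 3, 4]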
def parseJavaScriptContent(jsContent):
    """
    Parse the content of a JavaScript file
    """
    global database_url, database_ext, dumb_params
    for l in jsContent.readlines():
        for e in allowed:
            if l.count('.' + e) > 0:
                # we found a reference to a page
                if l.count('http://') > 0 and l.count(root) < 1:
                    # external link
                    et = '.' + e
                    b1 = l.find('http://')
                    b2 = l.find(et) + len(et)
                    database_ext.append(l[b1:b2])
                else:
                    # internal link
                    et = '.' + e
                    b2 = l.find(et) + len(et)
                    b1 = rfindFirstJSChars(l[:b2]) + 1
                    database_url.append(giveGoodURL(l[b1:b2], root))
        # try to get a parameter
        k = l.find('?')
        if k > 0:
            results = l[k:].split('?')
            plop = []
            for a in results:
                plop.append(cleanListDumbParams(regDumbParam.split(a)))
            dumb_params.append(flatten(plop))
        k = l.find('&')
        if k > 0:
            results = l[k:].split('&')
            plop = []
            for a in results:
                plop.append(cleanListDumbParams(regDumbParam.split(a)))
            plop = flatten(plop)
            dumb_params.append(flatten(plop))
    dumb_params = unique(flatten(dumb_params))


def parseJavaScriptCalls():
    """
    Parse the JavaScript and download the files
    """
    global database_js
    for j in database_js:
        jsName = j[j.rfind('/') + 1:]
        if not os.path.exists('local/js/' + jsName):
            # first download the file
            dl(j, 'local/js/' + jsName)
        try:
            jsContent = open('local/js/' + jsName, 'r')
        except IOError:
            continue
        parseJavaScriptContent(jsContent)
        jsContent.close()


def splitQuery(query_string):
    """
    Split a query string like num=plop&truc=kikoo&o=42
    into a dictionary
    """
    try:
        d = dict([x.split('=') for x in query_string.split('&')])
    except ValueError:
        d = {}
    return d
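# Illustrative, based on the docstring above:
#   splitQuery('num=plop&truc=kikoo&o=42') -> {'num': 'plop', 'truc': 'kikoo', 'o': '42'}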
def dict_add(d1, d2):
    """
    Merge two dictionaries (entries from d2 win on conflicts)
    """
    d = {}
    if len(d1):
        for s in d1.keys():
            d[s] = d1[s]
    if len(d2):
        for s in d2.keys():
            d[s] = d2[s]
    return d


def dict_add_list(d1, l1):
    d = {}
    if len(d1):
        for s in d1.keys():
            d[s] = d1[s]
    if len(l1):
        for s in l1:
            d[s] = 'bar'
    return d


def parseHtmlParams(currentURL, htmlContent):
    """
    Parse the HTML to get the arguments (query strings and form inputs)
    """
    global database, database_css, database_js
    for url in database_url:
        k = url.find('?')
        if k > 0:
            keyUrl = url[0:k]     # the URL without its query string
            query = url[k + 1:]
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lG = dict_add(lG, splitQuery(query))
            database[keyUrl]['GET'] = lG
        elif len(dumb_params) > 0:
            keyUrl = url
            # no params in the URL... let's assign the dumb_params
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lP = database[keyUrl]['POST']
            lG = dict_add_list(lG, dumb_params)
            lP = dict_add_list(lP, dumb_params)
            database[keyUrl]['GET'] = lG
            database[keyUrl]['POST'] = lP
    # then, parse the forms
    forms = SoupStrainer('form')
    input = SoupStrainer('input')
    listForm = [tag for tag in BeautifulSoup(htmlContent, parseOnlyThese=forms)]
    for f in listForm:
        method = 'GET'
        if f.get('method'):
            method = f['method'].upper()
        action = currentURL
        if f.get('action'):
            action = f['action']
        keyUrl = giveGoodURL(action, currentURL)
        listInput = [tag for tag in BeautifulSoup(str(f), parseOnlyThese=input)]
        for i in listInput:
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            try:
                value = i['value']
            except KeyError:
                value = '42'
            try:
                name = i['name']
            except KeyError:
                # unnamed inputs are skipped
                continue
            lGP = database[keyUrl][method]
            lGP = dict_add(lGP, {name: value})
            database[keyUrl][method] = lGP
    return True
def runSpiderScan(entryUrl, depth=0):
    global outSpiderFile
    print "runSpiderScan @ ", entryUrl, " | #", depth
    if outSpiderFile:
        outSpiderFile.write("\t\t<entryURL>%s</entryURL>\n" % entryUrl)
    scan(entryUrl)
    if depth > 0 and len(database_url) > 0:
        for a in database_url:
            runSpiderScan(a, depth - 1)
        return False
    return True


def spider(entryUrl, depth=0):
    """
    Retrieve every link
    """
    global root, outSpiderFile
    if depth > 0:
        root = makeRoot(entryUrl)
    else:
        root = entryUrl
    # test if the spider has already been run on this website
    try:
        f = open("local/spiderSite.xml", 'r')
        firstLine = f.readline()
        f.close()
        if firstLine.count(root) > 0:
            alreadyScanned = True
        else:
            alreadyScanned = False
    except IOError:
        alreadyScanned = False
    print "Start scanning...", root
    if depth == 0:
        scan(root)
    else:
        if not alreadyScanned:
            outSpiderFile = open("local/spiderSite.xml", "w")
            outSpiderFile.write("<spider root='%s' depth='%d'>\n" % (root, depth))
            runSpiderScan(root, depth)
            if len(dumb_params) > 0:
                outSpiderFile.write("<dumb_parameters>\n")
                for d in dumb_params:
                    outSpiderFile.write("\t<dumb>%s</dumb>\n" % (d))
                outSpiderFile.write("</dumb_parameters>\n")
            outSpiderFile.write("\n</spider>")
            outSpiderFile.close()
        else:
            print "Loading the previous spider results from 'local/spiderSite.xml'"
            # load the XML file
            regUrl = re.compile(r'(.*)<entryURL>(.*)</entryURL>(.*)', re.I)
            regDmb = re.compile(r'(.*)<dumb>(.*)</dumb>(.*)', re.I)
            f = open("local/spiderSite.xml", 'r')
            for l in f.readlines():
                if regUrl.match(l):
                    out = regUrl.search(l)
                    url = out.group(2)
                    database_url.append(url)
                if regDmb.match(l):
                    out = regDmb.search(l)
                    param = out.group(2)
                    dumb_params.append(param)
            f.close()
            # scan every url
            for currentURL in database_url:
                try:
                    archives_hDl = getContentDirectURL_GET(currentURL, '')
                except IOError:
                    print "IOError @ %s" % currentURL
                    continue
                try:
                    htmlContent = archives_hDl.read()
                except IOError, e:
                    continue
                except AttributeError, e:
                    continue
                parseHtmlParams(currentURL, htmlContent)
    outSpiderFile = open("results/touchFiles.xml", "w")
    outSpiderFile.write("<spider root='%s'>\n" % root)
    for i in database_url:
        outSpiderFile.write("\t<url type='anchor'>%s</url>\n" % i)
    for i in database_js:
        outSpiderFile.write("\t<url type='JavaScript'>%s</url>\n" % i)
    for i in database_css:
        outSpiderFile.write("\t<url type='MetaLink'>%s</url>\n" % i)
    outSpiderFile.write("</spider>")
    outSpiderFile.close()
    if len(database_ext) > 0:
        # report the external calls
        outSpiderFile = open("results/externalCalls.xml", "w")
        outSpiderFile.write("<external>\n")
        for i in database_ext:
            outSpiderFile.write("\t<call severity='high'>%s</call>\n" % i)
        outSpiderFile.write("</external>")
        outSpiderFile.close()
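

# Illustrative standalone entry point (a sketch, not part of the original
# Grabber module): it simply drives the spider() function defined above and
# assumes the './local/js' and './results' output directories already exist.
if __name__ == '__main__':
    target = root
    depth = 1
    if len(sys.argv) > 1:
        target = sys.argv[1]
    if len(sys.argv) > 2:
        depth = int(sys.argv[2])
    spider(target, depth)
    print "%d URL(s) collected by the spider" % len(database_url)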