
/XSSer/dork.py

https://bitbucket.org/badc0re/xsser_gsoc
Python | 273 lines | 201 code | 7 blank | 65 comment | 79 complexity | a1ec67e19f976071f3c753d76fe20b25 MD5
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
"""
$Id$

This file is part of the xsser project, http://xsser.sourceforge.net.

Copyright (c) 2011/2012 psy <root@lordepsylon.net> - <epsylon@riseup.net>

xsser is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation version 3 of the License.

xsser is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along
with xsser; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import urlparse
import urllib2
import traceback
urllib2.socket.setdefaulttimeout(5.0)

from BeautifulSoup import BeautifulSoup

DEBUG = 1
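
# Dorker wraps a set of search-engine "dorking" backends: given an engine
# name and a query, dork() fetches one page of results and scrapes the
# outbound links from it, either with a per-engine raw_extract() line
# scanner or with BeautifulSoup selectors configured in that engine's branch.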
class Dorker(object):
    def __init__(self, engine='bing'):
        self._engine = engine

    def dork(self, search):
        """
        Perform a search and return links.

        Uses -bing- engine by default.
        (http://en.wikipedia.org/wiki/List_of_search_engines)
        """
        urlpar = None
        divid = None
        unpack_func = None
        css_class = None
        raw_extract = None
        html_tok = 'a'
        paging_arg = None # allows paging
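        # Each engine branch below configures the scrape in one of two ways:
        # either it defines a raw_extract(html_data, encoding) function that
        # line-scans the page itself, or it leaves raw_extract as None and
        # sets divid/css_class/html_tok so BeautifulSoup can pick the links.
        # urlpar names a query-string parameter holding a wrapped redirect
        # target, and unpack_func post-processes each extracted link dict.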
        if self._engine == 'bing' or not self._engine: # works at 20-02-2011
            search_url = "http://www.bing.com/search?q=" + urllib2.quote(search)
            divid = 'results_container'
        elif self._engine == 'scroogle':
            search_url = "http://www.scroogle.org/cgi-bin/nbbw.cgi?q=" + urllib2.quote(search)
        elif self._engine == 'altavista': # works at 20-02-2011
            def altavista_func(href):
                href = href['href']
                # http://search.yahoo.com/r/_ylt=A0oG7p45zGBNl0MAuhQPxQt.;_ylu=X3oDMTByMTNuNTZzBHNlYwNzcgRwb3MDMgRjb2xvA2FjMgR2dGlkAw--/SIG=11942um5m/EXP=1298275769/**http%3a//money.cnn.com/
                if "**" in href:
                    return {'href':urlparse.unquote(href[href.rfind('**')+2:])}
            #divid = 'results' -> in other altavista?
            def raw_extract(html_data, encoding):
                results = []
                for line in html_data.split("\n"):
                    if "<a class='res'" in line and "http" in line:
                        href = line[line.find("http"):line.rfind("'")]
                        results.append({'href': href})
                return results
            css_class = 'res'
            #unpack_func = altavista_func -> in others?
            #search_url = "http://us.yhs4.search.yahoo.com/yhs/search?fr=altavista&itag=ody&q=" + urllib2.quote(search)
            search_url = "http://es.altavista.com/web/results?fr=altavista&itag=ody&q=" + urllib2.quote(search)
        elif self._engine == 'duck': # seems hopeless at 20-02-2011
            search_url = "https://duckduckgo.com/?q=" + urllib2.quote(search)
        elif self._engine == 'baidu': # works at 20-02-2011
            #html_tok = 'span'
            #css_class = 'g'
            def raw_extract(html_data, encoding):
                results = []
                pos = 0
                while pos < len(html_data):
                    pos = html_data.find('span class="g">', pos)
                    if pos == -1:
                        break
                    href = html_data[pos+15:html_data.find('<', pos)].strip()
                    pos = pos + 1
                    if not href:
                        continue
                    href = href.split(" ")[0]
                    if not href.startswith('http'):
                        href = 'http://'+href
                    results.append({'href': href})
                return results
            search_url = "http://www.baidu.com/s?wd=" + urllib2.quote(search)
        elif self._engine == 'yandex': # works at 20-02-2011
            def raw_extract(html_data, encoding):
                results = []
                for line in html_data.split("\n"):
                    if 'class="b-serp-url__link"' in line and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                return results
            #css_class = 'b-serp-url__link'
            search_url = "http://yandex.ru/yandsearch?text=" + urllib2.quote(search)
        elif self._engine == 'yebol':
            divid = "Scrollbar-SearchResultsc"
            search_url = "http://www.yebol.com/a.jsp?x=0&y=0&key=" + urllib2.quote(search)
        elif self._engine == 'youdao':
            search_url = "http://www.youdao.com/search?q=" + urllib2.quote(search)
        #elif self._engine == 'ask': # does not work
        #    def raw_extract(html_data, encoding):
        #        results = []
        #        prevline = ""
        #        for line in html_data.split("\n"):
        #            if 'class="title txt_lg"' in line and "http" in prevline:
        #                href = prevline[prevline.find("http"):prevline.find('"',
        #                                prevline.find("http")+10)]
        #                results.append({'href': href})
        #            prevline = line
        #        return results
        #    search_url = "http://www.ask.com/web?q=" + urllib2.quote(search)
        elif self._engine == 'google': # works at 11/11/2011
            #def raw_extract(html_data, encoding):
            #    results = []
            #    prevline = ""
            #    for line in html_data.split("\n"):
            #        if 'class="r"' in line and "http" in prevline:
            #            href = prevline[prevline.find("http"):prevline.find('"',
            #                            prevline.find("http")+10)]
            #            results.append({'href': href})
            #        prevline = line
            #    return results
            search_url = "https://encrypted.google.com/search?hl=en&q=" + urllib2.quote(search)
        elif self._engine == 'yahoo': # works at 20-02-2011
            def raw_extract(html_data, encoding):
                results = []
                for line in html_data.split("\n"):
                    if 'class="yschttl spt"' in line and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                return results
            search_url = "http://search.yahoo.com/search?p=" + urllib2.quote(search)
        elif self._engine == 'sogou':
            search_url = "http://www.sogou.com/web?query=" + urllib2.quote(search)
        elif self._engine == 'rediff':
            search_url = "http://search1.rediff.com/dirsrch/default.asp?src=web&MT=" + urllib2.quote(search)
        elif self._engine == 'blekko':
            search_url = "http://blekko.com/ws/?q=" + urllib2.quote(search)
        elif self._engine == 'kosmix': # doesn't work properly
            def raw_extract(html_data, encoding):
                print html_data
                results = []
                is_next = False
                for line in html_data.split("\n"):
                    #if 'class="www_result_url"' in line and "http" in line:
                    if '<h4>' in line and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                        is_next = False
                    if is_next and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                        is_next = False
                    elif '<h4>' in line:
                        is_next = True
                    else:
                        is_next = False
                return results
            search_url = "http://www.kosmix.com/topic/lala?q=" + urllib2.quote(search)
        elif self._engine == 'search': # works at 20-02-2011
            def raw_extract(html_data, encoding):
                results = []
                for line in html_data.split("\n"):
                    if 'class="www_result_url"' in line and "http" in line:
                        #if 'class="www_result_title"' in line and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                return results
            search_url = "http://www.search.ch/?q=" + urllib2.quote(search)
        elif self._engine == 'ifacnet':
            search_url = "http://www.ifacnet.com/?q=" + urllib2.quote(search)
        elif self._engine == 'bussines': # [sic]
            search_url = "http://www.business.com/search/rslt_default.asp?vt=all&type=web&query=" + urllib2.quote(search)
        elif self._engine == 'globalspec':
            search_url = "http://search.globalspec.com/Search?query=" + urllib2.quote(search)
        elif self._engine == 'taptu':
            search_url = "http://www.taptu.com/search/lite/results?term=" + urllib2.quote(search)
        elif self._engine == 'topix':
            search_url = "http://www.topix.com/search/article?q=" + urllib2.quote(search)
        elif self._engine == 'hakia':
            search_url = "http://hakia.com/search?q=" + urllib2.quote(search)
        elif self._engine == 'leapfish':
            search_url = "http://www.leapfish.com/web.aspx?q=" + urllib2.quote(search)
        #elif self._engine == 'webcrawler': # works at 20-02-2011
        #    urlpar = "rawURL"
        #    search_url = "http://www.webcrawler.com/webcrawler203/ws/results/Web/" + urllib2.quote(search) + "/1/417/TopNavigation/Relevance/iq=true/zoom=off/_iceUrlFlag=7?_IceUrl=true"
        elif self._engine == 'excite':
            search_url = "http://msxml.excite.com/excite/ws/results/Web/" + urllib2.quote(search) + "/1/0/0/Relevance/iq=true/zoom=off/_iceUrlFlag=7?_IceUrl=true"
        elif self._engine == 'yolink':
            search_url = "http://cloud.yolink.com/search/search?keywords=" + urllib2.quote(search)
        elif self._engine == 'lycos':
            search_url = "http://search.lycos.com/?tab=web&query=" + urllib2.quote(search)
        else:
            print "\nThis search engine is not supported. Check dork.py for the complete list.\n"
            # Bail out early: search_url is unbound here, so falling
            # through would raise a NameError below.
            return []
        try:
            self.search_url = search_url
            url = urllib2.urlopen(urllib2.Request(search_url,
                                  headers={'User-Agent':
                                           "Googlebot/2.1 (+http://www.google.com/bot.html)"}))
        except urllib2.URLError, e:
            if DEBUG:
                traceback.print_exc()
            raise Exception("Internal error dorking: " + e.message)
        html_data = url.read()
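        # Normalize the fetched markup before parsing: the ">" -> ">\n"
        # rewrite puts roughly one tag per line, which the line-oriented
        # raw_extract() scrapers above rely on; the remaining rewrites
        # seem aimed at sloppy or obfuscated HTML so BeautifulSoup copes.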
        html_data = html_data.replace(">", ">\n")
        html_data = html_data.replace("target=_", 'target="_')
        html_data = html_data.replace('\ >', '/>')
        html_data = html_data.replace('\>', '/>')
        html_data = html_data.replace('"">', '">')
        html_data = html_data.replace('</scr"+"ipt>', '</script>')
        content_type = url.headers['content-type']
        try:
            encoding = content_type.split(";")[1].split("=")[1].strip()
        except:
            encoding = 'utf-8'
        if raw_extract:
            links = raw_extract(html_data, encoding)
        else:
            try:
                soup = BeautifulSoup(html_data, fromEncoding=encoding)
            except Exception, e:
                traceback.print_exc()
                raise Exception("Internal error dorking: " + e.message)
            if divid:
                #print(html_data)
                soup = soup.find('div', {'id': divid})
            if css_class:
                links = soup.findAll(html_tok, {'class': css_class})
            else:
                links = soup.findAll(html_tok)
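        # Post-process the raw links: unpack_func may rewrite each link
        # dict (entries mapped to None are dropped by the filter), then
        # relative links and the engines' own helper domains (translator,
        # cache, scroogle) are weeded out below.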
        found_links = []
        if unpack_func:
            links = map(unpack_func, links)
            links = filter(lambda s: s, links)
        for link in links:
            try:
                href = str(link['href'].encode('utf-8'))
            except KeyError:
                # this link has no href
                pass
            else:
                if (not href.startswith("/") and "microsofttranslator" not in href
                        and "bingj" not in href and "live.com" not in href
                        and "scroogle" not in href):
                    if urlpar:
                        parsed = urlparse.urlparse(href)
                        q = urlparse.parse_qs(parsed.query)
                        if urlpar in q and q[urlpar]:
                            href = urlparse.unquote(q[urlpar][0])
                            found_links.append(href)
                    else:
                        found_links.append(href)
        return found_links
if __name__ == '__main__':
    for a in ['google', 'altavista', 'yahoo', 'baidu', 'bing', 'webcrawler',
              'youdao', 'yandex']:
        dork = Dorker(a)
        res = dork.dork("lorea")
        print a, len(res)
        for b in res:
            print " *", b