/XSSer/dork.py
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
"""
$Id$

This file is part of the xsser project, http://xsser.sourceforge.net.
Copyright (c) 2011/2012 psy <root@lordepsylon.net> - <epsylon@riseup.net>

xsser is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation version 3 of the License.

xsser is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along
with xsser; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import urlparse
import urllib2
import traceback

# default timeout for every socket (and thus every urllib2 request)
urllib2.socket.setdefaulttimeout(5.0)

from BeautifulSoup import BeautifulSoup

DEBUG = 1

class Dorker(object):
    def __init__(self, engine='bing'):
        self._engine = engine

    def dork(self, search):
- """
- Perform a search and return links.
- Uses -bing- engine by default.
- (http://en.wikipedia.org/wiki/List_of_search_engines)
- """
        urlpar = None       # query parameter holding the real target (redirect engines)
        divid = None        # id of the <div> that wraps the results
        unpack_func = None  # callback applied to each link before filtering
        css_class = None    # css class that marks result links
        raw_extract = None  # raw line-scanning extractor (bypasses BeautifulSoup)
        html_tok = 'a'      # tag searched for when using BeautifulSoup
        paging_arg = None   # would allow paging through results
        if self._engine == 'bing' or not self._engine: # works at 20-02-2011
            search_url = "http://www.bing.com/search?q=" + urllib2.quote(search)
            divid = 'results_container'
        elif self._engine == 'scroogle':
            search_url = "http://www.scroogle.org/cgi-bin/nbbw.cgi?q=" + urllib2.quote(search)
        elif self._engine == 'altavista': # works at 20-02-2011
            def altavista_func(href):
                href = href['href']
                # http://search.yahoo.com/r/_ylt=A0oG7p45zGBNl0MAuhQPxQt.;_ylu=X3oDMTByMTNuNTZzBHNlYwNzcgRwb3MDMgRjb2xvA2FjMgR2dGlkAw--/SIG=11942um5m/EXP=1298275769/**http%3a//money.cnn.com/
                if "**" in href:
                    return {'href': urlparse.unquote(href[href.rfind('**')+2:])}
            #divid = 'results' -> used by other altavista mirrors?
            def raw_extract(html_data, encoding):
                results = []
                for line in html_data.split("\n"):
                    if "<a class='res'" in line and "http" in line:
                        href = line[line.find("http"):line.rfind("'")]
                        results.append({'href': href})
                return results
            css_class = 'res'
            #unpack_func = altavista_func -> needed for other mirrors?
            #search_url = "http://us.yhs4.search.yahoo.com/yhs/search?fr=altavista&itag=ody&q=" + urllib2.quote(search)
            search_url = "http://es.altavista.com/web/results?fr=altavista&itag=ody&q=" + urllib2.quote(search)
        elif self._engine == 'duck': # seems hopeless at 20-02-2011
            search_url = "https://duckduckgo.com/?q=" + urllib2.quote(search)
        elif self._engine == 'baidu': # works at 20-02-2011
            #html_tok = 'span'
            #css_class = 'g'
            def raw_extract(html_data, encoding):
                results = []
                pos = 0
                while pos < len(html_data):
                    pos = html_data.find('span class="g">', pos)
                    if pos == -1:
                        break
                    href = html_data[pos+15:html_data.find('<', pos)].strip()
                    pos = pos + 1
                    if not href:
                        continue
                    href = href.split(" ")[0]
                    if not href.startswith('http'):
                        href = 'http://' + href
                    results.append({'href': href})
                return results
            search_url = "http://www.baidu.com/s?wd=" + urllib2.quote(search)
        elif self._engine == 'yandex': # works at 20-02-2011
            def raw_extract(html_data, encoding):
                results = []
                for line in html_data.split("\n"):
                    if 'class="b-serp-url__link"' in line and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                return results
            #css_class = 'b-serp-url__link'
            search_url = "http://yandex.ru/yandsearch?text=" + urllib2.quote(search)
        elif self._engine == 'yebol':
            divid = "Scrollbar-SearchResultsc"
            search_url = "http://www.yebol.com/a.jsp?x=0&y=0&key=" + urllib2.quote(search)
        elif self._engine == 'youdao':
            search_url = "http://www.youdao.com/search?q=" + urllib2.quote(search)
        #elif self._engine == 'ask': # does not work
        #    def raw_extract(html_data, encoding):
        #        results = []
        #        prevline = ""
        #        for line in html_data.split("\n"):
        #            if 'class="title txt_lg"' in line and "http" in prevline:
        #                href = prevline[prevline.find("http"):prevline.find('"',
        #                                prevline.find("http")+10)]
        #                results.append({'href': href})
        #            prevline = line
        #        return results
        #    search_url = "http://www.ask.com/web?q=" + urllib2.quote(search)
        elif self._engine == 'google': # works at 11/11/2011
            #def raw_extract(html_data, encoding):
            #    results = []
            #    prevline = ""
            #    for line in html_data.split("\n"):
            #        if 'class="r"' in line and "http" in prevline:
            #            href = prevline[prevline.find("http"):prevline.find('"',
            #                            prevline.find("http")+10)]
            #            results.append({'href': href})
            #        prevline = line
            #    return results
            search_url = "https://encrypted.google.com/search?hl=en&q=" + urllib2.quote(search)
        elif self._engine == 'yahoo': # works at 20-02-2011
            def raw_extract(html_data, encoding):
                results = []
                for line in html_data.split("\n"):
                    if 'class="yschttl spt"' in line and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                return results
            search_url = "http://search.yahoo.com/search?p=" + urllib2.quote(search)
        elif self._engine == 'sogou':
            search_url = "http://www.sogou.com/web?query=" + urllib2.quote(search)
        elif self._engine == 'rediff':
            search_url = "http://search1.rediff.com/dirsrch/default.asp?src=web&MT=" + urllib2.quote(search)
        elif self._engine == 'blekko':
            search_url = "http://blekko.com/ws/?q=" + urllib2.quote(search)
        elif self._engine == 'kosmix': # doesn't work properly
            def raw_extract(html_data, encoding):
                #print html_data # debug leftover
                results = []
                is_next = False
                for line in html_data.split("\n"):
                    #if 'class="www_result_url"' in line and "http" in line:
                    if '<h4>' in line and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                        is_next = False
                    if is_next and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                        is_next = False
                    elif '<h4>' in line:
                        is_next = True
                    else:
                        is_next = False
                return results
            search_url = "http://www.kosmix.com/topic/lala?q=" + urllib2.quote(search)
        elif self._engine == 'search': # works at 20-02-2011
            def raw_extract(html_data, encoding):
                results = []
                for line in html_data.split("\n"):
                    #if 'class="www_result_title"' in line and "http" in line:
                    if 'class="www_result_url"' in line and "http" in line:
                        href = line[line.find("http"):line.find('"', line.find("http")+10)]
                        results.append({'href': href})
                return results
            search_url = "http://www.search.ch/?q=" + urllib2.quote(search)
        elif self._engine == 'ifacnet':
            search_url = "http://www.ifacnet.com/?q=" + urllib2.quote(search)
        elif self._engine == 'bussines': # sic: key kept as historically spelled
            search_url = "http://www.business.com/search/rslt_default.asp?vt=all&type=web&query=" + urllib2.quote(search)
        elif self._engine == 'globalspec':
            search_url = "http://search.globalspec.com/Search?query=" + urllib2.quote(search)
        elif self._engine == 'taptu':
            search_url = "http://www.taptu.com/search/lite/results?term=" + urllib2.quote(search)
        elif self._engine == 'topix':
            search_url = "http://www.topix.com/search/article?q=" + urllib2.quote(search)
        elif self._engine == 'hakia':
            search_url = "http://hakia.com/search?q=" + urllib2.quote(search)
        elif self._engine == 'leapfish':
            search_url = "http://www.leapfish.com/web.aspx?q=" + urllib2.quote(search)
        #elif self._engine == 'webcrawler': # works at 20-02-2011
        #    urlpar = "rawURL"
        #    search_url = "http://www.webcrawler.com/webcrawler203/ws/results/Web/" + urllib2.quote(search) + "/1/417/TopNavigation/Relevance/iq=true/zoom=off/_iceUrlFlag=7?_IceUrl=true"
        elif self._engine == 'excite':
            search_url = "http://msxml.excite.com/excite/ws/results/Web/" + urllib2.quote(search) + "/1/0/0/Relevance/iq=true/zoom=off/_iceUrlFlag=7?_IceUrl=true"
        elif self._engine == 'yolink':
            search_url = "http://cloud.yolink.com/search/search?keywords=" + urllib2.quote(search)
        elif self._engine == 'lycos':
            search_url = "http://search.lycos.com/?tab=web&query=" + urllib2.quote(search)
        else:
            print "\nThis search engine is not supported. Check dork.py for the complete list.\n"
            return [] # without this, search_url below would be unbound
        try:
            self.search_url = search_url
            url = urllib2.urlopen(urllib2.Request(search_url,
                                  headers={'User-Agent':
                                  "Googlebot/2.1 (+http://www.google.com/bot.html)"}))
        except urllib2.URLError, e:
            if DEBUG:
                traceback.print_exc()
            raise Exception("Internal error dorking: " + str(e))
        html_data = url.read()
        # crude normalization: force one tag per line and patch quirky markup
        # so both the raw extractors and BeautifulSoup can cope with it
        html_data = html_data.replace(">", ">\n")
        html_data = html_data.replace("target=_", 'target="_')
        html_data = html_data.replace('\ >', '/>')
        html_data = html_data.replace('\>', '/>')
        html_data = html_data.replace('"">', '">')
        html_data = html_data.replace('</scr"+"ipt>', '</script>')
        content_type = url.headers['content-type']
        try:
            # e.g. "text/html; charset=utf-8"
            encoding = content_type.split(";")[1].split("=")[1].strip()
        except IndexError:
            encoding = 'utf-8'
        if raw_extract:
            links = raw_extract(html_data, encoding)
        else:
            try:
                soup = BeautifulSoup(html_data, fromEncoding=encoding)
            except Exception, e:
                traceback.print_exc()
                raise Exception("Internal error dorking: " + str(e))
            if divid:
                #print(html_data)
                soup = soup.find('div', {'id': divid})
                if soup is None: # the results container is missing
                    return []
            if css_class:
                links = soup.findAll(html_tok, {'class': css_class})
            else:
                links = soup.findAll(html_tok)
        found_links = []
        if unpack_func:
            links = map(unpack_func, links)
            links = filter(lambda s: s, links)
        for link in links:
            try:
                href = str(link['href'].encode('utf-8'))
            except KeyError:
                # this link has no href
                pass
            else:
                if (not href.startswith("/") and "microsofttranslator" not in href
                        and "bingj" not in href and "live.com" not in href
                        and "scroogle" not in href):
                    if urlpar:
                        # the engine wraps results in a redirect; recover the
                        # real target from the given query parameter
                        parsed = urlparse.urlparse(href)
                        q = urlparse.parse_qs(parsed.query)
                        if urlpar in q and q[urlpar]:
                            href = urlparse.unquote(q[urlpar][0])
                            found_links.append(href)
                    else:
                        found_links.append(href)
        return found_links
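
# A minimal sketch of the raw_extract(html_data, encoding) contract that the
# engine branches above rely on: scan the line-broken HTML and return a list
# of {'href': url} dicts. The marker class below is hypothetical; each real
# branch keys on whatever markup its engine's result pages actually emit.
def example_raw_extract(html_data, encoding):
    results = []
    for line in html_data.split("\n"):
        # 'result-link' is a made-up marker, not any real engine's markup
        if 'class="result-link"' in line and "http" in line:
            href = line[line.find("http"):line.find('"', line.find("http")+10)]
            results.append({'href': href})
    return results
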
if __name__ == '__main__':
    for a in ['google', 'altavista', 'yahoo', 'baidu', 'bing', 'webcrawler',
              'youdao', 'yandex']:
        dork = Dorker(a)
        res = dork.dork("lorea")
        print a, len(res)
        for b in res:
            print " *", b