
/src/pentest/fimap/xgoogle/search.py

https://github.com/sullivanmatt/Raspberry-Pwn
Possible License(s): BSD-3-Clause, AGPL-1.0, MPL-2.0-no-copyleft-exception, GPL-2.0, GPL-3.0
#!/usr/bin/python
#
# Peteris Krumins (peter@catonmat.net)
# http://www.catonmat.net -- good coders code, great reuse
#
# http://www.catonmat.net/blog/python-library-for-google-search/
#
# Code is licensed under MIT license.
#

import re
import urllib
from htmlentitydefs import name2codepoint
from BeautifulSoup import BeautifulSoup
from browser import Browser, BrowserError

class SearchError(Exception):
    """
    Base class for Google Search exceptions.
    """
    pass

class ParseError(SearchError):
    """
    Parse error in Google results.
    The self.msg attribute explains why parsing failed.
    The self.tag attribute holds the BeautifulSoup object with the most relevant tag that failed to parse.
    Raised only in debug mode.
    """
    def __init__(self, msg, tag):
        self.msg = msg
        self.tag = tag

    def __str__(self):
        return self.msg

    def html(self):
        return self.tag.prettify()

class SearchResult:
    def __init__(self, title, url, desc):
        self.title = title
        self.url = url
        self.desc = desc

    def __str__(self):
        return 'Google Search Result: "%s"' % self.title

class GoogleSearch(object):
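    # URL templates: the *_0 variants rely on Google's default of 10 results
    # per page, while the *_1 variants pass the page size explicitly via the
    # num parameter. The NEXT_PAGE_* variants add a start offset for pages
    # after the first.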
    SEARCH_URL_0 = "http://www.google.com/search?q=%(query)s&btnG=Google+Search"
    NEXT_PAGE_0 = "http://www.google.com/search?q=%(query)s&start=%(start)d"
    SEARCH_URL_1 = "http://www.google.com/search?q=%(query)s&num=%(num)d&btnG=Google+Search"
    NEXT_PAGE_1 = "http://www.google.com/search?q=%(query)s&num=%(num)d&start=%(start)d"

    def __init__(self, query, random_agent=False, debug=False, page=0):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.results_info = None
        self.eor = False # end of results
        self._page = page
        self._results_per_page = 10
        self._last_from = 0

        if random_agent:
            self.browser.set_random_user_agent()

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)
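
    # get_results() returns one page of hits per call and advances the page
    # counter; it returns [] and sets self.eor once Google stops serving
    # new results.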
    def get_results(self):
        """ Gets a page of results """
        if self.eor:
            return []

        MAX_VALUE = 1000000  # sentinel total, since result-count parsing is disabled below
        page = self._get_results_page()
        #search_info = self._extract_info(page)
        results = self._extract_results(page)
        # Synthesize the page info instead of parsing Google's result counter.
        search_info = {'from': self.results_per_page * self._page,
                       'to': self.results_per_page * self._page + len(results),
                       'total': MAX_VALUE}
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        if not results:
            self.eor = True
            return []
        # Google served the same offset again: treat it as the end of results.
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _get_results_page(self):
        if self._page == 0:
            if self._results_per_page == 10:
                url = GoogleSearch.SEARCH_URL_0
            else:
                url = GoogleSearch.SEARCH_URL_1
        else:
            if self._results_per_page == 10:
                url = GoogleSearch.NEXT_PAGE_0
            else:
                url = GoogleSearch.NEXT_PAGE_1

        safe_url = url % {'query': urllib.quote_plus(self.query),
                          'start': self._page * self._results_per_page,
                          'num': self._results_per_page}

        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))

        return BeautifulSoup(page)
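
    # The parsers below are tied to the Google result markup of the time:
    # a <div id="ssb"> holding the result count and <li class="g"> items for
    # individual hits. When the markup changes they simply return empty data.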
    def _extract_info(self, soup):
        empty_info = {'from': 0, 'to': 0, 'total': 0}
        div_ssb = soup.find('div', id='ssb')
        if not div_ssb:
            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
            return empty_info
        p = div_ssb.find('p')
        if not p:
            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
            return empty_info
        txt = ''.join(p.findAll(text=True))
        txt = txt.replace(',', '')
        matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt, re.U)
        if not matches:
            return empty_info
        return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}

    def _extract_results(self, soup):
        results = soup.findAll('li', {'class': 'g'})
        ret_res = []
        for result in results:
            eres = self._extract_result(result)
            if eres:
                ret_res.append(eres)
        return ret_res

    def _extract_result(self, result):
        title, url = self._extract_title_url(result)
        desc = self._extract_description(result)
        if not title or not url or not desc:
            return None
        return SearchResult(title, url, desc)
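
    # Google often wraps result links as /url?q=<target>&...; the regexp
    # below unwraps them back to the target URL.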
    def _extract_title_url(self, result):
        #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
        title_a = result.find('a')
        if not title_a:
            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
            return None, None
        title = ''.join(title_a.findAll(text=True))
        title = self._html_unescape(title)
        url = title_a['href']
        match = re.match(r'/url\?q=(http[^&]+)&', url)
        if match:
            url = urllib.unquote(match.group(1))
        return title, url

    def _extract_description(self, result):
        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
        if not desc_div:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None

        desc_strs = []
        def looper(tag):
            if not tag:
                return
            for t in tag:
                try:
                    if t.name == 'br':
                        break
                except AttributeError:
                    pass
                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_div)
        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>

        desc = ''.join(s for s in desc_strs if s)
        return self._html_unescape(desc)

    def _html_unescape(self, text):
        def entity_replacer(m):
            entity = m.group(1)
            if entity in name2codepoint:
                return unichr(name2codepoint[entity])
            else:
                return m.group(0)

        def ascii_replacer(m):
            cp = int(m.group(1))
            if cp <= 255:
                return unichr(cp)
            else:
                return m.group(0)

        # Note: flags must be passed by keyword; as the fourth positional
        # argument, re.sub would silently treat re.U as the substitution count.
        s = re.sub(r'&#(\d+);', ascii_replacer, text, flags=re.U)
        return re.sub(r'&([^;]+);', entity_replacer, s, flags=re.U)
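
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original library). Assumes network access and
# that Google's result markup still matches the parsers above; Google changes
# its HTML often, so on a modern results page this may yield nothing.
if __name__ == '__main__':
    gs = GoogleSearch('python google search', random_agent=True)
    gs.results_per_page = 10
    while True:
        results = gs.get_results()
        if not results:  # gs.eor is set once Google stops returning pages
            break
        for res in results:
            print res.title.encode('utf8')
            print res.url
            print res.desc.encode('utf8')
            print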