
/src/pentest/fimap/xgoogle/sponsoredlinks.py

https://bitbucket.org/manaphassan/raspberry-pwn
#!/usr/bin/python
#
# Peteris Krumins (peter@catonmat.net)
# http://www.catonmat.net -- good coders code, great reuse
#
# http://www.catonmat.net/blog/python-library-for-google-sponsored-links-search/
#
# Code is licensed under MIT license.
#
import re
import time
import urllib
import random
from htmlentitydefs import name2codepoint
from BeautifulSoup import BeautifulSoup
from browser import Browser, BrowserError

#
# TODO: join GoogleSearch and SponsoredLinks classes under a single base class
#

class SLError(Exception):
    """ Sponsored Links Error """
    pass

class SLParseError(Exception):
    """
    Parse error in Google results.
    self.msg attribute contains explanation why parsing failed
    self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse
    Thrown only in debug mode
    """
    def __init__(self, msg, tag):
        self.msg = msg
        self.tag = tag

    def __str__(self):
        return self.msg

    def html(self):
        return self.tag.prettify()

# Unique sentinel: pass as get_all_results(sleep_function=...) to get the
# built-in randomized delay between page fetches.
GET_ALL_SLEEP_FUNCTION = object()

class SponsoredLink(object):
    """ a single sponsored link """
    def __init__(self, title, url, display_url, desc):
        self.title = title
        self.url = url
        self.display_url = display_url
        self.desc = desc

class SponsoredLinks(object):
    SEARCH_URL_0 = "http://www.google.com/sponsoredlinks?q=%(query)s&btnG=Search+Sponsored+Links&hl=en"
    NEXT_PAGE_0 = "http://www.google.com/sponsoredlinks?q=%(query)s&sa=N&start=%(start)d&hl=en"
    SEARCH_URL_1 = "http://www.google.com/sponsoredlinks?q=%(query)s&num=%(num)d&btnG=Search+Sponsored+Links&hl=en"
    NEXT_PAGE_1 = "http://www.google.com/sponsoredlinks?q=%(query)s&num=%(num)d&sa=N&start=%(start)d&hl=en"

    def __init__(self, query, random_agent=False, debug=False):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self._page = 0
        self.eor = False
        self.results_info = None
        self._results_per_page = 10

        if random_agent:
            self.browser.set_random_user_agent()

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)
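
    # Setting results_per_page to anything other than the default 10 makes
    # _get_results_page use the num=... URL variants, e.g.
    #   sl.results_per_page = 25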
    def get_results(self):
        if self.eor:
            return []
        page = self._get_results_page()
        info = self._extract_info(page)
        if self.results_info is None:
            self.results_info = info
        if info['to'] == info['total']:
            self.eor = True
        results = self._extract_results(page)
        if not results:
            self.eor = True
            return []
        self._page += 1
        return results
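
    # Example: page-wise consumption; each call fetches the next page until
    # the end of results:
    #   while True:
    #       links = sl.get_results()
    #       if not links:
    #           break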
    def _get_all_results_sleep_fn(self):
        return random.random()*5 + 1 # sleep from 1 - 6 seconds

    def get_all_results(self, sleep_function=None):
        # sleep_function returns the number of seconds to pause between
        # successive page fetches
        if sleep_function is GET_ALL_SLEEP_FUNCTION:
            sleep_function = self._get_all_results_sleep_fn
        if sleep_function is None:
            sleep_function = lambda: 0
        ret_results = []
        while True:
            res = self.get_results()
            if not res:
                return ret_results
            ret_results.extend(res)
            time.sleep(sleep_function())
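
    # Example: fetch every page, pausing 1-6 seconds between requests:
    #   results = sl.get_all_results(sleep_function=GET_ALL_SLEEP_FUNCTION)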
    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _extract_info(self, soup):
        empty_info = { 'from': 0, 'to': 0, 'total': 0 }
        stats_span = soup.find('span', id='stats')
        if not stats_span:
            return empty_info
        txt = ''.join(stats_span.findAll(text=True))
        txt = txt.replace(',', '').replace('&nbsp;', ' ')
        matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt)
        if not matches:
            return empty_info
        return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
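
    # Example: a stats line such as "Results 1 - 10 of about 2,540" parses,
    # after the commas are stripped, to {'from': 1, 'to': 10, 'total': 2540}.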
    def _get_results_page(self):
        if self._page == 0:
            if self._results_per_page == 10:
                url = SponsoredLinks.SEARCH_URL_0
            else:
                url = SponsoredLinks.SEARCH_URL_1
        else:
            if self._results_per_page == 10:
                url = SponsoredLinks.NEXT_PAGE_0
            else:
                url = SponsoredLinks.NEXT_PAGE_1

        safe_url = url % { 'query': urllib.quote_plus(self.query),
                           'start': self._page * self._results_per_page,
                           'num': self._results_per_page }

        try:
            page = self.browser.get_page(safe_url)
        except BrowserError, e:
            raise SLError, "Failed getting %s: %s" % (e.url, e.error)

        return BeautifulSoup(page)
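
    # Example: the second page (self._page == 1) of a default 10-per-page
    # search for "foo bar" fetches
    #   http://www.google.com/sponsoredlinks?q=foo+bar&sa=N&start=10&hl=en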
    def _extract_results(self, soup):
        results = soup.findAll('div', {'class': 'g'})
        ret_res = []
        for result in results:
            eres = self._extract_result(result)
            if eres:
                ret_res.append(eres)
        return ret_res
    def _extract_result(self, result):
        title, url = self._extract_title_url(result)
        display_url = self._extract_display_url(result)
        desc = self._extract_description(result) # Warning: removes 'cite' from the result
        if not title or not url or not display_url or not desc:
            return None
        return SponsoredLink(title, url, display_url, desc)
    def _extract_title_url(self, result):
        title_a = result.find('a')
        if not title_a:
            self._maybe_raise(SLParseError, "Title tag in sponsored link was not found", result)
            return None, None
        title = ''.join(title_a.findAll(text=True))
        title = self._html_unescape(title)
        url = title_a['href']
        match = re.search(r'q=(http[^&]+)&', url)
        if not match:
            self._maybe_raise(SLParseError, "URL inside a sponsored link was not found", result)
            return None, None
        url = urllib.unquote(match.group(1))
        return title, url
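
    # Example (hypothetical href): the anchor points at a Google redirect, so
    # for an href such as
    #   /aclk?sa=l&q=http%3A%2F%2Fwww.example.com%2F&num=5
    # the q= capture is unquoted to "http://www.example.com/".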
    def _extract_display_url(self, result):
        cite = result.find('cite')
        if not cite:
            self._maybe_raise(SLParseError, "<cite> not found inside result", result)
            return None
        return ''.join(cite.findAll(text=True))

    def _extract_description(self, result):
        cite = result.find('cite')
        if not cite:
            return None
        cite.extract()
        desc_div = result.find('div', {'class': 'line23'})
        if not desc_div:
            self._maybe_raise(SLParseError, "Description tag not found in sponsored link", result)
            return None
        desc_strs = desc_div.findAll(text=True)[0:-1]
        desc = ''.join(desc_strs)
        desc = desc.replace("\n", " ")
        desc = desc.replace("&nbsp;", " ")
        return self._html_unescape(desc)
    def _html_unescape(self, str):
        def entity_replacer(m):
            entity = m.group(1)
            if entity in name2codepoint:
                return unichr(name2codepoint[entity])
            else:
                return m.group(0)

        def ascii_replacer(m):
            cp = int(m.group(1))
            if cp <= 255:
                return unichr(cp)
            else:
                return m.group(0)

        # re.U is a compile flag (re.sub's fourth positional argument is the
        # replacement count), so the patterns are compiled explicitly
        s = re.compile(r'&#(\d+);', re.U).sub(ascii_replacer, str)
        return re.compile(r'&([^;]+);', re.U).sub(entity_replacer, s)
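
# Minimal usage sketch (assumes the 2009-era Google sponsored-links endpoint
# this scraper targets is still reachable; the query string is just an example):
if __name__ == "__main__":
    sl = SponsoredLinks("digital camera", random_agent=True)
    try:
        for link in sl.get_all_results(sleep_function=GET_ALL_SLEEP_FUNCTION):
            print "%s\n  %s\n  %s\n" % (link.title, link.display_url, link.url)
    except SLError, e:
        print "Search failed:", e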