PageRenderTime 51ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/modules/search.py

https://github.com/0x705h/jenni
Python | 341 lines | 317 code | 10 blank | 14 comment | 12 complexity | eb6bffd213022e93d1800049ef3dc4f9 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. search.py - jenni Web Search Module
  4. Copyright 2009-2013, Michael Yanovich (yanovich.net)
  5. Copyright 2013, Edward Powell (embolalia.net)
  6. Copyright 2008-2013 Sean B. Palmer (inamidst.com)
  7. Licensed under the Eiffel Forum License 2.
  8. More info:
  9. * jenni: https://github.com/myano/jenni/
  10. * Phenny: http://inamidst.com/phenny/
  11. """
  12. import json
  13. import re
  14. import urllib
  15. import web
## Matches any markup tag except those that start with "<!"
## (the negative lookahead skips declarations and comments).
r_tag = re.compile(r'<(?!!)[^>]+>')
  17. def remove_spaces(x):
  18. if ' ' in x:
  19. x = x.replace(' ', ' ')
  20. return remove_spaces(x)
  21. else:
  22. return x
  23. class Grab(web.urllib.URLopener):
  24. def __init__(self, *args):
  25. self.version = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
  26. web.urllib.URLopener.__init__(self, *args)
  27. self.addheader('Referer', 'https://github.com/myano/jenni')
  28. self.addheader('Accept', '*/*')
  29. def http_error_default(self, url, fp, errcode, errmsg, headers):
  30. return web.urllib.addinfourl(fp, [headers, errcode], "http:" + url)
  31. def google_ajax(query):
  32. """Search using AjaxSearch, and return its JSON."""
  33. if isinstance(query, unicode):
  34. query = query.encode('utf-8')
  35. uri = 'https://ajax.googleapis.com/ajax/services/search/web'
  36. args = '?v=1.0&safe=off&q=' + web.urllib.quote(query)
  37. handler = web.urllib._urlopener
  38. web.urllib._urlopener = Grab()
  39. bytes = web.get(uri + args)
  40. web.urllib._urlopener = handler
  41. return json.loads(bytes)
  42. def google_search(query):
  43. results = google_ajax(query)
  44. try: return results['responseData']['results'][0]['unescapedUrl']
  45. except IndexError: return None
  46. except TypeError:
  47. print results
  48. return False
  49. def google_count(query):
  50. results = google_ajax(query)
  51. if not results.has_key('responseData'): return '0'
  52. if not results['responseData'].has_key('cursor'): return '0'
  53. if not results['responseData']['cursor'].has_key('estimatedResultCount'):
  54. return '0'
  55. return results['responseData']['cursor']['estimatedResultCount']
  56. def formatnumber(n):
  57. """Format a number with beautiful commas."""
  58. parts = list(str(n))
  59. for i in range((len(parts) - 3), 0, -3):
  60. parts.insert(i, ',')
  61. return ''.join(parts)
  62. def g(jenni, input):
  63. """Queries Google for the specified input."""
  64. query = input.group(2)
  65. if not query:
  66. return jenni.reply('.g what?')
  67. query = query.encode('utf-8')
  68. uri = google_search(query)
  69. if uri:
  70. if 'wikipedia.org/' in uri:
  71. uri = uri.replace('http:', 'https:')
  72. jenni.reply(uri)
  73. if not hasattr(jenni, 'last_seen_uri'):
  74. jenni.bot.last_seen_uri = {}
  75. jenni.bot.last_seen_uri[input.sender] = uri
  76. elif uri is False: jenni.reply("Problem getting data from Google.")
  77. else: jenni.reply("No results found for '%s'." % query)
  78. g.commands = ['g']
  79. g.priority = 'high'
  80. g.example = '.g swhack'
  81. def gc(jenni, input):
  82. """Returns the number of Google results for the specified input."""
  83. query = input.group(2)
  84. if not query:
  85. return jenni.reply('.gc what?')
  86. query = query.encode('utf-8')
  87. num = formatnumber(google_count(query))
  88. jenni.say(query + ': ' + num)
  89. gc.commands = ['gc']
  90. gc.priority = 'high'
  91. gc.example = '.gc extrapolate'
  92. r_query = re.compile(
  93. r'\+?"[^"\\]*(?:\\.[^"\\]*)*"|\[[^]\\]*(?:\\.[^]\\]*)*\]|\S+'
  94. )
  95. def gcs(jenni, input):
  96. if not input.group(2):
  97. return jenni.reply("Nothing to compare.")
  98. queries = r_query.findall(input.group(2))
  99. if len(queries) > 6:
  100. return jenni.reply('Sorry, can only compare up to six things.')
  101. results = []
  102. for i, query in enumerate(queries):
  103. query = query.strip('[]')
  104. query = query.encode('utf-8')
  105. n = int((formatnumber(google_count(query)) or '0').replace(',', ''))
  106. results.append((n, query))
  107. if i >= 2: __import__('time').sleep(0.25)
  108. if i >= 4: __import__('time').sleep(0.25)
  109. results = [(term, n) for (n, term) in reversed(sorted(results))]
  110. reply = ', '.join('%s (%s)' % (t, formatnumber(n)) for (t, n) in results)
  111. jenni.say(reply)
  112. gcs.commands = ['gcs', 'comp']
  113. r_bing = re.compile(r'<h3><a href="([^"]+)"')
  114. def bing_search(query, lang='en-GB'):
  115. query = web.urllib.quote(query)
  116. base = 'http://www.bing.com/search?mkt=%s&q=' % lang
  117. bytes = web.get(base + query)
  118. m = r_bing.search(bytes)
  119. if m: return m.group(1)
  120. def bing(jenni, input):
  121. """Queries Bing for the specified input."""
  122. query = input.group(2)
  123. if query.startswith(':'):
  124. lang, query = query.split(' ', 1)
  125. lang = lang[1:]
  126. else: lang = 'en-GB'
  127. if not query:
  128. return jenni.reply('.bing what?')
  129. query = query.encode('utf-8')
  130. uri = bing_search(query, lang)
  131. if uri:
  132. jenni.reply(uri)
  133. if not hasattr(jenni, 'last_seen_uri'):
  134. jenni.bot.last_seen_uri = {}
  135. jenni.bot.last_seen_uri[input.sender] = uri
  136. else: jenni.reply("No results found for '%s'." % query)
  137. bing.commands = ['bing']
  138. bing.example = '.bing swhack'
  139. def duck_sanitize(incoming):
  140. return web.decode((incoming).decode('utf-8'))
  141. def duck_zero_click_scrape(html):
  142. '''Scrape DDG HTML page for Zero-Click'''
  143. try:
  144. ## prefer to use BeautifulSoup
  145. from BeautifulSoup import BeautifulSoup
  146. except:
  147. ## if BS is not available, just fail out here
  148. return str()
  149. soup = BeautifulSoup(html)
  150. zero_click = str()
  151. if soup('div', {'class': 'zero-click-result'}):
  152. zero_click = str(soup('div', {'class': 'zero-click-result'})[0])
  153. output = r_tag.sub('', zero_click).strip()
  154. output = output.replace('\n', '').replace('\t', '')
  155. output = remove_spaces(output)
  156. return output
  157. def duck_search(query):
  158. '''Do a DuckDuckGo Search'''
  159. ## grab results from the API for the query
  160. duck_api_results = duck_api(query)
  161. ## output is a string of the URL result
  162. ## try to find the first result
  163. if 'Results' in duck_api_results and min_size('Results', duck_api_results):
  164. ## 'Results' is the most common place to look for the first result
  165. output = duck_api_results['Results'][0]['FirstURL']
  166. elif 'AbstractURL' in duck_api_results and min_size('AbstractURL', duck_api_results):
  167. ## if there is no 'result', let's try AbstractURL
  168. ## this is usually a wikipedia article
  169. output = duck_api_results['AbstractURL']
  170. elif 'RelatedTopics' in duck_api_results and min_size('RelatedTopics', duck_api_results):
  171. ## if we still can't find a search result, let's grab a topic URL
  172. ## this is usually vaguely related to the search query
  173. ## many times this is a wikipedia result
  174. for topic in duck_api_results['RelatedTopics']:
  175. output = '%s - %s' % (topic['Name'], topic['Topics'][0]['FirstURL'])
  176. if 'duckduckgo.com' in output:
  177. ## as a last resort, DuckDuckGo will provide links to the query on its site
  178. ## it doesn't appear to ever return a https URL
  179. output = output.replace('http://', 'https://')
  180. break
  181. else:
  182. ## if we still can't find a search result via the API
  183. ## let's try scraping the html page
  184. uri = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % web.urllib.quote(query)
  185. page = web.get(uri)
  186. r_duck = re.compile(r'nofollow" class="[^"]+" href="(.*?)">')
  187. m = r_duck.findall(page)
  188. output = str()
  189. if m:
  190. for result in m:
  191. if '/y.js?' not in result and '//ad.ddg.gg/' not in result:
  192. ## ignore ads
  193. output = result
  194. break
  195. else:
  196. ## if we absolustely can't find a URL, let's try scraping the HTML
  197. ## page for a zero_click info
  198. output = duck_zero_click_scrape(page)
  199. return duck_sanitize(output)
  200. def min_size(key, dictt):
  201. ## I am lazy
  202. return len(dictt[key]) > 0
  203. def duck_api(query):
  204. '''Send 'query' to DDG's API and return results as a dictionary'''
  205. query = web.urllib.quote(query)
  206. uri = 'https://api.duckduckgo.com/?q=%s&format=json&no_html=1&no_redirect=1&kp=-1' % query
  207. results = web.get(uri)
  208. results = json.loads(web.get(uri))
  209. return results
  210. def duck_zero_click_api(query):
  211. output = list()
  212. header = 'Zero Click: '
  213. results = duck_api(query)
  214. ## look for any possible Zero Click answers
  215. if 'Redirect' in results and min_size('Redirect', results):
  216. ## this is used when it is a !bang
  217. output.append(results['Redirect'].strip())
  218. if 'AbstractText' in results and min_size('AbstractText', results):
  219. ## topic summary (with no HTML)
  220. output.append(header + results['AbstractText'].strip())
  221. if 'Answer' in results and min_size('Answer', results):
  222. output.append(header + results['Answer'].strip())
  223. if 'Definition' in results and min_size('Definition', results):
  224. output.append(header + results['Definition'].strip())
  225. if not output:
  226. ## if we can't find anything in the API for Zero-Click
  227. ## give up
  228. return None
  229. return output
  230. def duck(jenni, input):
  231. '''Perform a DuckDuckGo Search and Zero-Click lookup'''
  232. query = input.group(2)
  233. if not query:
  234. return jenni.reply('.ddg what?')
  235. query = query.encode('utf-8')
  236. ## try to find a search result via the API
  237. uri = duck_search(query)
  238. if uri:
  239. jenni.say(uri)
  240. if not hasattr(jenni, 'last_seen_uri'):
  241. jenni.bot.last_seen_uri = dict()
  242. jenni.bot.last_seen_uri[input.sender] = uri
  243. ## try to find any Zero-Click stuff
  244. result = duck_zero_click_api(query)
  245. ## loop through zero-click results
  246. if result and len(result) >= 1:
  247. k = 0
  248. for each in result:
  249. if len(each) > 0:
  250. jenni.say(remove_spaces(each))
  251. k += 1
  252. if k > 3:
  253. ## only show 3 zero-click results
  254. ## we don't want to be too spammy
  255. break
  256. ## if we didn't get a search result
  257. ## nor did we get a Zero-Click result
  258. ## fail
  259. if not uri and (not result or not len(result) >= 1):
  260. return jenni.reply("No results found for '%s'." % query)
  261. duck.commands = ['duck', 'ddg']
  262. def search(jenni, input):
  263. if not input.group(2):
  264. return jenni.reply('.search for what?')
  265. query = input.group(2).encode('utf-8')
  266. gu = google_search(query) or '-'
  267. bu = bing_search(query) or '-'
  268. du = duck_search(query) or '-'
  269. if (gu == bu) and (bu == du):
  270. result = '%s (g, b, d)' % gu
  271. elif (gu == bu):
  272. result = '%s (g, b), %s (d)' % (gu, du)
  273. elif (bu == du):
  274. result = '%s (b, d), %s (g)' % (bu, gu)
  275. elif (gu == du):
  276. result = '%s (g, d), %s (b)' % (gu, bu)
  277. else:
  278. if len(gu) > 250: gu = '(extremely long link)'
  279. if len(bu) > 150: bu = '(extremely long link)'
  280. if len(du) > 150: du = '(extremely long link)'
  281. result = '%s (g), %s (b), %s (d)' % (gu, bu, du)
  282. jenni.reply(result)
  283. search.commands = ['search']
  284. def suggest(jenni, input):
  285. if not input.group(2):
  286. return jenni.reply("No query term.")
  287. query = input.group(2).encode('utf-8')
  288. uri = 'http://websitedev.de/temp-bin/suggest.pl?q='
  289. answer = web.get(uri + web.urllib.quote(query).replace('+', '%2B'))
  290. if answer:
  291. jenni.say(answer)
  292. else: jenni.reply('Sorry, no result.')
  293. suggest.commands = ['suggest']
  294. if __name__ == '__main__':
  295. print __doc__.strip()