PageRenderTime 45ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/modules/search.py

http://github.com/myano/jenni
Python | 230 lines | 176 code | 24 blank | 30 comment | 31 complexity | 9f63d9a8b0b0c518681232a2f9966c37 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. search.py - jenni Web Search Module
  4. Copyright 2009-2013, yano (yanovich.net)
  5. Copyright 2013, Edward Powell (embolalia.net)
  6. Copyright 2008-2013 Sean B. Palmer (inamidst.com)
  7. Licensed under the Eiffel Forum License 2.
  8. More info:
  9. * jenni: https://github.com/myano/jenni/
  10. * Phenny: http://inamidst.com/phenny/
  11. """
  12. import json
  13. import re
  14. import urllib
  15. import web
  16. from modules import proxy
  17. r_tag = re.compile(r'<(?!!)[^>]+>')
  18. r_bing = re.compile(r'<h2><a href="([^"]+)"')
  19. def remove_spaces(x):
  20. if ' ' in x:
  21. x = x.replace(' ', ' ')
  22. return remove_spaces(x)
  23. else:
  24. return x
  25. def bing_search(query, lang='en-GB'):
  26. query = web.urllib.quote(query)
  27. base = 'https://www.bing.com/search?mkt=%s&q=' % lang
  28. page = proxy.get(base + query)
  29. m = r_bing.search(page)
  30. if m: return m.group(1)
  31. def bing(jenni, input):
  32. """Queries Bing for the specified input."""
  33. query = input.group(2)
  34. if query.startswith(':'):
  35. lang, query = query.split(' ', 1)
  36. lang = lang[1:]
  37. else: lang = 'en-GB'
  38. if not query:
  39. return jenni.reply('.bing what?')
  40. query = query.encode('utf-8')
  41. uri = bing_search(query, lang)
  42. if uri:
  43. jenni.say(uri)
  44. if not hasattr(jenni, 'last_seen_uri'):
  45. jenni.last_seen_uri = {}
  46. jenni.last_seen_uri[input.sender] = uri
  47. else: jenni.reply("No results found for '%s'." % query)
  48. bing.commands = ['bing']
  49. bing.example = '.bing swhack'
  50. def duck_sanitize(incoming):
  51. return web.decode((incoming).decode('utf-8'))
  52. def duck_zero_click_scrape(html):
  53. '''Scrape DDG HTML page for Zero-Click'''
  54. try:
  55. ## prefer to use BeautifulSoup
  56. from BeautifulSoup import BeautifulSoup
  57. except:
  58. ## if BS is not available, just fail out here
  59. return str()
  60. soup = BeautifulSoup(html)
  61. zero_click = str()
  62. if soup('div', {'class': 'zero-click-result'}):
  63. zero_click = str(soup('div', {'class': 'zero-click-result'})[0])
  64. output = r_tag.sub('', zero_click).strip()
  65. output = output.replace('\n', '').replace('\t', '')
  66. output = remove_spaces(output)
  67. return output
  68. def duck_search(query):
  69. '''Do a DuckDuckGo Search'''
  70. ## grab results from the API for the query
  71. duck_api_results = duck_api(query)
  72. ## output is a string of the URL result
  73. ## try to find the first result
  74. if 'Results' in duck_api_results and min_size('Results', duck_api_results):
  75. ## 'Results' is the most common place to look for the first result
  76. output = duck_api_results['Results'][0]['FirstURL']
  77. elif 'AbstractURL' in duck_api_results and min_size('AbstractURL', duck_api_results):
  78. ## if there is no 'result', let's try AbstractURL
  79. ## this is usually a wikipedia article
  80. output = duck_api_results['AbstractURL']
  81. elif 'RelatedTopics' in duck_api_results and min_size('RelatedTopics', duck_api_results):
  82. ## if we still can't find a search result, let's grab a topic URL
  83. ## this is usually vaguely related to the search query
  84. ## many times this is a wikipedia result
  85. for topic in duck_api_results['RelatedTopics']:
  86. output = '%s - %s' % (topic['Name'], topic['Topics'][0]['FirstURL'])
  87. if 'duckduckgo.com' in output:
  88. ## as a last resort, DuckDuckGo will provide links to the query on its site
  89. ## it doesn't appear to ever return a https URL
  90. output = output.replace('http://', 'https://')
  91. break
  92. else:
  93. ## if we still can't find a search result via the API
  94. ## let's try scraping the html page
  95. uri = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % web.urllib.quote(query)
  96. page = proxy.get(uri)
  97. r_duck = re.compile(r'nofollow" class="[^"]+" href="(.*?)">')
  98. bad_results = ['/y.js?', '//ad.ddg.gg/', '.msn.com/', 'r.search.yahoo.com/',]
  99. m = r_duck.findall(page)
  100. output = str()
  101. if m:
  102. for result in m:
  103. valid_result = True
  104. for each in bad_results:
  105. if each in result:
  106. valid_result = False
  107. if valid_result:
  108. output = result
  109. break
  110. else:
  111. ## if we absolustely can't find a URL, let's try scraping the HTML
  112. ## page for a zero_click info
  113. return((duck_zero_click_scrape(page), False))
  114. return((duck_sanitize(output), True))
  115. def min_size(key, dictt):
  116. ## I am lazy
  117. return len(dictt[key]) > 0
  118. def duck_api(query):
  119. '''Send 'query' to DDG's API and return results as a dictionary'''
  120. #query = web.urllib.quote(query)
  121. uri = 'https://api.duckduckgo.com/?q=%s&format=json&no_html=1&no_redirect=1&kp=-1' % query
  122. results = proxy.get(uri)
  123. results = json.loads(results)
  124. return results
  125. def duck_zero_click_api(query):
  126. output = list()
  127. header = 'Zero Click: '
  128. results = duck_api(query)
  129. ## look for any possible Zero Click answers
  130. if 'Redirect' in results and min_size('Redirect', results):
  131. ## this is used when it is a !bang
  132. output.append(results['Redirect'].strip())
  133. if 'AbstractText' in results and min_size('AbstractText', results):
  134. ## topic summary (with no HTML)
  135. output.append(header + results['AbstractText'].strip())
  136. if 'Answer' in results and min_size('Answer', results):
  137. output.append(header + results['Answer'].strip())
  138. if 'Definition' in results and min_size('Definition', results):
  139. output.append(header + results['Definition'].strip())
  140. if not output:
  141. ## if we can't find anything in the API for Zero-Click
  142. ## give up
  143. return None
  144. return output
  145. def duck(jenni, input):
  146. '''Perform a DuckDuckGo Search and Zero-Click lookup'''
  147. query = input.group(2)
  148. if not query:
  149. return jenni.reply('.ddg what?')
  150. #query = query.encode('utf-8')
  151. #jenni.say('query: ' + query)
  152. ## try to find a search result via the API
  153. uri, only_url = duck_search(query)
  154. if uri:
  155. jenni.say(uri)
  156. if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
  157. jenni.last_seen_uri[input.sender] = uri
  158. ## try to find any Zero-Click stuff
  159. result = duck_zero_click_api(query)
  160. if result and len(result) == 1:
  161. if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
  162. jenni.last_seen_uri[input.sender] = result[0]
  163. ## loop through zero-click results
  164. if result and len(result) >= 1:
  165. k = 0
  166. for each in result:
  167. if len(each) > 0:
  168. jenni.say(remove_spaces(each))
  169. k += 1
  170. if k > 3:
  171. ## only show 3 zero-click results
  172. ## we don't want to be too spammy
  173. break
  174. ## if we didn't get a search result
  175. ## nor did we get a Zero-Click result
  176. ## fail
  177. if not uri and (not result or not len(result) >= 1):
  178. return jenni.reply("No results found for '%s'." % query)
  179. duck.commands = ['duck', 'ddg', 'g', 'search']
  180. def suggest(jenni, input):
  181. if not input.group(2):
  182. return jenni.reply("No query term.")
  183. query = input.group(2).encode('utf-8')
  184. uri = 'http://websitedev.de/temp-bin/suggest.pl?q='
  185. answer = web.get(uri + web.urllib.quote(query).replace('+', '%2B'))
  186. if answer:
  187. jenni.say(answer)
  188. else: jenni.reply('Sorry, no result.')
  189. suggest.commands = ['suggest']
  190. if __name__ == '__main__':
  191. print __doc__.strip()