# /modules/search.py
# Python | 230 lines | 176 code | 24 blank | 30 comment | 34 complexity | 9f63d9a8b0b0c518681232a2f9966c37 MD5 | raw file
#!/usr/bin/env python
"""
search.py - jenni Web Search Module
Copyright 2009-2013, yano (yanovich.net)
Copyright 2013, Edward Powell (embolalia.net)
Copyright 2008-2013 Sean B. Palmer (inamidst.com)
Licensed under the Eiffel Forum License 2.

More info:
 * jenni: https://github.com/myano/jenni/
 * Phenny: http://inamidst.com/phenny/
"""

import json
import re
import urllib
import web
from modules import proxy

## any markup tag except ones starting with "<!"
r_tag = re.compile(r'<(?!!)[^>]+>')
## href of the first <h2> link on a Bing results page
r_bing = re.compile(r'<h2><a href="([^"]+)"')
## href of each result link on the DuckDuckGo HTML results page
r_duck = re.compile(r'nofollow" class="[^"]+" href="(.*?)">')
## two or more consecutive spaces
r_spaces = re.compile(r' {2,}')

## scraped DuckDuckGo links containing any of these substrings are skipped
DUCK_BAD_RESULTS = (
    '/y.js?',
    '//ad.ddg.gg/',
    '.msn.com/',
    'r.search.yahoo.com/',
)

## upper bound on the zero-click lines duck() says for a single query
MAX_ZERO_CLICK = 3


def remove_spaces(x):
    """Return *x* with every run of two or more spaces collapsed to one."""
    return r_spaces.sub(' ', x)


def _quote(text):
    """Percent-encode *text* with web.urllib.quote.

    Text that is not already a byte string is encoded as UTF-8 first.
    """
    ## NOTE(review): assumes web.urllib is the standard urllib module, whose
    ## Python 2 quote() raises KeyError on non-ASCII unicode input -- confirm
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    return web.urllib.quote(text)


def bing_search(query, lang='en-GB'):
    """Return the first result URL Bing gives for *query*, or None.

    *lang* is sent as the ``mkt`` parameter of the search URL.
    """
    base = 'https://www.bing.com/search?mkt=%s&q=' % lang
    page = proxy.get(base + _quote(query))
    m = r_bing.search(page)
    if m:
        return m.group(1)
    return None


def bing(jenni, input):
    """Queries Bing for the specified input."""
    query = input.group(2)
    lang = 'en-GB'
    ## the emptiness check has to guard startswith(): group(2) can be None
    if query and query.startswith(':'):
        ## ":xx-YY rest of query" selects a market; partition() copes with
        ## a bare ":xx-YY" that has no search terms after it
        lang, _, query = query.partition(' ')
        lang = lang[1:] or 'en-GB'
    if not query:
        return jenni.reply('.bing what?')

    query = query.encode('utf-8')
    uri = bing_search(query, lang)
    if uri:
        jenni.say(uri)
        if not hasattr(jenni, 'last_seen_uri'):
            jenni.last_seen_uri = {}
        jenni.last_seen_uri[input.sender] = uri
    else:
        jenni.reply("No results found for '%s'." % query)
bing.commands = ['bing']
bing.example = '.bing swhack'


def duck_sanitize(incoming):
    """Return *incoming* as decoded text run through web.decode."""
    ## only byte strings need decoding; calling .decode() on text that is
    ## already unicode (e.g. values from json.loads) makes Python 2 encode it
    ## as ASCII first, which fails on any non-ASCII character
    if isinstance(incoming, bytes):
        incoming = incoming.decode('utf-8')
    return web.decode(incoming)


def duck_zero_click_scrape(html):
    """Scrape DDG HTML page for Zero-Click.

    Returns the text of the first ``div.zero-click-result`` element with
    tags, newlines, tabs and repeated spaces removed, or an empty string
    when there is no such element or BeautifulSoup is not installed.
    """
    try:
        ## prefer to use BeautifulSoup
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        ## if BS is not available, just fail out here
        return str()

    matches = BeautifulSoup(html)('div', {'class': 'zero-click-result'})
    zero_click = str(matches[0]) if matches else str()
    output = r_tag.sub('', zero_click).strip()
    output = output.replace('\n', '').replace('\t', '')
    return remove_spaces(output)


def _duck_api_url(results):
    """Return the best link found in a DDG API response, or '' if none."""
    if min_size('Results', results):
        ## 'Results' is the most common place to look for the first result
        return results['Results'][0]['FirstURL']
    if min_size('AbstractURL', results):
        ## if there is no 'result', let's try AbstractURL
        ## this is usually a wikipedia article
        return results['AbstractURL']
    if min_size('RelatedTopics', results):
        ## if we still can't find a search result, let's grab a topic URL
        ## this is usually vaguely related to the search query
        for topic in results['RelatedTopics']:
            if min_size('FirstURL', topic):
                ## plain topic entry: the link sits on the entry itself
                output = topic['FirstURL']
            elif min_size('Topics', topic) and min_size('FirstURL', topic['Topics'][0]):
                ## grouped entry: a named list of topics
                output = topic['Topics'][0]['FirstURL']
                if min_size('Name', topic):
                    output = '%s - %s' % (topic['Name'], output)
            else:
                continue
            if 'duckduckgo.com' in output:
                ## as a last resort, DuckDuckGo will provide links to the
                ## query on its site; it doesn't appear to ever return a
                ## https URL
                output = output.replace('http://', 'https://')
            return output
    return str()


def _duck_scrape_url(page):
    """Return the first acceptable result link in a DDG HTML page, or ''."""
    for result in r_duck.findall(page):
        if not any(bad in result for bad in DUCK_BAD_RESULTS):
            return result
    return str()


def duck_search(query):
    """Do a DuckDuckGo Search.

    Returns a ``(text, is_url)`` tuple.  *text* is a link taken from the
    API response or, failing that, from the scraped HTML results page, with
    *is_url* True.  When neither yields a link, *text* is whatever
    duck_zero_click_scrape() extracts from the HTML page (possibly an empty
    string) and *is_url* is False.
    """
    ## grab results from the API for the query
    output = _duck_api_url(duck_api(query))

    if not output:
        ## if we still can't find a search result via the API
        ## let's try scraping the html page
        uri = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % _quote(query)
        page = proxy.get(uri)
        output = _duck_scrape_url(page)
        if not output:
            ## if we absolutely can't find a URL, let's try scraping the
            ## HTML page for a zero_click info
            return (duck_zero_click_scrape(page), False)

    return (duck_sanitize(output), True)


def min_size(key, dictt):
    """Return True when *key* is present in *dictt* with a non-empty value."""
    ## truthiness instead of len() so a null or numeric JSON value cannot
    ## raise TypeError
    return key in dictt and bool(dictt[key])


def duck_api(query):
    """Send 'query' to DDG's API and return results as a dictionary"""
    ## NOTE(review): query is interpolated without URL quoting, as it always
    ## has been; whether proxy.get copes with raw spaces or '&' is not
    ## visible from this module -- confirm before relying on such queries
    uri = 'https://api.duckduckgo.com/?q=%s&format=json&no_html=1&no_redirect=1&kp=-1' % query
    return json.loads(proxy.get(uri))


def duck_zero_click_api(query):
    """Return a list of Zero Click answers for *query*, or None if none.

    A 'Redirect' value is returned bare; 'AbstractText', 'Answer' and
    'Definition' values are prefixed with 'Zero Click: '.
    """
    header = 'Zero Click: '
    results = duck_api(query)
    output = list()
    ## look for any possible Zero Click answers
    if min_size('Redirect', results):
        ## this is used when it is a !bang
        output.append(results['Redirect'].strip())
    for key in ('AbstractText', 'Answer', 'Definition'):
        if min_size(key, results):
            output.append(header + results[key].strip())
    ## if we can't find anything in the API for Zero-Click, give up
    return output or None


def _update_last_seen(jenni, sender, value):
    """Overwrite *sender*'s entry in jenni.last_seen_uri if it has one."""
    if hasattr(jenni, 'last_seen_uri') and sender in jenni.last_seen_uri:
        jenni.last_seen_uri[sender] = value


def duck(jenni, input):
    """Perform a DuckDuckGo Search and Zero-Click lookup"""
    query = input.group(2)
    if not query:
        return jenni.reply('.ddg what?')

    ## try to find a search result via the API
    uri, _is_url = duck_search(query)
    if uri:
        jenni.say(uri)
        _update_last_seen(jenni, input.sender, uri)

    ## try to find any Zero-Click stuff
    result = duck_zero_click_api(query)

    if result and len(result) == 1:
        _update_last_seen(jenni, input.sender, result[0])

    ## loop through zero-click results
    shown = 0
    for each in result or []:
        if not each:
            continue
        if shown >= MAX_ZERO_CLICK:
            ## only show 3 zero-click results
            ## we don't want to be too spammy
            break
        jenni.say(remove_spaces(each))
        shown += 1

    ## if we didn't get a search result
    ## nor did we get a Zero-Click result
    ## fail
    if not uri and not result:
        return jenni.reply("No results found for '%s'." % query)
duck.commands = ['duck', 'ddg', 'g', 'search']


def suggest(jenni, input):
    """Say what the websitedev.de suggest service returns for the query."""
    if not input.group(2):
        return jenni.reply("No query term.")
    query = input.group(2).encode('utf-8')
    uri = 'http://websitedev.de/temp-bin/suggest.pl?q='
    answer = web.get(uri + web.urllib.quote(query).replace('+', '%2B'))
    if answer:
        jenni.say(answer)
    else:
        jenni.reply('Sorry, no result.')
suggest.commands = ['suggest']

if __name__ == '__main__':
    print(__doc__.strip())