/modules/search.py
Python | 344 lines | 320 code | 10 blank | 14 comment | 13 complexity | f14baf24e26b48fc58a0add018120fe8 MD5 | raw file
#!/usr/bin/env python
"""
search.py - jenni Web Search Module
Copyright 2009-2013, Michael Yanovich (yanovich.net)
Copyright 2013, Edward Powell (embolalia.net)
Copyright 2008-2013 Sean B. Palmer (inamidst.com)
Licensed under the Eiffel Forum License 2.

More info:
 * jenni: https://github.com/myano/jenni/
 * Phenny: http://inamidst.com/phenny/
"""

import json
import re
import time
import urllib
import web

## matches any markup tag except ones that start with "<!"
r_tag = re.compile(r'<(?!!)[^>]+>')

## result links in the DuckDuckGo HTML results page (see duck_search)
r_duck = re.compile(r'nofollow" class="[^"]+" href="(.*?)">')

## upper bound on the Zero-Click lines `duck` prints for a single query
MAX_ZERO_CLICK_LINES = 3


def remove_spaces(x):
    """Return `x` with every run of two or more spaces collapsed to one."""
    ## a single regex pass; the pattern is written with a quantifier so it
    ## does not depend on a literal double space surviving copy/paste
    return re.sub(r' {2,}', ' ', x)


class Grab(web.urllib.URLopener):
    """URL opener that sends browser-like headers and returns HTTP error
    responses instead of treating them as failures."""

    def __init__(self, *args):
        ## `version` has to be set before the base constructor is called
        ## for it to be used as the user agent
        self.version = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
        web.urllib.URLopener.__init__(self, *args)
        self.addheader('Referer', 'https://github.com/myano/jenni')
        self.addheader('Accept', '*/*')

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Wrap the error response in an addinfourl object and return it.

        The object's header slot holds the list [headers, errcode].
        """
        return web.urllib.addinfourl(fp, [headers, errcode], "http:" + url)


def google_ajax(query):
    """Search using AjaxSearch, and return its JSON.

    `query` may be a unicode or byte string; unicode is encoded as UTF-8
    before being URL-quoted. Returns the decoded JSON response.
    """
    if isinstance(query, unicode):
        query = query.encode('utf-8')
    uri = 'https://ajax.googleapis.com/ajax/services/search/web'
    args = '?v=1.0&safe=off&q=' + web.urllib.quote(query)
    ## temporarily swap the module-wide opener for Grab; the finally clause
    ## guarantees the previous opener is restored even if the request fails
    previous_opener = web.urllib._urlopener
    web.urllib._urlopener = Grab()
    try:
        payload = web.get(uri + args)
    finally:
        web.urllib._urlopener = previous_opener
    return json.loads(payload)


def google_search(query):
    """Return the URL of the first Google result for `query`.

    Returns None when the result list is empty, and False (after printing
    the raw response) when the response does not have the expected shape.
    """
    results = google_ajax(query)
    try:
        return results['responseData']['results'][0]['unescapedUrl']
    except IndexError:
        return None
    except TypeError:
        print(results)
        return False


def google_count(query):
    """Return Google's estimated result count for `query`.

    Returns the string '0' whenever the response lacks the
    responseData -> cursor -> estimatedResultCount path.
    """
    results = google_ajax(query)
    ## any level may be missing or a non-dict (e.g. JSON null), so check
    ## each one before descending
    if not isinstance(results, dict):
        return '0'
    response_data = results.get('responseData')
    if not isinstance(response_data, dict):
        return '0'
    cursor = response_data.get('cursor')
    if not isinstance(cursor, dict):
        return '0'
    return cursor.get('estimatedResultCount', '0')


def formatnumber(n):
    """Format a number with beautiful commas.

    A comma is inserted every three characters counting from the right,
    e.g. formatnumber(1234567) gives '1,234,567'.
    """
    parts = list(str(n))
    for i in range((len(parts) - 3), 0, -3):
        parts.insert(i, ',')
    return ''.join(parts)


def g(jenni, input):
    """Queries Google for the specified input."""
    query = input.group(2)
    if not query:
        return jenni.reply('.g what?')
    query = query.encode('utf-8')
    uri = google_search(query)
    if uri:
        if 'wikipedia.org/' in uri:
            uri = uri.replace('http:', 'https:')
        jenni.reply(uri)
        ## remember the link per sender, creating the mapping on first use
        if not hasattr(jenni, 'last_seen_uri'):
            jenni.last_seen_uri = {}
        jenni.last_seen_uri[input.sender] = uri
    elif uri is False:
        ## google_search returns False when the response was malformed
        jenni.reply("Problem getting data from Google.")
    else:
        jenni.reply("No results found for '%s'." % query)
g.commands = ['g']
g.priority = 'high'
g.example = '.g swhack'


def gc(jenni, input):
    """Returns the number of Google results for the specified input."""
    query = input.group(2)
    if not query:
        return jenni.reply('.gc what?')
    query = query.encode('utf-8')
    num = formatnumber(google_count(query))
    jenni.say(query + ': ' + num)
gc.commands = ['gc']
gc.priority = 'high'
gc.example = '.gc extrapolate'


## splits a comparison request into terms: an optionally "+"-prefixed
## double-quoted phrase, a [bracketed] phrase, or a bare word
r_query = re.compile(
    r'\+?"[^"\\]*(?:\\.[^"\\]*)*"|\[[^]\\]*(?:\\.[^]\\]*)*\]|\S+'
)


def gcs(jenni, input):
    """Compare the Google result counts of up to six terms.

    Says the terms with their counts, largest count first.
    """
    if not input.group(2):
        return jenni.reply("Nothing to compare.")
    queries = r_query.findall(input.group(2))
    if len(queries) > 6:
        return jenni.reply('Sorry, can only compare up to six things.')

    results = []
    for i, query in enumerate(queries):
        query = query.strip('[]')
        query = query.encode('utf-8')
        ## the count is converted straight to an int; a missing or empty
        ## count is treated as zero
        n = int(google_count(query) or 0)
        results.append((n, query))
        ## pause between requests once a few have been sent, and pause
        ## longer after the fourth
        if i >= 2:
            time.sleep(0.25)
        if i >= 4:
            time.sleep(0.25)

    ranked = sorted(results, reverse=True)
    reply = ', '.join('%s (%s)' % (term, formatnumber(n)) for (n, term) in ranked)
    jenni.say(reply)
gcs.commands = ['gcs', 'comp']


r_bing = re.compile(r'<h3><a href="([^"]+)"')


def bing_search(query, lang='en-GB'):
    """Return the first result link found on Bing's results page.

    `lang` is passed as the `mkt` request parameter. Returns None when no
    link matches.
    """
    query = web.urllib.quote(query)
    base = 'http://www.bing.com/search?mkt=%s&q=' % lang
    page = web.get(base + query)
    m = r_bing.search(page)
    if m:
        return m.group(1)


def bing(jenni, input):
    """Queries Bing for the specified input.

    A leading ':xx-YY' token selects the market, e.g. '.bing :de-DE term'.
    """
    query = input.group(2)
    ## check for a missing argument first: group(2) may be None
    if not query:
        return jenni.reply('.bing what?')

    lang = 'en-GB'
    if query.startswith(':'):
        ## partition never raises, unlike unpacking a split that found no
        ## space; a bare ':xx' simply leaves the query empty
        lang, _, query = query.partition(' ')
        lang = lang[1:]
        if not query:
            return jenni.reply('.bing what?')

    query = query.encode('utf-8')
    uri = bing_search(query, lang)
    if uri:
        jenni.reply(uri)
        if not hasattr(jenni, 'last_seen_uri'):
            jenni.last_seen_uri = {}
        jenni.last_seen_uri[input.sender] = uri
    else:
        jenni.reply("No results found for '%s'." % query)
bing.commands = ['bing']
bing.example = '.bing swhack'


def duck_sanitize(incoming):
    """Return `incoming` as text passed through web.decode.

    Byte strings are decoded as UTF-8 first; unicode values are passed on
    unchanged, because calling .decode() on unicode forces an implicit
    ASCII encode that fails on non-ASCII text.
    """
    if not isinstance(incoming, unicode):
        incoming = incoming.decode('utf-8')
    return web.decode(incoming)


def duck_zero_click_scrape(html):
    '''Scrape DDG HTML page for Zero-Click'''
    try:
        ## prefer to use BeautifulSoup
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        ## if BS is not available, just fail out here
        return str()

    soup = BeautifulSoup(html)
    zero_click = str()
    matches = soup('div', {'class': 'zero-click-result'})
    if matches:
        zero_click = str(matches[0])
    ## strip tags, then newlines/tabs, then collapse leftover spacing
    output = r_tag.sub('', zero_click).strip()
    output = output.replace('\n', '').replace('\t', '')
    output = remove_spaces(output)
    return output


def duck_search(query):
    '''Do a DuckDuckGo Search

    Tries the API fields Results, AbstractURL and RelatedTopics in that
    order, then falls back to scraping the HTML results page. The chosen
    string (possibly empty) is returned through duck_sanitize.
    '''

    ## grab results from the API for the query
    duck_api_results = duck_api(query)

    ## output is a string of the URL result

    ## try to find the first result
    if 'Results' in duck_api_results and min_size('Results', duck_api_results):
        ## 'Results' is the most common place to look for the first result
        output = duck_api_results['Results'][0]['FirstURL']
    elif 'AbstractURL' in duck_api_results and min_size('AbstractURL', duck_api_results):
        ## if there is no 'result', let's try AbstractURL
        ## this is usually a wikipedia article
        output = duck_api_results['AbstractURL']
    elif 'RelatedTopics' in duck_api_results and min_size('RelatedTopics', duck_api_results):
        ## if we still can't find a search result, let's grab a topic URL
        ## this is usually vaguely related to the search query
        ## many times this is a wikipedia result
        for topic in duck_api_results['RelatedTopics']:
            output = '%s - %s' % (topic['Name'], topic['Topics'][0]['FirstURL'])
            if 'duckduckgo.com' in output:
                ## as a last resort, DuckDuckGo will provide links to the query on its site
                ## it doesn't appear to ever return a https URL
                output = output.replace('http://', 'https://')
            ## NOTE(review): only the first topic is used; the nesting of
            ## this break was reconstructed -- confirm against upstream
            break
    else:
        ## if we still can't find a search result via the API
        ## let's try scraping the html page
        uri = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % web.urllib.quote(query)
        page = web.get(uri)
        m = r_duck.findall(page)
        output = str()
        if m:
            for result in m:
                if '/y.js?' not in result and '//ad.ddg.gg/' not in result and '.msn.com/' not in result:
                    ## ignore ads
                    output = result
                    break
        else:
            ## if we absolutely can't find a URL, let's try scraping the HTML
            ## page for a zero_click info
            ## NOTE(review): this branch is taken to pair with `if m`;
            ## the nesting was reconstructed -- confirm against upstream
            output = duck_zero_click_scrape(page)
    return duck_sanitize(output)


def min_size(key, dictt):
    """Return True when dictt[key] is non-empty."""
    ## I am lazy
    return len(dictt[key]) > 0


def duck_api(query):
    '''Send 'query' to DDG's API and return results as a dictionary'''
    query = web.urllib.quote(query)
    uri = 'https://api.duckduckgo.com/?q=%s&format=json&no_html=1&no_redirect=1&kp=-1' % query
    ## fetch once; the response body is parsed directly
    return json.loads(web.get(uri))


def duck_zero_click_api(query):
    """Collect Zero-Click answers for `query` from the DDG API.

    Returns a list of strings built from the Redirect, AbstractText,
    Answer and Definition fields (in that order, skipping empty ones), or
    None when none of them has content.
    """
    output = list()
    header = 'Zero Click: '
    results = duck_api(query)
    ## look for any possible Zero Click answers
    if 'Redirect' in results and min_size('Redirect', results):
        ## this is used when it is a !bang
        output.append(results['Redirect'].strip())
    if 'AbstractText' in results and min_size('AbstractText', results):
        ## topic summary (with no HTML)
        output.append(header + results['AbstractText'].strip())
    if 'Answer' in results and min_size('Answer', results):
        output.append(header + results['Answer'].strip())
    if 'Definition' in results and min_size('Definition', results):
        output.append(header + results['Definition'].strip())
    if not output:
        ## if we can't find anything in the API for Zero-Click
        ## give up
        return None

    return output


def duck(jenni, input):
    '''Perform a DuckDuckGo Search and Zero-Click lookup'''
    query = input.group(2)
    if not query:
        return jenni.reply('.ddg what?')

    query = query.encode('utf-8')

    ## try to find a search result via the API
    uri = duck_search(query)
    if uri:
        jenni.say(uri)
        ## only overwrite an entry that already exists for this sender
        if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
            jenni.last_seen_uri[input.sender] = uri

    ## try to find any Zero-Click stuff
    result = duck_zero_click_api(query)

    if result and len(result) == 1:
        if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
            jenni.last_seen_uri[input.sender] = result[0]

    ## loop through zero-click results
    if result:
        shown = 0
        for each in result:
            if not each:
                continue
            jenni.say(remove_spaces(each))
            shown += 1
            if shown >= MAX_ZERO_CLICK_LINES:
                ## only show 3 zero-click results
                ## we don't want to be too spammy
                break

    ## if we didn't get a search result
    ## nor did we get a Zero-Click result
    ## fail
    if not uri and not result:
        return jenni.reply("No results found for '%s'." % query)
duck.commands = ['duck', 'ddg']


def search(jenni, input):
    """Query Google, Bing and DuckDuckGo and reply with their top links.

    Engines that agree are grouped together; an engine with no result is
    shown as '-'.
    """
    if not input.group(2):
        return jenni.reply('.search for what?')
    query = input.group(2).encode('utf-8')
    gu = google_search(query) or '-'
    bu = bing_search(query) or '-'
    du = duck_search(query) or '-'

    if (gu == bu) and (bu == du):
        result = '%s (g, b, d)' % gu
    elif (gu == bu):
        result = '%s (g, b), %s (d)' % (gu, du)
    elif (bu == du):
        result = '%s (b, d), %s (g)' % (bu, gu)
    elif (gu == du):
        result = '%s (g, d), %s (b)' % (gu, bu)
    else:
        ## three different links: replace overly long ones with a placeholder
        if len(gu) > 250: gu = '(extremely long link)'
        if len(bu) > 150: bu = '(extremely long link)'
        if len(du) > 150: du = '(extremely long link)'
        result = '%s (g), %s (b), %s (d)' % (gu, bu, du)

    jenni.reply(result)
search.commands = ['search']


def suggest(jenni, input):
    """Say the suggestion returned by the remote suggest service."""
    if not input.group(2):
        return jenni.reply("No query term.")
    query = input.group(2).encode('utf-8')
    uri = 'http://websitedev.de/temp-bin/suggest.pl?q='
    answer = web.get(uri + web.urllib.quote(query).replace('+', '%2B'))
    if answer:
        jenni.say(answer)
    else:
        jenni.reply('Sorry, no result.')
suggest.commands = ['suggest']


if __name__ == '__main__':
    print(__doc__.strip())