PageRenderTime 777ms CodeModel.GetById 107ms app.highlight 583ms RepoModel.GetById 82ms app.codeStats 0ms

/modules/search.py

https://github.com/myano/jenni
Python | 344 lines | 320 code | 10 blank | 14 comment | 13 complexity | f14baf24e26b48fc58a0add018120fe8 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3search.py - jenni Web Search Module
  4Copyright 2009-2013, Michael Yanovich (yanovich.net)
  5Copyright 2013, Edward Powell (embolalia.net)
  6Copyright 2008-2013 Sean B. Palmer (inamidst.com)
  7Licensed under the Eiffel Forum License 2.
  8
  9More info:
 10 * jenni: https://github.com/myano/jenni/
 11 * Phenny: http://inamidst.com/phenny/
 12"""
 13
 14import json
 15import re
 16import urllib
 17import web
 18
 19r_tag = re.compile(r'<(?!!)[^>]+>')
 20
 21
 22def remove_spaces(x):
 23    if '  ' in x:
 24        x = x.replace('  ', ' ')
 25        return remove_spaces(x)
 26    else:
 27        return x
 28
 29
 30class Grab(web.urllib.URLopener):
 31    def __init__(self, *args):
 32        self.version = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
 33        web.urllib.URLopener.__init__(self, *args)
 34        self.addheader('Referer', 'https://github.com/myano/jenni')
 35        self.addheader('Accept', '*/*')
 36    def http_error_default(self, url, fp, errcode, errmsg, headers):
 37        return web.urllib.addinfourl(fp, [headers, errcode], "http:" + url)
 38
 39def google_ajax(query):
 40    """Search using AjaxSearch, and return its JSON."""
 41    if isinstance(query, unicode):
 42        query = query.encode('utf-8')
 43    uri = 'https://ajax.googleapis.com/ajax/services/search/web'
 44    args = '?v=1.0&safe=off&q=' + web.urllib.quote(query)
 45    handler = web.urllib._urlopener
 46    web.urllib._urlopener = Grab()
 47    bytes = web.get(uri + args)
 48    web.urllib._urlopener = handler
 49    return json.loads(bytes)
 50
 51def google_search(query):
 52    results = google_ajax(query)
 53    try: return results['responseData']['results'][0]['unescapedUrl']
 54    except IndexError: return None
 55    except TypeError:
 56        print results
 57        return False
 58
 59def google_count(query):
 60    results = google_ajax(query)
 61    if not results.has_key('responseData'): return '0'
 62    if not results['responseData'].has_key('cursor'): return '0'
 63    if not results['responseData']['cursor'].has_key('estimatedResultCount'):
 64        return '0'
 65    return results['responseData']['cursor']['estimatedResultCount']
 66
 67def formatnumber(n):
 68    """Format a number with beautiful commas."""
 69    parts = list(str(n))
 70    for i in range((len(parts) - 3), 0, -3):
 71        parts.insert(i, ',')
 72    return ''.join(parts)
 73
 74def g(jenni, input):
 75    """Queries Google for the specified input."""
 76    query = input.group(2)
 77    if not query:
 78        return jenni.reply('.g what?')
 79    query = query.encode('utf-8')
 80    uri = google_search(query)
 81    if uri:
 82        if 'wikipedia.org/' in uri:
 83            uri = uri.replace('http:', 'https:')
 84        jenni.reply(uri)
 85        if not hasattr(jenni, 'last_seen_uri'):
 86            jenni.last_seen_uri = {}
 87        jenni.last_seen_uri[input.sender] = uri
 88    elif uri is False: jenni.reply("Problem getting data from Google.")
 89    else: jenni.reply("No results found for '%s'." % query)
 90g.commands = ['g']
 91g.priority = 'high'
 92g.example = '.g swhack'
 93
 94def gc(jenni, input):
 95    """Returns the number of Google results for the specified input."""
 96    query = input.group(2)
 97    if not query:
 98        return jenni.reply('.gc what?')
 99    query = query.encode('utf-8')
100    num = formatnumber(google_count(query))
101    jenni.say(query + ': ' + num)
102gc.commands = ['gc']
103gc.priority = 'high'
104gc.example = '.gc extrapolate'
105
# Splits a comparison request into individual search terms. The adjacent
# string literals concatenate into one pattern with three alternatives.
r_query = re.compile(
    r'\+?"[^"\\]*(?:\\.[^"\\]*)*"'    # "quoted phrase", optional leading +
    r'|\[[^]\\]*(?:\\.[^]\\]*)*\]'    # [bracketed phrase]
    r'|\S+'                           # any other run of non-whitespace
)
109
def gcs(jenni, input):
    # Compare the estimated Google result counts of up to six terms and say
    # them in descending order. (Comment rather than docstring so the
    # function's __doc__ is unchanged.)
    text = input.group(2)
    if not text:
        return jenni.reply("Nothing to compare.")
    terms = r_query.findall(text)
    if len(terms) > 6:
        return jenni.reply('Sorry, can only compare up to six things.')

    pause = __import__('time').sleep
    counted = []
    for position, term in enumerate(terms):
        term = term.strip('[]').encode('utf-8')
        formatted = formatnumber(google_count(term)) or '0'
        counted.append((int(formatted.replace(',', '')), term))
        # Slow down as the number of requests grows: a quarter second from
        # the third term on, and a further quarter second from the fifth.
        if position >= 2:
            pause(0.25)
        if position >= 4:
            pause(0.25)

    ranked = sorted(counted, reverse=True)
    pieces = ['%s (%s)' % (term, formatnumber(total)) for (total, term) in ranked]
    jenni.say(', '.join(pieces))
gcs.commands = ['gcs', 'comp']
130
# Captures the href of a link that directly follows an <h3> tag; used to
# pull the first result URL out of a Bing results page.
r_bing = re.compile(r'<h3><a href="([^"]+)"')
132
def bing_search(query, lang='en-GB'):
    """Return the first result URL Bing gives for query, or None.

    lang is passed to Bing as the `mkt` parameter.
    """
    quoted = web.urllib.quote(query)
    page = web.get('http://www.bing.com/search?mkt=%s&q=' % lang + quoted)
    match = r_bing.search(page)
    if match is None:
        return None
    return match.group(1)
139
def bing(jenni, input):
    """Queries Bing for the specified input."""
    query = input.group(2)
    lang = 'en-GB'
    # A leading ":xx-YY" token selects the market/language. Check that a
    # query exists first: with no argument group(2) is None and has no
    # startswith. partition (unlike a two-way split) also copes with a
    # language token that is not followed by any search terms.
    if query and query.startswith(':'):
        lang, _, query = query.partition(' ')
        lang = lang[1:]
    if not query:
        return jenni.reply('.bing what?')

    query = query.encode('utf-8')
    uri = bing_search(query, lang)
    if uri:
        jenni.reply(uri)
        # Record the link per sender, creating the store on first use.
        if not hasattr(jenni, 'last_seen_uri'):
            jenni.last_seen_uri = {}
        jenni.last_seen_uri[input.sender] = uri
    else:
        jenni.reply("No results found for '%s'." % query)
bing.commands = ['bing']
bing.example = '.bing swhack'
160
161
def duck_sanitize(incoming):
    """Return incoming as text, passed through web.decode.

    Byte strings are decoded as UTF-8 first. Values that are already text
    (e.g. strings taken from a parsed JSON response) are passed through
    untouched: calling .decode on them would first force an implicit ASCII
    encode and fail on any non-ASCII character.
    """
    if isinstance(incoming, bytes):
        incoming = incoming.decode('utf-8')
    # NOTE(review): web.decode presumably unescapes HTML entities -- confirm
    # against the web module.
    return web.decode(incoming)
164
165
def duck_zero_click_scrape(html):
    '''Scrape DDG HTML page for Zero-Click

    Returns the text of the first div with class "zero-click-result", with
    tags, newlines, tabs and repeated spaces removed. Returns an empty
    string when there is no such div or BeautifulSoup is not installed.
    '''
    try:
        ## prefer to use BeautifulSoup
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        ## if BS is not available, just fail out here
        ## (only ImportError: any other failure should not be hidden)
        return str()

    soup = BeautifulSoup(html)
    matches = soup('div', {'class': 'zero-click-result'})
    zero_click = str(matches[0]) if matches else str()
    output = r_tag.sub('', zero_click).strip()
    output = output.replace('\n', '').replace('\t', '')
    return remove_spaces(output)
183
184
## result links on the DuckDuckGo HTML results page
_r_duck = re.compile(r'nofollow" class="[^"]+" href="(.*?)">')

## substrings that mark a scraped link as an advert
_DUCK_AD_MARKERS = ('/y.js?', '//ad.ddg.gg/', '.msn.com/')


def _duck_related_topic(topics):
    '''Return a link built from the first usable RelatedTopics entry, or "".'''
    for topic in topics:
        if 'Name' in topic and topic.get('Topics'):
            ## grouped entry: a named category holding its own topic list
            first_url = topic['Topics'][0].get('FirstURL')
            if not first_url:
                continue
            output = '%s - %s' % (topic['Name'], first_url)
        elif topic.get('FirstURL'):
            ## flat entry: the topic carries its URL directly
            output = topic['FirstURL']
        else:
            continue
        if 'duckduckgo.com' in output:
            ## as a last resort, DuckDuckGo will provide links to the query on its site
            ## it doesn't appear to ever return a https URL
            output = output.replace('http://', 'https://')
        return output
    return str()


def _duck_scrape(query):
    '''Scrape the DuckDuckGo HTML results page for a link or Zero-Click text.'''
    uri = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % web.urllib.quote(query)
    page = web.get(uri)
    for result in _r_duck.findall(page):
        if not any(marker in result for marker in _DUCK_AD_MARKERS):
            ## first link that is not an ad
            return result
    ## if we absolutely can't find a URL, let's try scraping the HTML
    ## page for a zero_click info
    return duck_zero_click_scrape(page)


def duck_search(query):
    '''Do a DuckDuckGo Search

    Returns a sanitized string: the first result URL if one can be found,
    otherwise whatever the HTML scrape yields (possibly an empty string).
    '''

    ## grab results from the API for the query
    duck_api_results = duck_api(query)

    ## output is a string of the URL result
    output = str()

    ## try to find the first result
    if duck_api_results.get('Results'):
        ## 'Results' is the most common place to look for the first result
        output = duck_api_results['Results'][0]['FirstURL']
    elif duck_api_results.get('AbstractURL'):
        ## if there is no 'result', let's try AbstractURL
        ## this is usually a wikipedia article
        output = duck_api_results['AbstractURL']
    elif duck_api_results.get('RelatedTopics'):
        ## if we still can't find a search result, let's grab a topic URL
        ## this is usually vaguely related to the search query
        ## many times this is a wikipedia result
        output = _duck_related_topic(duck_api_results['RelatedTopics'])

    if not output:
        ## if we still can't find a search result via the API
        ## let's try scraping the html page
        output = _duck_scrape(query)
    return duck_sanitize(output)
231
def min_size(key, dictt):
    """Return True when the value stored under key in dictt is non-empty."""
    value = dictt[key]
    return len(value) > 0
235
def duck_api(query):
    '''Send 'query' to DDG's API and return results as a dictionary'''
    query = web.urllib.quote(query)
    uri = 'https://api.duckduckgo.com/?q=%s&format=json&no_html=1&no_redirect=1&kp=-1' % query
    ## fetch once and parse; a second identical request would only add
    ## latency and load
    return json.loads(web.get(uri))
243
def duck_zero_click_api(query):
    '''Collect Zero-Click answers for query from the DDG API.

    Returns a list of strings, or None when the API offers none.
    '''
    header = 'Zero Click: '
    ## (field, prefix) pairs in the order they are reported
    fields = (
        ('Redirect', ''),          ## this is used when it is a !bang
        ('AbstractText', header),  ## topic summary (with no HTML)
        ('Answer', header),
        ('Definition', header),
    )
    results = duck_api(query)
    ## look for any possible Zero Click answers
    output = [
        prefix + results[field].strip()
        for (field, prefix) in fields
        if field in results and min_size(field, results)
    ]
    ## if we can't find anything in the API for Zero-Click
    ## give up
    return output or None
265
def duck(jenni, input):
    '''Perform a DuckDuckGo Search and Zero-Click lookup'''
    query = input.group(2)
    if not query:
        return jenni.reply('.ddg what?')

    query = query.encode('utf-8')

    ## try to find a search result via the API
    uri = duck_search(query)
    if uri:
        jenni.say(uri)
        ## only updates an existing entry for this sender; it never creates one
        if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
            jenni.last_seen_uri[input.sender] = uri

    ## try to find any Zero-Click stuff
    result = duck_zero_click_api(query)

    if result and len(result) == 1:
        if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
            jenni.last_seen_uri[input.sender] = result[0]

    ## loop through zero-click results
    if result:
        shown = 0
        for each in result:
            if not each:
                continue
            jenni.say(remove_spaces(each))
            shown += 1
            if shown >= 3:
                ## only show 3 zero-click results
                ## we don't want to be too spammy
                break

    ## if we didn't get a search result
    ## nor did we get a Zero-Click result
    ## fail
    if not uri and not result:
        return jenni.reply("No results found for '%s'." % query)
duck.commands = ['duck', 'ddg']
306
def search(jenni, input):
    # Run the query through Google, Bing and DuckDuckGo and reply with the
    # first hit of each, merging engines that agree. (Comment rather than
    # docstring so the function's __doc__ is unchanged.)
    if not input.group(2):
        return jenni.reply('.search for what?')
    term = input.group(2).encode('utf-8')

    # '-' stands in for an engine that returned nothing.
    google_hit = google_search(term) or '-'
    bing_hit = bing_search(term) or '-'
    duck_hit = duck_search(term) or '-'

    if google_hit == bing_hit == duck_hit:
        summary = '%s (g, b, d)' % google_hit
    elif google_hit == bing_hit:
        summary = '%s (g, b), %s (d)' % (google_hit, duck_hit)
    elif bing_hit == duck_hit:
        summary = '%s (b, d), %s (g)' % (bing_hit, google_hit)
    elif google_hit == duck_hit:
        summary = '%s (g, d), %s (b)' % (google_hit, bing_hit)
    else:
        # Three different links: replace very long ones with a placeholder.
        # The Google limit is 250 characters, the other two 150.
        too_long = '(extremely long link)'
        if len(google_hit) > 250:
            google_hit = too_long
        if len(bing_hit) > 150:
            bing_hit = too_long
        if len(duck_hit) > 150:
            duck_hit = too_long
        summary = '%s (g), %s (b), %s (d)' % (google_hit, bing_hit, duck_hit)

    jenni.reply(summary)
search.commands = ['search']
331
def suggest(jenni, input):
    # Ask the websitedev.de suggest service about the query and say whatever
    # it returns. (Comment rather than docstring so the function's __doc__
    # is unchanged.)
    term = input.group(2)
    if not term:
        return jenni.reply("No query term.")
    quoted = web.urllib.quote(term.encode('utf-8')).replace('+', '%2B')
    answer = web.get('http://websitedev.de/temp-bin/suggest.pl?q=' + quoted)
    if not answer:
        jenni.reply('Sorry, no result.')
    else:
        jenni.say(answer)
suggest.commands = ['suggest']
342
if __name__ == '__main__':
    # Run directly, the module only prints its own documentation header.
    print(__doc__.strip())