jenni /modules/search.py

Language Python Lines 340
MD5 Hash 808e1da7516ae43f76dc4a423926367c Estimated Cost $6,170 (why?)
Repository git://github.com/myano/jenni.git View Raw File
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#!/usr/bin/env python
"""
search.py - jenni Web Search Module
Copyright 2009-2013, Michael Yanovich (yanovich.net)
Copyright 2013, Edward Powell (embolalia.net)
Copyright 2008-2013 Sean B. Palmer (inamidst.com)
Licensed under the Eiffel Forum License 2.

More info:
 * jenni: https://github.com/myano/jenni/
 * Phenny: http://inamidst.com/phenny/
"""

import json
import re
import urllib
import web

# Matches one markup tag: '<', not immediately followed by '!', then everything
# up to the next '>'. Used to strip tags from scraped HTML fragments.
r_tag = re.compile(r'<(?!!)[^>]+>')


def remove_spaces(x):
    """Collapse every run of consecutive spaces in ``x`` into a single space.

    Only the space character is affected; tabs and newlines are left alone.
    """
    # Each pass shortens the remaining runs; repeat until no pair is left.
    while '  ' in x:
        x = x.replace('  ', ' ')
    return x


class Grab(web.urllib.URLopener):
    """URL opener that presents itself with browser-like request headers.

    google_ajax() temporarily installs an instance of this class as
    ``web.urllib._urlopener`` while it fetches search results.
    """
    def __init__(self, *args):
        # The version string is assigned before the base initialiser runs.
        # NOTE(review): URLopener presumably derives its User-Agent header from
        # self.version inside __init__ -- confirm before reordering these lines.
        self.version = 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0'
        web.urllib.URLopener.__init__(self, *args)
        self.addheader('Referer', 'https://github.com/myano/jenni')
        self.addheader('Accept', '*/*')
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Return the error response wrapped in an addinfourl object.

        NOTE(review): the stock handler presumably raises for HTTP errors;
        returning the response lets the caller read the body instead -- confirm.
        """
        return web.urllib.addinfourl(fp, [headers, errcode], "http:" + url)

def google_ajax(query):
    """Search using AjaxSearch, and return its JSON.

    ``query`` may be a byte string or a unicode string; unicode is encoded as
    UTF-8 before being URL-quoted. Returns the decoded JSON response.
    Any exception raised by the fetch or by JSON decoding propagates.
    """
    if isinstance(query, unicode):
        query = query.encode('utf-8')
    uri = 'https://ajax.googleapis.com/ajax/services/search/web'
    args = '?v=1.0&safe=off&q=' + web.urllib.quote(query)
    # Swap in the browser-like opener for this one request only.
    handler = web.urllib._urlopener
    web.urllib._urlopener = Grab()
    try:
        response = web.get(uri + args)
    finally:
        # Restore the previous opener even when the fetch raises; otherwise
        # the module-wide opener would stay replaced for every later request.
        web.urllib._urlopener = handler
    return json.loads(response)

def google_search(query):
    """Return the URL of the first Google result for ``query``.

    Returns:
        The first result's 'unescapedUrl' on success.
        None when the result list is empty.
        False when the response does not have the expected structure; the
        raw response is printed to stdout to help diagnose it.
    """
    results = google_ajax(query)
    try:
        return results['responseData']['results'][0]['unescapedUrl']
    except IndexError:
        # The result list exists but is empty.
        return None
    except (TypeError, KeyError):
        # TypeError: some level is not subscriptable (e.g. a null value).
        # KeyError: an expected key is absent from the response.
        # print(...) with one argument behaves the same on Python 2 and 3.
        print(results)
        return False

def google_count(query):
    """Return Google's estimated result count for ``query``.

    The value is returned exactly as found under
    responseData -> cursor -> estimatedResultCount. The string '0' is
    returned when any level of that path is missing or empty.
    """
    results = google_ajax(query)
    # ``or {}`` also covers a level that is present but null, which would
    # otherwise fail on the next lookup.
    data = results.get('responseData') or {}
    cursor = data.get('cursor') or {}
    return cursor.get('estimatedResultCount', '0')

def formatnumber(n):
    """Format a number with beautiful commas.

    ``n`` may be a number or a string of digits; it is converted with str().
    Commas are inserted every three digits of the integer part only, so a
    leading sign and any fractional part are left untouched
    (-1234 -> '-1,234', 1234.5 -> '1,234.5'). An empty string yields ''.
    """
    text = str(n)

    # Set the sign aside so it is never counted as a digit.
    sign = ''
    if text[:1] in ('-', '+'):
        sign, text = text[0], text[1:]

    # Only the part before the decimal point gets grouped.
    whole, point, fraction = text.partition('.')

    groups = []
    while len(whole) > 3:
        groups.insert(0, whole[-3:])
        whole = whole[:-3]
    groups.insert(0, whole)

    return sign + ','.join(groups) + point + fraction

def g(jenni, input):
    """Reply with the first Google hit for the text given after the command."""
    terms = input.group(2)
    if not terms:
        return jenni.reply('.g what?')

    terms = terms.encode('utf-8')
    link = google_search(terms)

    if link is False:
        # google_search() uses False to signal an unusable response.
        jenni.reply("Problem getting data from Google.")
    elif not link:
        jenni.reply("No results found for '%s'." % terms)
    else:
        jenni.reply(link)
        # Remember the link per channel/sender for other modules.
        if not hasattr(jenni, 'last_seen_uri'):
            jenni.bot.last_seen_uri = {}
        jenni.bot.last_seen_uri[input.sender] = link
g.commands = ['g']
g.priority = 'high'
g.example = '.g swhack'

def gc(jenni, input):
    """Say how many Google results exist for the text after the command."""
    terms = input.group(2)
    if not terms:
        return jenni.reply('.gc what?')

    terms = terms.encode('utf-8')
    count_text = formatnumber(google_count(terms))
    jenni.say(terms + ': ' + count_text)
gc.commands = ['gc']
gc.priority = 'high'
gc.example = '.gc extrapolate'

# Splits a comparison request into terms. A term is one of:
#   * a double-quoted phrase, optionally prefixed with '+',
#   * a [bracketed] group,
#   * a run of non-whitespace characters.
# Backslash escapes are permitted inside the quoted and bracketed forms.
r_query = re.compile(
    r'\+?"[^"\\]*(?:\\.[^"\\]*)*"|\[[^]\\]*(?:\\.[^]\\]*)*\]|\S+'
)

def gcs(jenni, input):
    """Compare Google result counts for up to six terms, largest first."""
    text = input.group(2)
    if not text:
        return jenni.reply("Nothing to compare.")

    terms = r_query.findall(text)
    if len(terms) > 6:
        return jenni.reply('Sorry, can only compare up to six things.')

    counted = []
    for position, term in enumerate(terms):
        term = term.strip('[]').encode('utf-8')
        raw = formatnumber(google_count(term)) or '0'
        counted.append((int(raw.replace(',', '')), term))
        # Pause between lookups once a few requests have been made;
        # the pause doubles from the fifth term on.
        if position >= 2:
            __import__('time').sleep(0.25)
        if position >= 4:
            __import__('time').sleep(0.25)

    # Highest count first (ties fall back to the term itself).
    counted.sort()
    counted.reverse()
    reply = ', '.join(
        '%s (%s)' % (term, formatnumber(hits)) for (hits, term) in counted
    )
    jenni.say(reply)
gcs.commands = ['gcs', 'comp']

# Captures the href of an <a> that directly follows an <h3> tag.
# NOTE(review): presumably the markup Bing used for result titles -- confirm
# against a current results page.
r_bing = re.compile(r'<h3><a href="([^"]+)"')

def bing_search(query, lang='en-GB'):
    """Return the first result URL found on Bing's results page, or None.

    Args:
        query: search terms (byte string, or unicode which is UTF-8 encoded).
        lang: value for Bing's ``mkt`` parameter, e.g. 'en-GB'.

    Both values are URL-quoted. ``lang`` can come straight from user input,
    so it must not be able to inject extra query parameters.
    """
    # urllib.quote on Python 2 cannot handle non-ASCII unicode; encode first.
    if isinstance(query, unicode):
        query = query.encode('utf-8')
    if isinstance(lang, unicode):
        lang = lang.encode('utf-8')

    target = 'http://www.bing.com/search?mkt=%s&q=%s' % (
        web.urllib.quote(lang), web.urllib.quote(query))
    page = web.get(target)
    match = r_bing.search(page)
    if match:
        return match.group(1)
    return None

def bing(jenni, input):
    """Queries Bing for the specified input.

    An optional leading ':xx-YY' token selects the Bing market, e.g.
    '.bing :fr-FR chat'; otherwise 'en-GB' is used. Replies with the usage
    hint when no search terms are given, including when only a market
    token was supplied.
    """
    query = input.group(2)
    lang = 'en-GB'
    # Check that query exists before touching it: the command may have been
    # issued with no argument at all.
    if query and query.startswith(':'):
        # partition() tolerates a market token with nothing after it.
        prefix, _, query = query.partition(' ')
        lang = prefix[1:] or 'en-GB'
    if not query:
        return jenni.reply('.bing what?')

    query = query.encode('utf-8')
    uri = bing_search(query, lang)
    if uri:
        jenni.reply(uri)
        if not hasattr(jenni, 'last_seen_uri'):
            jenni.bot.last_seen_uri = {}
        jenni.bot.last_seen_uri[input.sender] = uri
    else:
        jenni.reply("No results found for '%s'." % query)
bing.commands = ['bing']
bing.example = '.bing swhack'


def duck_sanitize(incoming):
    '''Return ``incoming`` as unicode text passed through web.decode().

    Byte strings are decoded as UTF-8. Text that is already unicode (for
    example values taken from the JSON API) is used as-is: on Python 2,
    calling .decode() on unicode first encodes it as ASCII and fails on any
    non-ASCII character.

    NOTE(review): web.decode presumably un-escapes HTML entities -- confirm
    in the web module.
    '''
    if not isinstance(incoming, unicode):
        incoming = incoming.decode('utf-8')
    return web.decode(incoming)


def duck_zero_click_scrape(html):
    '''Scrape DDG HTML page for Zero-Click

    Returns the text of the first <div class="zero-click-result"> with tags,
    newlines and tabs removed and runs of spaces collapsed. Returns an empty
    string when there is no such element or BeautifulSoup is not installed.
    '''
    try:
        ## prefer to use BeautifulSoup
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        ## if BS is not available, just fail out here
        ## (only a missing module is tolerated; other errors should surface)
        return str()

    soup = BeautifulSoup(html)
    boxes = soup('div', {'class': 'zero-click-result'})
    zero_click = str(boxes[0]) if boxes else str()
    output = r_tag.sub('', zero_click).strip()
    output = output.replace('\n', '').replace('\t', '')
    output = remove_spaces(output)
    return output


def _duck_api_url(results):
    '''Pick a result URL out of a DDG API response dict, or '' if none.'''
    ## 'Results' is the most common place to look for the first result
    hits = results.get('Results')
    if hits:
        return hits[0]['FirstURL']

    ## if there is no 'result', let's try AbstractURL
    ## this is usually a wikipedia article
    abstract = results.get('AbstractURL')
    if abstract:
        return abstract

    ## if we still can't find a search result, let's grab a topic URL
    ## this is usually vaguely related to the search query
    for topic in results.get('RelatedTopics') or []:
        if topic.get('FirstURL'):
            ## plain entry: carries its URL directly
            output = topic['FirstURL']
        elif topic.get('Topics'):
            ## named group of entries: use the first member
            output = '%s - %s' % (topic['Name'], topic['Topics'][0]['FirstURL'])
        else:
            ## unrecognised shape; try the next entry
            continue
        if 'duckduckgo.com' in output:
            ## as a last resort, DuckDuckGo will provide links to the query on its site
            ## it doesn't appear to ever return a https URL
            output = output.replace('http://', 'https://')
        return output

    return str()


def _duck_html_url(query):
    '''Scrape DDG's HTML results page for a URL or, failing that, Zero-Click text.'''
    uri = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % web.urllib.quote(query)
    page = web.get(uri)
    links = re.findall(r'nofollow" class="[^"]+" href="(.*?)">', page)
    if not links:
        ## if we absolutely can't find a URL, let's try scraping the HTML
        ## page for a zero_click info
        return duck_zero_click_scrape(page)
    for link in links:
        if '/y.js?' not in link and '//ad.ddg.gg/' not in link:
            ## ignore ads
            return link
    return str()


def duck_search(query):
    '''Do a DuckDuckGo Search

    Looks for a result URL in the API response first ('Results', then
    'AbstractURL', then 'RelatedTopics'). If the API yields nothing usable,
    falls back to scraping the HTML results page. The chosen string is
    passed through duck_sanitize() before being returned; it may be empty
    when nothing was found.
    '''
    ## grab results from the API for the query
    output = _duck_api_url(duck_api(query))
    if not output:
        ## if we still can't find a search result via the API
        ## let's try scraping the html page
        output = _duck_html_url(query)
    return duck_sanitize(output)

def min_size(key, dictt):
    '''Return True when the value stored under ``key`` in ``dictt`` is non-empty.

    The value must support len(); a missing key raises KeyError.
    '''
    entry = dictt[key]
    return 0 < len(entry)

def duck_api(query):
    '''Send 'query' to DDG's API and return results as a dictionary

    The query is URL-quoted and the JSON response is decoded. The API is
    requested exactly once per call.
    '''
    query = web.urllib.quote(query)
    uri = 'https://api.duckduckgo.com/?q=%s&format=json&no_html=1&no_redirect=1&kp=-1' % query
    return json.loads(web.get(uri))

def duck_zero_click_api(query):
    '''Collect Zero-Click answers for ``query`` from the DDG API.

    Returns a list of stripped strings in the order Redirect, AbstractText,
    Answer, Definition (only the non-empty ones), or None when the API
    offered none of them. Every entry except the redirect is prefixed with
    'Zero Click: '.
    '''
    prefix = 'Zero Click: '
    api = duck_api(query)

    ## (field name, text placed in front of the value)
    fields = (
        ('Redirect', None),        ## this is used when it is a !bang
        ('AbstractText', prefix),  ## topic summary (with no HTML)
        ('Answer', prefix),
        ('Definition', prefix),
    )

    found = list()
    for name, lead in fields:
        if name not in api or not len(api[name]) > 0:
            continue
        value = api[name].strip()
        found.append(value if lead is None else lead + value)

    ## if we can't find anything in the API for Zero-Click, give up
    if not found:
        return None
    return found

def duck(jenni, input):
    '''Perform a DuckDuckGo Search and Zero-Click lookup

    Says the search result (if any), then up to three non-empty Zero-Click
    answers. Replies with a "no results" message when neither produced
    anything to show.
    '''
    query = input.group(2)
    if not query:
        return jenni.reply('.ddg what?')

    query = query.encode('utf-8')

    ## try to find a search result via the API
    uri = duck_search(query)
    if uri:
        jenni.say(uri)
        if not hasattr(jenni, 'last_seen_uri'):
            jenni.bot.last_seen_uri = dict()
        jenni.bot.last_seen_uri[input.sender] = uri

    ## try to find any Zero-Click stuff
    result = duck_zero_click_api(query)

    ## only show 3 zero-click results
    ## we don't want to be too spammy
    answers = [each for each in (result or []) if len(each) > 0][:3]
    for each in answers:
        jenni.say(remove_spaces(each))

    ## if we didn't get a search result
    ## nor did we show a Zero-Click result
    ## fail
    if not uri and not answers:
        return jenni.reply("No results found for '%s'." % query)
duck.commands = ['duck', 'ddg']

def search(jenni, input):
    """Query Google, Bing and DuckDuckGo and reply with their top links.

    Engines that agree on a link are grouped together in the reply; an
    engine with no result is shown as '-'.
    """
    terms = input.group(2)
    if not terms:
        return jenni.reply('.search for what?')
    query = terms.encode('utf-8')

    g_hit = google_search(query) or '-'
    b_hit = bing_search(query) or '-'
    d_hit = duck_search(query) or '-'

    same_gb = g_hit == b_hit
    same_bd = b_hit == d_hit
    same_gd = g_hit == d_hit

    if same_gb and same_bd:
        result = '%s (g, b, d)' % g_hit
    elif same_gb:
        result = '%s (g, b), %s (d)' % (g_hit, d_hit)
    elif same_bd:
        result = '%s (b, d), %s (g)' % (b_hit, g_hit)
    elif same_gd:
        result = '%s (g, d), %s (b)' % (g_hit, b_hit)
    else:
        # Three different links: replace over-long ones with a placeholder.
        if len(g_hit) > 250:
            g_hit = '(extremely long link)'
        if len(b_hit) > 150:
            b_hit = '(extremely long link)'
        if len(d_hit) > 150:
            d_hit = '(extremely long link)'
        result = '%s (g), %s (b), %s (d)' % (g_hit, b_hit, d_hit)

    jenni.reply(result)
search.commands = ['search']

def suggest(jenni, input):
    """Say the suggestion returned by the websitedev.de service for the query."""
    terms = input.group(2)
    if not terms:
        return jenni.reply("No query term.")

    encoded = web.urllib.quote(terms.encode('utf-8')).replace('+', '%2B')
    answer = web.get('http://websitedev.de/temp-bin/suggest.pl?q=' + encoded)
    if not answer:
        jenni.reply('Sorry, no result.')
    else:
        jenni.say(answer)
suggest.commands = ['suggest']

if __name__ == '__main__':
    # Run directly: just show the module's docstring.
    # The parenthesised single-argument form prints the same on Python 2 and 3.
    print(__doc__.strip())
Back to Top