PageRenderTime 22ms CodeModel.GetById 81ms app.highlight 41ms RepoModel.GetById 39ms app.codeStats 0ms

/modules/search.py

http://github.com/myano/jenni
Python | 230 lines | 176 code | 24 blank | 30 comment | 34 complexity | 9f63d9a8b0b0c518681232a2f9966c37 MD5 | raw file
  1#!/usr/bin/env python
  2"""
  3search.py - jenni Web Search Module
  4Copyright 2009-2013, yano (yanovich.net)
  5Copyright 2013, Edward Powell (embolalia.net)
  6Copyright 2008-2013 Sean B. Palmer (inamidst.com)
  7Licensed under the Eiffel Forum License 2.
  8
  9More info:
 10 * jenni: https://github.com/myano/jenni/
 11 * Phenny: http://inamidst.com/phenny/
 12"""
 13
 14import json
 15import re
 16import urllib
 17import web
 18from modules import proxy
 19
## matches one markup tag: "<" not followed by "!", then everything up to the
## next ">" (so "<!...>" constructs such as comments are left alone)
r_tag = re.compile(r'<(?!!)[^>]+>')
## captures the href value of a link that directly follows an <h2> tag;
## bing_search() runs this against the Bing results page
r_bing = re.compile(r'<h2><a href="([^"]+)"')
 22
 23
 24def remove_spaces(x):
 25    if '  ' in x:
 26        x = x.replace('  ', ' ')
 27        return remove_spaces(x)
 28    else:
 29        return x
 30
 31
 32def bing_search(query, lang='en-GB'):
 33    query = web.urllib.quote(query)
 34    base = 'https://www.bing.com/search?mkt=%s&q=' % lang
 35    page = proxy.get(base + query)
 36    m = r_bing.search(page)
 37    if m: return m.group(1)
 38
 39
 40def bing(jenni, input):
 41    """Queries Bing for the specified input."""
 42    query = input.group(2)
 43    if query.startswith(':'):
 44        lang, query = query.split(' ', 1)
 45        lang = lang[1:]
 46    else: lang = 'en-GB'
 47    if not query:
 48        return jenni.reply('.bing what?')
 49
 50    query = query.encode('utf-8')
 51    uri = bing_search(query, lang)
 52    if uri:
 53        jenni.say(uri)
 54        if not hasattr(jenni, 'last_seen_uri'):
 55            jenni.last_seen_uri = {}
 56        jenni.last_seen_uri[input.sender] = uri
 57    else: jenni.reply("No results found for '%s'." % query)
 58bing.commands = ['bing']
 59bing.example = '.bing swhack'
 60
 61
 62def duck_sanitize(incoming):
 63    return web.decode((incoming).decode('utf-8'))
 64
 65
 66def duck_zero_click_scrape(html):
 67    '''Scrape DDG HTML page for Zero-Click'''
 68    try:
 69        ## prefer to use BeautifulSoup
 70        from BeautifulSoup import BeautifulSoup
 71    except:
 72        ## if BS is not available, just fail out here
 73        return str()
 74
 75    soup = BeautifulSoup(html)
 76    zero_click = str()
 77    if soup('div', {'class': 'zero-click-result'}):
 78        zero_click = str(soup('div', {'class': 'zero-click-result'})[0])
 79    output = r_tag.sub('', zero_click).strip()
 80    output = output.replace('\n', '').replace('\t', '')
 81    output = remove_spaces(output)
 82    return output
 83
 84
 85def duck_search(query):
 86    '''Do a DuckDuckGo Search'''
 87
 88    ## grab results from the API for the query
 89    duck_api_results = duck_api(query)
 90
 91    ## output is a string of the URL result
 92
 93    ## try to find the first result
 94    if 'Results' in duck_api_results and min_size('Results', duck_api_results):
 95        ## 'Results' is the most common place to look for the first result
 96        output = duck_api_results['Results'][0]['FirstURL']
 97    elif 'AbstractURL' in duck_api_results and min_size('AbstractURL', duck_api_results):
 98        ## if there is no 'result', let's try AbstractURL
 99        ## this is usually a wikipedia article
100        output = duck_api_results['AbstractURL']
101    elif 'RelatedTopics' in duck_api_results and min_size('RelatedTopics', duck_api_results):
102        ## if we still can't find a search result, let's grab a topic URL
103        ## this is usually vaguely related to the search query
104        ## many times this is a wikipedia result
105        for topic in duck_api_results['RelatedTopics']:
106            output = '%s - %s' % (topic['Name'], topic['Topics'][0]['FirstURL'])
107            if 'duckduckgo.com' in output:
108                ## as a last resort, DuckDuckGo will provide links to the query on its site
109                ## it doesn't appear to ever return a https URL
110                output = output.replace('http://', 'https://')
111            break
112    else:
113        ## if we still can't find a search result via the API
114        ## let's try scraping the html page
115        uri = 'https://duckduckgo.com/html/?q=%s&kl=us-en&kp=-1' % web.urllib.quote(query)
116        page = proxy.get(uri)
117
118        r_duck = re.compile(r'nofollow" class="[^"]+" href="(.*?)">')
119
120        bad_results = ['/y.js?', '//ad.ddg.gg/', '.msn.com/', 'r.search.yahoo.com/',]
121        m = r_duck.findall(page)
122        output = str()
123        if m:
124            for result in m:
125                valid_result = True
126                for each in bad_results:
127                    if each in result:
128                        valid_result = False
129                if valid_result:
130                    output = result
131                    break
132        else:
133            ## if we absolustely can't find a URL, let's try scraping the HTML
134            ## page for a zero_click info
135            return((duck_zero_click_scrape(page), False))
136
137    return((duck_sanitize(output), True))
138
def min_size(key, dictt):
    '''Tell whether the value stored under ``key`` in ``dictt`` is non-empty

    Raises KeyError when ``key`` is absent.
    '''
    value = dictt[key]
    return len(value) != 0
142
143
def duck_api(query):
    '''Ask DDG's JSON API about ``query`` and return the parsed response'''
    ## NOTE: query goes into the URL exactly as given; it is not
    ## percent-encoded here
    endpoint = 'https://api.duckduckgo.com/?q=%s&format=json&no_html=1&no_redirect=1&kp=-1'
    raw = proxy.get(endpoint % query)
    return json.loads(raw)
151
152
def duck_zero_click_api(query):
    '''Collect Zero-Click answers for ``query`` from the DDG API

    Returns a list of stripped strings in the order Redirect, AbstractText,
    Answer, Definition (only the fields that are present and non-empty), or
    None when none of them is available.
    '''
    header = 'Zero Click: '
    results = duck_api(query)

    ## (API field, text put in front of it), in reporting order
    fields = (
        ('Redirect', ''),            ## used when the query is a !bang
        ('AbstractText', header),    ## topic summary (with no HTML)
        ('Answer', header),
        ('Definition', header),
    )

    output = list()
    for field, prefix in fields:
        if field in results and len(results[field]) > 0:
            output.append(prefix + results[field].strip())

    ## nothing in the API for Zero-Click: give up
    return output or None
173
174
def duck(jenni, input):
    '''Perform a DuckDuckGo Search and Zero-Click lookup

    Says the search result link (if any), then up to three Zero-Click
    answers. An existing ``jenni.last_seen_uri`` entry for the sender is
    updated with the link, and with the Zero-Click answer when there is
    exactly one. Replies with a "no results" message when neither lookup
    produced anything, and with a usage hint when no query was given.
    '''
    query = input.group(2)
    if not query:
        return jenni.reply('.ddg what?')

    ## try to find a search result via the API
    ## (the second tuple member, the is-a-URL flag, is not needed here)
    uri, _ = duck_search(query)
    if uri:
        jenni.say(uri)
        if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
            jenni.last_seen_uri[input.sender] = uri

    ## try to find any Zero-Click stuff (None or a non-empty list)
    result = duck_zero_click_api(query)

    if result and len(result) == 1:
        if hasattr(jenni, 'last_seen_uri') and input.sender in jenni.last_seen_uri:
            jenni.last_seen_uri[input.sender] = result[0]

    ## only show 3 zero-click results, we don't want to be too spammy
    ## (the previous "k > 3" test stopped one line too late and allowed 4)
    max_shown = 3
    shown = 0
    for each in result or []:
        if shown >= max_shown:
            break
        if len(each) > 0:
            jenni.say(remove_spaces(each))
            shown += 1

    ## if we didn't get a search result
    ## nor did we get a Zero-Click result
    ## fail
    if not uri and not result:
        return jenni.reply("No results found for '%s'." % query)
duck.commands = ['duck', 'ddg', 'g', 'search']
216
217
def suggest(jenni, input):
    '''Look up a suggestion for the given term and say it

    Replies with a hint when no term was given and with an apology when the
    lookup returns nothing.
    '''
    term = input.group(2)
    if not term:
        return jenni.reply("No query term.")

    encoded = web.urllib.quote(term.encode('utf-8')).replace('+', '%2B')
    answer = web.get('http://websitedev.de/temp-bin/suggest.pl?q=' + encoded)
    if not answer:
        jenni.reply('Sorry, no result.')
        return
    jenni.say(answer)
suggest.commands = ['suggest']
228
if __name__ == '__main__':
    ## run as a script: show the module docstring
    ## the parenthesised single-argument form prints the same text under
    ## Python 2 and, unlike the print statement, also parses under Python 3
    print(__doc__.strip())