/theHarvester/discovery/duckduckgosearch.py

https://github.com/laramies/theHarvester
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import json


class SearchDuckDuckGo:

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = 'https://duckduckgo.com/?q='
        self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1'  # Currently using API.
        self.quantity = '100'
        self.limit = limit
        self.proxy = False

    async def do_search(self):
        # Do normal scraping.
        url = self.api.replace('x', self.word)  # Substitute the search term for the 'x' placeholder.
        headers = {'User-Agent': googleUA}
        first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy)
        self.results = first_resp[0]
        self.totalresults += self.results
        urls = await self.crawl(self.results)
        urls = {url for url in urls if len(url) > 5}  # Drop fragments too short to be real URLs.
        all_resps = await AsyncFetcher.fetch_all(urls)
        self.totalresults += ''.join(all_resps)

    async def crawl(self, text):
        """
        Parse the JSON response and return URLs.
        :param text: formatted json
        :return: set of URLs
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # Iterate through keys of dict.
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict) or val is None:
                    continue
                if isinstance(val, list):
                    if len(val) == 0:  # Make sure not indexing an empty list.
                        continue
                    val = val[0]  # First value should be dict.
                    if isinstance(val, dict):  # Sanity check.
                        for value in val.values():
                            if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                                urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # Format is <a href="https://www.website.com"/>
                    start = url.index('=') + 1
                    if start < len(url) and url[start] == '"':  # Skip the opening quote so it isn't captured.
                        start += 1
                    true_url = ''
                    for ch in url[start:]:
                        if ch == '"':
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print(f'Exception occurred: {e}')
            return set()  # Empty set, not a list, so the return type matches the docstring.

    async def get_emails(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return await rawres.emails()

    async def get_hostnames(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return await rawres.hostnames()

    async def process(self, proxy=False):
        self.proxy = proxy
        await self.do_search()  # Only need to search once since using API.
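
For reference, a minimal driver showing how this class is typically exercised end to end. The event-loop setup below is a sketch based on the async methods in this file; the query term and limit are placeholder values, not something this module defines.

# Minimal usage sketch (assumes theHarvester and its dependencies are importable).
import asyncio
from theHarvester.discovery.duckduckgosearch import SearchDuckDuckGo


async def main():
    search = SearchDuckDuckGo('example.com', limit=100)  # Placeholder target and limit.
    await search.process(proxy=False)  # One API search; responses accumulate in totalresults.
    print(await search.get_emails())  # E-mail addresses parsed from the aggregated responses.
    print(await search.get_hostnames())  # Hostnames parsed from the aggregated responses.

asyncio.run(main())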