/theHarvester/discovery/duckduckgosearch.py

https://github.com/laramies/theHarvester
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import json


class SearchDuckDuckGo:

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = 'https://duckduckgo.com/?q='
        self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1'  # Currently using API.
        self.quantity = '100'
        self.limit = limit
        self.proxy = False

    async def do_search(self):
        # Do normal scraping.
        url = self.api.replace('x', self.word)  # Substitute the search term for the 'x' placeholder.
        headers = {'User-Agent': googleUA}
        first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy)
        self.results = first_resp[0]
        self.totalresults += self.results
        urls = await self.crawl(self.results)
        urls = {url for url in urls if len(url) > 5}  # Drop fragments too short to be real URLs.
        all_resps = await AsyncFetcher.fetch_all(urls)
        self.totalresults += ''.join(all_resps)

    async def crawl(self, text):
        """
        Parse the JSON response and return URLs.
        :param text: formatted json
        :return: set of URLs
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # Iterate through keys of dict.
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict) or val is None:
                    continue
                if isinstance(val, list):
                    if len(val) == 0:  # Make sure not indexing an empty list.
                        continue
                    val = val[0]  # First value should be dict.
                    if isinstance(val, dict):  # Sanity check.
                        for value in val.values():
                            if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                                urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # Format is <a href="https://www.website.com"/>
                    start = url.index('=') + 1
                    if start < len(url) and url[start] == '"':  # Skip the opening quote so it isn't captured.
                        start += 1
                    true_url = ''
                    for ch in url[start:]:
                        if ch == '"':
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print(f'Exception occurred: {e}')
            return set()  # Empty set, not a list, so the return type matches the docstring.

    async def get_emails(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return await rawres.emails()

    async def get_hostnames(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return await rawres.hostnames()

    async def process(self, proxy=False):
        self.proxy = proxy
        await self.do_search()  # Only need to search once since using API.
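
For reference, a minimal driver showing how this class is typically exercised end to end. The event-loop setup below is a sketch based on the async methods in this file; the query term and limit are placeholder values, not something this module defines.

# Minimal usage sketch (assumes theHarvester and its dependencies are importable).
import asyncio
from theHarvester.discovery.duckduckgosearch import SearchDuckDuckGo


async def main():
    search = SearchDuckDuckGo('example.com', limit=100)  # Placeholder target and limit.
    await search.process(proxy=False)  # One API search; responses accumulate in totalresults.
    print(await search.get_emails())  # E-mail addresses parsed from the aggregated responses.
    print(await search.get_hostnames())  # Hostnames parsed from the aggregated responses.

asyncio.run(main())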