
/theHarvester/discovery/duckduckgosearch.py

https://github.com/laramies/theHarvester
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import json


class SearchDuckDuckGo:

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = 'https://duckduckgo.com/?q='
        self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1'  # Currently using API.
        self.quantity = '100'
        self.limit = limit
        self.proxy = False

    async def do_search(self):
        # Query the DuckDuckGo API, then fetch every URL found in the response.
        url = self.api.replace('x', self.word)
        headers = {'User-Agent': googleUA}
        first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy)
        self.results = first_resp[0]
        self.totalresults += self.results
        urls = await self.crawl(self.results)
        urls = {url for url in urls if len(url) > 5}
        all_resps = await AsyncFetcher.fetch_all(urls)
        self.totalresults += ''.join(all_resps)

    async def crawl(self, text):
        """
        Parse the JSON response and return the URLs found in it.
        :param text: formatted JSON
        :return: set of URLs
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # Iterate through keys of dict.
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict) or val is None:
                    continue
                if isinstance(val, list):
                    if len(val) == 0:  # Make sure not indexing an empty list.
                        continue
                    val = val[0]  # First value should be dict.
                    if isinstance(val, dict):  # Sanity check.
                        for subkey in val.keys():
                            value = val.get(subkey)
                            # Parentheses are required here; without them 'or' binds last
                            # and non-string values would slip through.
                            if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                                urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # Format is <href="https://www.website.com"/>
                    equal_index = url.index('=')
                    true_url = ''
                    for ch in url[equal_index + 1:]:  # Copy characters up to the closing quote.
                        if ch == '"':
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print(f'Exception occurred: {e}')
            return set()  # Empty set, matching the normal return type.

    async def get_emails(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return await rawres.emails()

    async def get_hostnames(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return await rawres.hostnames()

    async def process(self, proxy=False):
        self.proxy = proxy
        await self.do_search()  # Only need to search once since using the API.