theHarvester/theHarvester/discovery/duckduckgosearch.py

Source: https://github.com/Wh0ale/SRC-script · Python · 93 lines

from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import json
import requests
import time


class SearchDuckDuckGo:

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = 'https://duckduckgo.com/?q='
        self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1'  # Currently using API.
        self.quantity = '100'
        self.limit = limit

    def do_search(self):
        try:  # Query the DuckDuckGo API.
            url = self.api.replace('x', self.word)
            headers = {'User-Agent': googleUA}
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return  # r is unbound if the request failed, so stop here.
        time.sleep(getDelay())
        self.results = r.text
        self.totalresults += self.results
        urls = self.crawl(self.results)
        for url in urls:
            try:
                self.totalresults += requests.get(url, headers={'User-Agent': Core.get_user_agent()}).text
                time.sleep(getDelay())
            except Exception:
                continue
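
    # Sketch of the response shape crawl() expects. The field names below come
    # from the public DuckDuckGo Instant Answer API and are an assumption for
    # illustration, not something this file defines:
    #
    #   {
    #       "AbstractURL": "https://example.com/about",
    #       "Results": [{"FirstURL": "https://example.com",
    #                    "Result": "<a href=\"https://example.com\">Example</a>"}],
    #       "RelatedTopics": [...]
    #   }
    #
    # crawl() keeps any non-empty string value containing http:// or https://
    # (looking one level deep into the first element of each list), then
    # unwraps href="..." fragments into bare URLs.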

    def crawl(self, text):
        """
        Parse the JSON response and return the URLs found in it.
        :param text: formatted json
        :return: set of URLs
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # Iterate through keys of dict.
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict) or val is None:
                    continue
                if isinstance(val, list):
                    if len(val) == 0:  # Make sure not indexing an empty list.
                        continue
                    val = val[0]  # First value should be dict.
                if isinstance(val, dict):  # Sanity check.
                    for subkey in val.keys():  # Renamed from key to avoid shadowing the outer loop variable.
                        value = val.get(subkey)
                        if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                            urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # Format is <a href="https://www.website.com">.
                    equal_index = url.index('=')
                    true_url = ''
                    for ch in url[equal_index + 1:].lstrip('"'):  # Skip the opening quote so the URL is not cut short.
                        if ch == '"':
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print(f'Exception occurred: {e}')
            import traceback
            traceback.print_exc()  # print_exc() prints the trace itself and returns None.
            return set()  # Match the return type of the success path.

    def get_emails(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.hostnames()

    def process(self):
        self.do_search()  # Only need to search once since using API.
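

# A minimal usage sketch, not part of the original module. It assumes the
# theHarvester package is importable and that googleUA and getDelay() are
# provided by discovery.constants, as the star imports above imply.
if __name__ == '__main__':
    search = SearchDuckDuckGo('example.com', limit=100)
    search.process()               # One API query, then crawl the returned links.
    print(search.get_hostnames())  # Hostnames parsed from all fetched text.
    print(search.get_emails())     # Email addresses parsed from the same text.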