
/lib/detector/trash.py

https://gitlab.com/billyprice1/dump-scraper
__author__ = 'Davide Tampellini'
__copyright__ = '2015 Davide Tampellini - FabbricaBinaria'
__license__ = 'GNU GPL version 3 or later'

import re

from lib.detector.abstract import AbstractDetector


class TrashDetector(AbstractDetector):
    def __init__(self, level):
        super(TrashDetector, self).__init__(level)

        from collections import OrderedDict

        # Order MATTERS! Functions to detect false positives MUST BE executed first
        self.functions = OrderedDict()

        # According to the level, set the correct function list.
        # The higher the level, the more data I want to extract, so I can sustain false positives.
        if self.level <= 3:
            self.functions['fewLines'] = 1
            self.functions['longLines'] = 1
            self.functions['privateKeys'] = 1
            self.functions['detectRawEmail'] = 1
            self.functions['detectEmailsOnly'] = 1
            self.functions['detectDebug'] = 1.2
            self.functions['detectHtml'] = 1
            self.functions['detectVarious'] = 1
        if self.level <= 2:
            self.functions['detectTimeStamps'] = 1
        if self.level <= 1:
            self.functions['detectIP'] = 1.5

        # Let's log the functions that will be applied
        self.logfunctions()

        # Let's compile some regexes to speed up the execution
        self.regex['emailsOnly'] = re.compile(r'^[\s"]?[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}[\s|\t]?$', re.I | re.M)
        self.regex['debugHex'] = re.compile(r'0x[a-f0-9]{8}', re.I)
        self.regex['winPath'] = re.compile(r'[A-Z]:\\\.*?\\\.*?\\\\', re.M)
        # Chat log 330e8f8887e4ea04b06a6cffc66cfce0 -1 Admin Ban G-SH
        self.regex['chat'] = re.compile(r'[a-f0-9]{32} -\d')
        self.regex['mysqlTable'] = re.compile(r'\+-{10,}?\+', re.M)
        self.regex['startingDigits'] = re.compile(r'^\d{1,4},', re.M)
        self.regex['ip'] = re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')
        self.regex['mysqlDates'] = re.compile(r'(19|20)\d\d[\-/.](0[1-9]|1[012])[\-/.](0[1-9]|[12][0-9]|3[01])')
        self.regex['engDates'] = re.compile(r'(0[1-9]|1[012])[\-/.](0[1-9]|[12][0-9]|3[01])[\-/.](19|20)\d\d')
        self.regex['time'] = re.compile(r'(?:2[0-3]|[01][0-9]):[0-5][0-9](?::[0-5][0-9])?')
        self.regex['htmlTags'] = re.compile(r'</?(?:html|head|body|div|p|div|script|link|span|u|ul|li|ol|a)+\s*/?>', re.I)
        self.regex['htmlLinks'] = re.compile(r'\b(?:(?:https?|udp)://|www\.)[-A-Z0-9+&@#/%=~_|$?!:,.]*[A-Z0-9+&@#/%=~_|$]', re.I)
        self.regex['md5links'] = re.compile(r'(?:(?:https?|udp)://|www\.)[-A-Z0-9+&@#/%=~_|$?!:,.]*[A-Z0-9+&@#/%=~_|$]=[a-f0-9]{32}', re.I)
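        # Illustrative samples of what some of the patterns above are meant to match
        # (my reading of the regexes, not examples taken from the original file):
        #   emailsOnly     -> a line containing nothing but 'john.doe@example.com'
        #   mysqlTable     -> '+------------+' borders produced by the mysql CLI
        #   startingDigits -> a line opening with a row id such as '1234,'
        #   time           -> '23:59' or '23:59:59'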

    def analyze(self, results):
        for function, coefficient in self.functions.iteritems():
            self.score += getattr(self, function)() * coefficient

            # Stop as soon as the file is clearly trash
            if self.score >= 3:
                break

    def returnkey(self):
        return 'trash'

    def fewLines(self):
        # If I just have a few lines, most likely it's trash. I have to do this since sometimes some debug output is
        # crammed into a single line, screwing up all the stats
        if self.lines < 3:
            return 3

        return 0

    def longLines(self):
        """
        Files with huge lines are debug info
        :return:
        """
        # This is a special case: porn passwords usually have tons of keywords and long lines (4k+)
        # Let's manually add an exception for those files and hope for the best
        if self.data.count('XXX Porn Passwords') > 0:
            return 0

        lines = self.data.split("\n")

        for line in lines:
            if len(line) > 1000:
                return 3

        return 0

    def privateKeys(self):
        """
        RSA private keys
        :return:
        """
        if self.data.count('---BEGIN') > 0:
            return 3

        return 0

    def detectRawEmail(self):
        """
        Detects emails in "raw mode"
        :return:
        """
        if self.data.count('Content-Type:') > 0:
            return 3

        return 0

    def detectEmailsOnly(self):
        """
        Detect files that are just a list of email addresses, which are useless for us
        :return:
        """
        emails = re.findall(self.regex['emailsOnly'], self.data)

        return len(emails) / self.lines

    def detectDebug(self):
        """
        Files with debug info
        :return: float
        """
        data_lower = self.data.lower()

        # Windows paths
        score = len(re.findall(self.regex['winPath'], self.data))
        score += len(re.findall(self.regex['debugHex'], self.data))
        # Windows registry keys
        score += data_lower.count('hklm\\')
        score += data_lower.count('debug')
        score += data_lower.count('[trace]')
        score += data_lower.count('session')
        score += data_lower.count('class=')
        score += data_lower.count('thread')
        score += data_lower.count('uuid')
        score += len(re.findall(self.regex['chat'], self.data))

        return score / self.lines

    def detectIP(self):
        """
        Files with IPs are most likely access log files
        :return:
        """
        multiplier = 1

        # Do I have a table dump? If so I have to lower the score
        insert = self.data.count('INSERT INTO')
        mysql = len(re.findall(self.regex['mysqlTable'], self.data))
        # Do I have lines starting with a number? Maybe it's a table dump without any MySQL markup
        digits = len(re.findall(self.regex['startingDigits'], self.data)) / self.lines

        if insert > 1 or mysql > 1 or digits > 0.25:
            multiplier = 0.01

        ip = len(re.findall(self.regex['ip'], self.data)) * multiplier

        return ip / self.lines

    def detectTimeStamps(self):
        """
        Files with a lot of timestamps most likely are log files
        :return:
        """
        multiplier = 1

        # Do I have a table dump? If so I have to lower the score of the timestamps, since most likely it's the creation time
        insert = self.data.count('INSERT INTO')
        mysql = len(re.findall(self.regex['mysqlTable'], self.data))
        # Do I have lines starting with a number? Maybe it's a table dump without any MySQL markup
        digits = len(re.findall(self.regex['startingDigits'], self.data)) / self.lines
        # Do I have a SQLmap cracked password signature?
        sqlmap = len(re.findall(r'\[INFO\] (cracked|resuming) password', self.data)) / self.lines

        if insert > 1 or mysql > 1 or digits > 0.25 or sqlmap > 0.25:
            multiplier = 0.01

        # MySQL dates - 2015-11-02
        dates = len(re.findall(self.regex['mysqlDates'], self.data)) * multiplier
        score = dates / self.lines

        # English dates - 11-25-2015
        dates = len(re.findall(self.regex['engDates'], self.data)) * multiplier
        score += dates / self.lines

        # Search for the time only if the previous regexes didn't match anything.
        # Otherwise I'll count timestamps YYYY-mm-dd HH:ii:ss twice
        if score < 0.01:
            time = len(re.findall(self.regex['time'], self.data)) * multiplier
            score += time / self.lines

        return score

    def detectHtml(self):
        """
        HTML tags in the file, most likely garbage
        :return:
        """
        # HTML tags (only the most used ones are here)
        score = len(re.findall(self.regex['htmlTags'], self.data)) * 1.5
        # Links
        score += len(re.findall(self.regex['htmlLinks'], self.data)) * 0.5
        # Links containing md5 hash
        score += len(re.findall(self.regex['md5links'], self.data))

        return score / self.lines

    def detectVarious(self):
        data_lower = self.data.lower()

        score = data_lower.count('e-mail found')

        # We moved these checks directly into the scraping step
        # The #EXTINF signature flags a file we're not interested in
        # if data_lower.count('#extinf'):
        #     return 3
        # XML files
        # if data_lower.count('<?xml version="1.0" encoding="utf-8"?>'):
        #     return 3

        return score / self.lines
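
For orientation, below is a minimal usage sketch of how this detector could be driven. It assumes the dump-scraper package is importable and that the data, lines and score attributes used throughout the class are plain attributes the caller may set (an assumption read off the code above; AbstractDetector is not shown here). The read_dump helper, the sample path and the 1.0 cut-off are hypothetical and only for illustration.

# Minimal usage sketch (assumptions noted above): run TrashDetector against one scraped file.
from lib.detector.trash import TrashDetector


def read_dump(path):
    # Hypothetical helper: return the raw text of a scraped paste
    with open(path, 'rb') as handle:
        return handle.read()


detector = TrashDetector(1)                     # level 1: apply the full function list, including detectIP
detector.data = read_dump('dumps/example.txt')  # hypothetical sample path
# Assumption: self.lines is the line count used as the denominator in the scoring methods;
# a float avoids Python 2 integer division.
detector.lines = float(detector.data.count("\n") + 1)
detector.score = 0

detector.analyze(results={})                    # the results argument is unused by this detector

if detector.score >= 1.0:                       # hypothetical cut-off; analyze() itself stops early at score >= 3
    print('%s: score %.2f' % (detector.returnkey(), detector.score))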