
/logfilter.py

https://github.com/fabricioferracioli/mestrado_files
Python | 354 lines | 282 code | 49 blank | 23 comment | 68 complexity | b7979e27dba73a3c9db50c219e9ac829 MD5
#!/usr/bin/python
# -*- coding: utf-8 -*-
# general filter script
import logutil, mining_database, datetime

class LogFilter:
    def __init__(self, configFile):
        self.extensionsToExclude = ['.gif', '.png', '.jpg', '.jpeg', '.svg', '.ico', '.css', '.js']
        self.configFile = configFile

    def isInGroup(self, searchFor, inGroup):
        # returns True if any entry of inGroup occurs as a substring of searchFor
        for index in range(len(inGroup)):
            if searchFor.find(inGroup[index]) >= 0:
                return True
        return False

    def findUrlInLogLine(self, line):
        # the request is quoted in the log line, e.g. "GET /index.php HTTP/1.1";
        # the url is the second token inside the quotes
        begin = line.index('"')
        end = line.rindex('"')
        return line[begin:end].split()[1]

    def findActualTask(self, tasks, line):
        for i in range(len(tasks)):
            if line.find(tasks[i]['initial_url']) != -1:
                return i
        return None

    def isInitialUrl(self, line, tasks):
        for task in tasks:
            if line.find(task['initial_url']) != -1:
                return True
        return False

    def isFinalUrl(self, line, tasks):
        for task in tasks:
            if line.find(task['final_url']) != -1:
                return True
        return False

    def timeBetweenRequests(self, firstAccess, secondAccess):
        # timestamps are parsed as day-month-year, i.e. 'DD-MM-YYYY HH:MM:SS'
        fd = firstAccess.split()[0].split('-')
        ft = firstAccess.split()[1].split(':')
        fdt = datetime.datetime(int(fd[2]), int(fd[1]), int(fd[0]), int(ft[0]), int(ft[1]), int(ft[2]))
        sd = secondAccess.split()[0].split('-')
        st = secondAccess.split()[1].split(':')
        sdt = datetime.datetime(int(sd[2]), int(sd[1]), int(sd[0]), int(st[0]), int(st[1]), int(st[2]))
        return sdt - fdt

    def reachMaxIdleTime(self, firstAccessTime, secondAccessTime):
        # timedelta(days, seconds, microseconds, milliseconds, minutes): a 30-minute session limit
        return self.timeBetweenRequests(firstAccessTime, secondAccessTime) > datetime.timedelta(0, 0, 0, 0, 30)
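
    # Worked example of the idle-time check (the 'DD-MM-YYYY HH:MM:SS' format is
    # inferred from the parsing order above, so treat the sample values as an
    # assumption, not repository data):
    #   timeBetweenRequests('01-05-2012 10:00:00', '01-05-2012 10:45:00') -> 0:45:00
    #   reachMaxIdleTime('01-05-2012 10:00:00', '01-05-2012 10:45:00')    -> True (over 30 min)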

    def userCompletedTheTask(self, urlSequence, initialAccess, taskConfigs):
        line = str(initialAccess[1])
        initial_access_time = final_access_time = 0
        for i in range(len(urlSequence)):
            # check whether the user aborted and started any other task
            for config in taskConfigs:
                if (urlSequence[i][1] == config[1]):
                    print 'another task was started'
                    return False
            # now check the access time between two urls (current - previous)
            startNewSession = False
            if (i == 0):
                startNewSession = self.reachMaxIdleTime(initialAccess[3], urlSequence[i][3])
                initial_access_time = initialAccess[3]
            else:
                startNewSession = self.reachMaxIdleTime(urlSequence[i-1][3], urlSequence[i][3])
            if (startNewSession == False):
                line = line + ' ' + str(urlSequence[i][1])
                # if the current url is the one that finishes the task, we can return;
                # note that 'config' here is the variable leaked from the inner for
                # loop above, i.e. the last entry of taskConfigs
                if (i > 0 and urlSequence[i][1] == config[2]):
                    time_spent = self.timeBetweenRequests(initial_access_time, urlSequence[i][3])
                    print 'task completed successfully'
                    return [line, int(time_spent.total_seconds())]
            else:
                # urls after the session has expired are ignored
                print 'session expired'
                time_spent = self.timeBetweenRequests(initial_access_time, urlSequence[i][3])
                return [line, int(time_spent.total_seconds())]
        time_spent = self.timeBetweenRequests(initial_access_time, urlSequence[i][3])
        return [line, int(time_spent.total_seconds())]
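
    # Note on the access tuples consumed above: from the queries in sessions()
    # the column order appears to be (id, page_id, user_id, date), so access[1]
    # is the page id, access[2] the user id and access[3] the request timestamp.
    # This layout is inferred from usage, not documented in the file.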

    def normalizeLine(self, line, maxUrls, withTime = False):
        if withTime == True:
            if len(line.split()) < maxUrls * 2:
                line = line + (maxUrls*2 - len(line.split())) * ' 0 :00'
        else:
            if len(line.split()) < maxUrls+1:
                # pad with -1 (instead of 0) because somtoolbox treats that value as null
                # use maxUrls+1 because the initial url is already in the user's history
                line = line + (maxUrls+1 - len(line.split())) * ' -1'
        return line
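
    # For instance (hypothetical values): normalizeLine('12 5 7', 5) pads the
    # 3 existing tokens up to maxUrls+1 = 6, yielding '12 5 7 -1 -1 -1'.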

    def initialFilter(self, logFile, filteredLog = 'output.log'):
        if (logFile == None):
            raise Exception('WrongParameter', 'The logFile must be specified.')
        if (logFile == filteredLog):
            raise Exception('WrongParameter', 'The values of the arguments are equal. Please set different values.')
        lines_included = 0
        lines_excluded = 0
        inputlogfile = open(logFile, 'r')
        outputlogfile = open(filteredLog, 'w')
        for line in inputlogfile:
            include_line = True
            for extension in self.extensionsToExclude:
                if line.find(extension) != -1:
                    lines_excluded += 1
                    include_line = False
                    break
            else:
                if (include_line):
                    outputlogfile.write(line)
                    lines_included += 1
        inputlogfile.close()
        outputlogfile.close()
        return {'excluded': lines_excluded, 'included': lines_included}

    def filterUrls(self, logFile = 'output.log'):
        if (logFile == None):
            raise Exception('WrongParameter', 'logFile parameter must be a valid path to a filtered log file')
        if (self.configFile == logFile):
            raise Exception('WrongParameter', 'The two arguments must be different.')
        # initial and final urls of the tasks to be filtered, plus the maximum number
        # of requests between the urls, to avoid keeping uncompleted tasks
        initial_urls = []
        final_urls = []
        max_req_between_urls = []
        urlsfile = open(self.configFile, 'r')
        for line in urlsfile:
            splitted_line = line.split()
            initial_urls.append(splitted_line[0])
            final_urls.append(splitted_line[1])
            max_req_between_urls.append(int(splitted_line[2]))
        urlsfile.close()
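        # A hypothetical config line (the urls and task id are illustrative only):
        #   /task/start.php /task/finish.php 10 task1.log
        # i.e. initial_url final_url max_requests task_id; this method reads the
        # first three fields, while filterTasks and identifyTasks also use the fourth.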
        logfile = open(logFile, 'r')
        outputfile = open('urls_output.log', 'w')
        index_tested = 0
        num_searches = 0
        include_line = False
        for line in logfile:
            if include_line == False:
                num_searches = 0
                for index in range(len(initial_urls)):
                    if line.find(initial_urls[index]) >= 0:
                        index_tested = index
                        num_searches += 1
                        include_line = True
                        outputfile.write(line)
                        break
            else:
                if num_searches <= max_req_between_urls[index_tested]:
                    if self.isInGroup(line, initial_urls) == True:
                        num_searches = 1
                    outputfile.write(line)
                    num_searches += 1
                    if line.find(final_urls[index_tested]) >= 0:
                        include_line = False
                else:
                    if self.isInGroup(line, initial_urls) == True:
                        outputfile.write(line)
                    include_line = False
        logfile.close()
        outputfile.close()
        return True

    def filterTasks(self, filteredUrlsFile = 'urls_output.log'):
        if (filteredUrlsFile == None):
            raise Exception('WrongParameter', 'You must specify a log file with the filtered urls')
        if (self.configFile == filteredUrlsFile):
            raise Exception('WrongParameter', 'The config and filtered urls file must be different')
        tasks_file = open(self.configFile, 'r')
        tasks = []
        for line in tasks_file:
            params = line.split()
            tasks.append({'id': params[3], 'initial_url': params[0], 'final_url': params[1], 'max_requisitions': params[2]})
            # if the file does not exist it is created; if it exists it is truncated, then closed right away
            open(params[3], 'w').close()
        tasks_file.close()
        logfile = open(filteredUrlsFile, 'r')
        num_searches = 0
        task_index = 0
        # note: outputfile is only bound once an initial url is seen, so the
        # filtered log is expected to start with an initial url
        for line in logfile:
            if self.isInitialUrl(line, tasks):
                if task_index == -1:
                    outputfile.close()
                task_index = self.findActualTask(tasks, line)
                outputfile = open(tasks[task_index]['id'], 'a')
                outputfile.write(line)
                num_searches = 1
            else:
                if num_searches <= int(tasks[task_index]['max_requisitions']):
                    num_searches += 1
                    if self.isFinalUrl(line, tasks):
                        task_index = -1
                        num_searches = 0
                    outputfile.write(line)
        outputfile.close()
        return True

    def identifyTasks(self, forceDbCreation):
        db = mining_database.MiningDatabase(forceDbCreation)
        db.createTables()
        logfiles = []
        inputfile = open(self.configFile, 'r')
        for line in inputfile:
            conf = line.split()
            logfiles.append(conf[3])
            db.insertPage(conf[0])
            db.insertPage(conf[1])
            db.insertConfig(conf[0], conf[1], conf[2], conf[3])
        inputfile.close()
        lu = logutil.LogUtil()
        for logfilepath in logfiles:
            try:
                logfile = open(logfilepath, 'r')
                for line in logfile:
                    requester = lu.getRequester(line)
                    if len(db.searchUser('ip', 'like', requester).fetchall()) == 0:
                        db.insertUser(requester)
                    document_requested = lu.getRequestedDocument(line)
                    if len(db.searchPage('page', 'like', document_requested).fetchall()) == 0:
                        db.insertPage(document_requested)
                    request_date = lu.getRequestDate(line)
                    server_response_status = lu.getServerResponseStatus(line)
                    response_size = lu.getResponseSize(line)
                    db.insertAccess(requester, document_requested, request_date, server_response_status, response_size)
            except IOError:
                print 'File ' + logfilepath + ' not found'
        return True

    def sessions(self, runSom):
        db = mining_database.MiningDatabase()
        configs = db.searchConfig('1', '!=', '2').fetchall()
        ext = 'ghsom'
        if runSom:
            ext = 'som'
        # with the ids of the initial requests, split the sessions per task
        for config in configs:
            file_lines = []
            begin_of_tasks = db.customQuery('select * from access where page_id = '+str(config[1])).fetchall()
            # with the start of each task we can capture the sequence of accessed urls
            counter = 1
            for i in range(len(begin_of_tasks)):
                if i < len(begin_of_tasks) - 1:
                    subsequent_requests = db.customQuery('select * from access where id > '+str(begin_of_tasks[i][0])+' AND user_id = '+str(begin_of_tasks[i][2])+' AND id < '+str(begin_of_tasks[i+1][0])+' limit '+str(config[3]))
                else:
                    subsequent_requests = db.customQuery('select * from access where id > '+str(begin_of_tasks[i][0])+' AND user_id = '+str(begin_of_tasks[i][2])+' limit '+str(config[3]))
                # at this point we have the accesses subsequent to the start of the
                # current task for one user, up to the maximum number of urls allowed
                # for that task; we must check whether the user reaches the final url,
                # whether the session time limit is hit, and make sure no other task
                # starts in between the requests
                requests = subsequent_requests.fetchall()
                if len(requests) > 0:
                    line = self.userCompletedTheTask(requests, begin_of_tasks[i], configs)
                else:
                    line = [str(begin_of_tasks[i][1]), 0]
                if line != False:
                    print line
                    file_lines.append(self.normalizeLine(line[0], config[3])+' '+str(line[1])+' access_'+str(counter))
                    counter += 1
            # build the input file for the current task
            # somtoolbox documentation: http://www.ifs.tuwien.ac.at/dm/somtoolbox/index.html
            # but since ghsom 1.6 is used here, check this documentation instead:
            # http://www.ifs.tuwien.ac.at/~andi/ghsom/index.html
            # vec_dim is config[3]+2 because the initial url and the time spent are also counted
            filename = config[4]+'.in.'+ext
            header = '$TYPE '+config[4]+'\n$XDIM '+str(len(file_lines))+'\n$YDIM 1 \n$VEC_DIM '+str(config[3]+2)+'\n'
            som_file = open(filename, 'w')
            som_file.write(header)
            for access in file_lines:
                som_file.write(access+'\n')
            som_file.close()
            print ' -file '+filename+' generated'
            template = config[4]+'.t.'+ext
            header = '$TYPE '+config[4]+'_template\n$XDIM 7\n$YDIM '+str(len(file_lines))+'\n$VEC_DIM '+str(config[3]+2)+'\n'
            template_file = open(template, 'w')
            template_file.write(header)
            for i in range(config[3]+2):
                template_file.write(str(i)+' url_'+str(i)+' 1 1 1 1 1.0\n')
            template_file.close()
            print ' -file '+template+' generated'
            prop = config[4]+'.'+ext+'.prop'
            prop_file = open(prop, 'w')
            expand_cycles = '4'
            initial_x_size = '2'
            initial_y_size = '2'
            tau_1 = '0.2'
            tau_2 = '0.1'
            orientation = 'true'
            if runSom:
                expand_cycles = '100'
                initial_x_size = '3'
                initial_y_size = '4'
                tau_1 = '1.0'
                tau_2 = '1.0'
                orientation = 'false'
            prop_file.write('EXPAND_CYCLES='+expand_cycles+'\n')
            prop_file.write('MAX_CYCLES=0\n')
            prop_file.write('TAU_1='+tau_1+'\n')
            prop_file.write('TAU_2='+tau_2+'\n')
            prop_file.write('INITIAL_LEARNRATE=0.8\n')
            prop_file.write('NR=0.0006\n')
            prop_file.write('HTML_PREFIX='+config[4]+'\n')
            prop_file.write('DATAFILE_EXTENSION=\n')
            prop_file.write('randomSeed=17\n')
            prop_file.write('inputFile=./'+filename+'\n')
            prop_file.write('descriptionFile=./'+template+'\n')
            prop_file.write('savePath=./output\n')
            prop_file.write('printMQE=false\n')
            prop_file.write('normInputVectors=NONE\n')
            prop_file.write('saveAsHTML=true\n')
            prop_file.write('saveAsSOMLib=true\n')
            prop_file.write('INITIAL_X_SIZE='+initial_x_size+'\n')
            prop_file.write('INITIAL_Y_SIZE='+initial_y_size+'\n')
            prop_file.write('LABELS_NUM=1\n')
            prop_file.write('LABELS_ONLY=true\n')
            prop_file.write('LABELS_THRESHOLD=0.35\n')
            prop_file.write('ORIENTATION='+orientation+'\n')
            # the user can change the default values to better calibrate the method
            prop_file.close()
            print ' -file '+prop+' generated'
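
For context, a minimal driver sketch for the pipeline above, assuming the config file format shown in filterUrls and an Apache-style access log at 'access.log'. The file defines no entry point, so the file names and call order here are an assumption, not part of the repository:

# Hypothetical driver (not part of the repository); 'tasks.conf' and
# 'access.log' are assumed paths.
import logfilter

lf = logfilter.LogFilter('tasks.conf')
stats = lf.initialFilter('access.log')   # strip static-asset requests
print stats                              # {'excluded': ..., 'included': ...}
lf.filterUrls('output.log')              # keep only lines between task urls
lf.filterTasks('urls_output.log')        # split the log into one file per task
lf.identifyTasks(True)                   # load accesses into the mining database
lf.sessions(False)                       # emit GHSOM input/template/prop files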