PageRenderTime 65ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 1ms

/lib/eventgensamples.py

https://github.com/Jaykul/eventgen
Python | 1149 lines | 1030 code | 53 blank | 66 comment | 143 complexity | 20da1c12ed45e8c54b653dd5cda477b2 MD5 | raw file
Possible License(s): Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. from __future__ import division, with_statement
  2. import os, sys
  3. import logging
  4. import pprint
  5. import random
  6. import datetime
  7. import re
  8. import csv
  9. import json
  10. import copy
  11. from eventgenoutput import Output
  12. from timeparser import timeParser, timeDelta2secs
  13. import httplib2, urllib
  14. from xml.dom import minidom
  15. from xml.parsers.expat import ExpatError
class Sample:
    """Holds per-sample configuration and generation state for one eventgen sample file.

    All attributes below are declared at class level as defaults; the actual
    per-sample values are filled in externally (by eventgenconfig.Config)
    before gen() is called.
    """
    # Required fields for Sample
    name = None
    app = None
    filePath = None
    # Options which are all valid for a sample
    disabled = None
    spoolDir = None
    spoolFile = None
    breaker = None
    sampletype = None
    mode = None
    interval = None
    delay = None
    count = None
    bundlelines = None
    earliest = None
    latest = None
    hourOfDayRate = None
    dayOfWeekRate = None
    randomizeEvents = None
    randomizeCount = None
    outputMode = None
    fileName = None
    fileMaxBytes = None
    fileBackupFiles = None
    splunkHost = None
    splunkPort = None
    splunkMethod = None
    splunkUser = None
    splunkPass = None
    index = None
    source = None
    sourcetype = None
    host = None
    hostRegex = None
    hostToken = None
    tokens = None
    projectID = None
    accessToken = None
    backfill = None
    backfillSearch = None
    backfillSearchUrl = None
    minuteOfHourRate = None
    timeMultiple = None
    # Internal fields (runtime state, not read from config)
    _c = None                  # back-reference to the Config singleton
    _out = None                # Output instance, created lazily on first gen()
    _sampleLines = None        # cached raw lines of the sample file
    _sampleDict = None         # cached csv.DictReader rows (csv sampletype only)
    _lockedSettings = None
    _priority = None
    _origName = None
    _lastts = None             # timestamp of the last replayed event
    _backfillts = None         # current position while backfilling
    _origEarliest = None       # earliest/latest saved so backfill can restore them
    _origLatest = None
    _timeSinceSleep = None
    def __init__(self, name):
        """Create a sample named *name*; remaining settings are applied by Config."""
        # Logger already setup by config, just get an instance
        logger = logging.getLogger('eventgen')
        # NOTE: the logger is published as a module-level global on purpose so
        # every method in this module can reference bare `logger`.
        globals()['logger'] = logger
        self.name = name
        self.tokens = [ ]
        self._lockedSettings = [ ]
        self._currentevent = 0
        self._rpevents = None
        self._backfilldone = False
        self._timeSinceSleep = datetime.timedelta()
        # Import config here (not at module top) — presumably to avoid a
        # circular import with eventgenconfig; TODO confirm
        from eventgenconfig import Config
        self._c = Config()
    def __str__(self):
        """Only used for debugging, outputs a pretty printed representation of this sample"""
        # Eliminate recursive going back to parent
        temp = dict([ (key, value) for (key, value) in self.__dict__.items() if key != '_c' ])
        return pprint.pformat(temp)
    def __repr__(self):
        return self.__str__()
    def gen(self):
        """Generate one round of events for this sample and hand them to the Output class.

        Returns the number of seconds the caller should sleep before the next
        round (0 while backfilling), or falls through with no return value when
        the sample file contains no data.
        """
        logger.debug("Generating sample '%s' in app '%s'" % (self.name, self.app))
        startTime = datetime.datetime.now()
        # If this is the first time we're generating, setup out
        if self._out == None:
            logger.debug("Setting up Output class for sample '%s' in app '%s'" % (self.name, self.app))
            self._out = Output(self)
            if self.backfillSearchUrl == None:
                self.backfillSearchUrl = self._out._splunkUrl
        # Setup initial backfillts
        if self._backfillts == None and self.backfill != None and not self._backfilldone:
            try:
                self._backfillts = timeParser(self.backfill)
                logger.info("Setting up backfill of %s (%s)" % (self.backfill,self._backfillts))
            except Exception as ex:
                logger.error("Failed to parse backfill '%s': %s" % (self.backfill, ex))
                raise
            # Save configured earliest/latest so they can be restored when backfill completes
            self._origEarliest = self.earliest
            self._origLatest = self.latest
            # When streaming into Splunk, optionally ask Splunk when it last saw
            # an event for this sample so we only backfill the gap
            if self._out._outputMode == "splunkstream" and self.backfillSearch != None:
                if not self.backfillSearch.startswith('search'):
                    self.backfillSearch = 'search ' + self.backfillSearch
                self.backfillSearch += '| head 1 | table _time'
                logger.debug("Searching Splunk URL '%s/services/search/jobs' with search '%s' with sessionKey '%s'" % (self.backfillSearchUrl, self.backfillSearch, self._out._c.sessionKey))
                results = httplib2.Http(disable_ssl_certificate_validation=True).request(\
                            self.backfillSearchUrl + '/services/search/jobs',
                            'POST', headers={'Authorization': 'Splunk %s' % self._out._c.sessionKey}, \
                            body=urllib.urlencode({'search': self.backfillSearch,
                                                   'earliest_time': self.backfill,
                                                   'exec_mode': 'oneshot'}))[1]
                try:
                    temptime = minidom.parseString(results).getElementsByTagName('text')[0].childNodes[0].nodeValue
                    # logger.debug("Time returned from backfill search: %s" % temptime)
                    # Results returned look like: 2013-01-16T10:59:15.411-08:00
                    # But the offset in time can also be +, so make sure we strip that out first
                    if len(temptime) > 0:
                        if temptime.find('+') > 0:
                            temptime = temptime.split('+')[0]
                        temptime = '-'.join(temptime.split('-')[0:3])
                    self._backfillts = datetime.datetime.strptime(temptime, '%Y-%m-%dT%H:%M:%S.%f')
                    logger.debug("Backfill search results: '%s' value: '%s' time: '%s'" % (pprint.pformat(results), temptime, self._backfillts))
                except (ExpatError, IndexError):
                    # Search gave no parseable timestamp; keep the configured backfill start
                    pass
        # Override earliest and latest during backfill until we're at current time
        if self.backfill != None and not self._backfilldone:
            if self._backfillts >= datetime.datetime.now():
                logger.info("Backfill complete")
                self._backfilldone = True
                self.earliest = self._origEarliest
                self.latest = self._origLatest
            else:
                logger.debug("Still backfilling for sample '%s'. Currently at %s" % (self.name, self._backfillts))
                self.earliest = datetime.datetime.strftime((self._backfillts - datetime.timedelta(seconds=self.interval)), \
                                                           "%Y-%m-%d %H:%M:%S.%f")
                self.latest = datetime.datetime.strftime(self._backfillts, "%Y-%m-%d %H:%M:%S.%f")
                # if not self.mode == 'replay':
                #     self._backfillts += datetime.timedelta(seconds=self.interval)
        logger.debug("Opening sample '%s' in app '%s'" % (self.name, self.app) )
        sampleFH = open(self.filePath, 'rU')
        if self.sampletype == 'raw':
            # 5/27/12 CS Added caching of the sample file
            if self._sampleLines == None:
                logger.debug("Reading raw sample '%s' in app '%s'" % (self.name, self.app))
                sampleLines = sampleFH.readlines()
                self._sampleLines = sampleLines
            else:
                sampleLines = self._sampleLines
        elif self.sampletype == 'csv':
            logger.debug("Reading csv sample '%s' in app '%s'" % (self.name, self.app))
            if self._sampleLines == None:
                logger.debug("Reading csv sample '%s' in app '%s'" % (self.name, self.app))
                sampleDict = [ ]
                sampleLines = [ ]
                csvReader = csv.DictReader(sampleFH)
                for line in csvReader:
                    sampleDict.append(line)
                    sampleLines.append(line['_raw'].decode('string_escape'))
                self._sampleDict = copy.deepcopy(sampleDict)
                self._sampleLines = copy.deepcopy(sampleLines)
            else:
                # If we're set to bundlelines, we'll modify sampleLines regularly.
                # Since lists in python are referenced rather than copied, we
                # need to make a fresh copy every time if we're bundlelines.
                # If not, just used the cached copy, we won't mess with it.
                if not self.bundlelines:
                    sampleDict = self._sampleDict
                    sampleLines = self._sampleLines
                else:
                    sampleDict = copy.deepcopy(self._sampleDict)
                    sampleLines = copy.deepcopy(self._sampleLines)
        # Check to see if this is the first time we've run, or if we're at the end of the file
        # and we're running replay. If so, we need to parse the whole file and/or setup our counters
        if self._rpevents == None and self.mode == 'replay':
            if self.sampletype == 'csv':
                self._rpevents = sampleDict
            else:
                if self.breaker != self._c.breaker:
                    # Custom breaker: split the whole file into events on the breaker regex
                    self._rpevents = []
                    lines = '\n'.join(sampleLines)
                    breaker = re.search(self.breaker, lines)
                    currentchar = 0
                    while breaker:
                        self._rpevents.append(lines[currentchar:breaker.start(0)])
                        lines = lines[breaker.end(0):]
                        currentchar += breaker.start(0)
                        breaker = re.search(self.breaker, lines)
                else:
                    # Default breaker: one event per line
                    self._rpevents = sampleLines
            self._currentevent = 0
        # If we are replaying then we need to set the current sampleLines to the event
        # we're currently on
        if self.mode == 'replay':
            if self.sampletype == 'csv':
                sampleDict = [ self._rpevents[self._currentevent] ]
                sampleLines = [ self._rpevents[self._currentevent]['_raw'].decode('string_escape') ]
            else:
                sampleLines = [ self._rpevents[self._currentevent] ]
            self._currentevent += 1
            # If we roll over the max number of lines, roll over the counter and start over
            if self._currentevent >= len(self._rpevents):
                logger.debug("At end of the sample file, starting replay from the top")
                self._currentevent = 0
                self._lastts = None
        # Ensure all lines have a newline
        for i in xrange(0, len(sampleLines)):
            if sampleLines[i][-1] != '\n':
                sampleLines[i] += '\n'
        # If we've set bundlelines, then we want count copies of all of the lines in the file
        # And we'll set breaker to be a weird delimiter so that we'll end up with an events
        # array that can be rated by the hour of day and day of week rates
        # This is only for weird outside use cases like when we want to include a CSV file as the source
        # so we can't set breaker properly
        if self.bundlelines:
            logger.debug("Bundlelines set. Creating %s copies of original sample lines and setting breaker." % (self.count-1))
            self.breaker = '\n------\n'
            origSampleLines = copy.deepcopy(sampleLines)
            sampleLines.append(self.breaker)
            for i in range(0, self.count-1):
                sampleLines.extend(origSampleLines)
                sampleLines.append(self.breaker)
        if len(sampleLines) > 0:
            count = self.count
            if self.count == 0 and self.mode == 'sample':
                logger.debug("Count %s specified as default for sample '%s' in app '%s'; adjusting count to sample length %s; using default breaker" \
                                % (self.count, self.name, self.app, len(sampleLines)) )
                count = len(sampleLines)
                self.breaker = self._c.breaker
            elif self.count > 0 or self.mode == 'replay':
                # 5/8/12 CS We've requested not the whole file, so we should adjust count based on
                # hourOfDay, dayOfWeek and randomizeCount configs
                rateFactor = 1.0
                if self.randomizeCount != 0 and self.randomizeCount != None:
                    try:
                        logger.debug("randomizeCount for sample '%s' in app '%s' is %s" \
                                        % (self.name, self.app, self.randomizeCount))
                        # If we say we're going to be 20% variable, then that means we
                        # can be .1% high or .1% low. Math below does that.
                        randBound = round(self.randomizeCount * 1000, 0)
                        rand = random.randint(0, randBound)
                        randFactor = 1+((-((randBound / 2) - rand)) / 1000)
                        logger.debug("randFactor for sample '%s' in app '%s' is %s" \
                                        % (self.name, self.app, randFactor))
                        rateFactor *= randFactor
                    except:
                        import traceback
                        stack = traceback.format_exc()
                        logger.error("Randomize count failed. Stacktrace %s" % stack)
                if type(self.hourOfDayRate) == dict:
                    try:
                        # While backfilling, rate against the simulated clock, not wall time
                        if self.backfill != None and not self._backfilldone:
                            now = self._backfillts
                        else:
                            now = datetime.datetime.now()
                        rate = self.hourOfDayRate[str(now.hour)]
                        logger.debug("hourOfDayRate for sample '%s' in app '%s' is %s" % (self.name, self.app, rate))
                        rateFactor *= rate
                    except KeyError:
                        import traceback
                        stack = traceback.format_exc()
                        logger.error("Hour of day rate failed. Stacktrace %s" % stack)
                if type(self.dayOfWeekRate) == dict:
                    try:
                        if self.backfill != None and not self._backfilldone:
                            now = self._backfillts
                        else:
                            now = datetime.datetime.now()
                        # Convert Python weekday (Mon=0..Sun=6) to Sun=0..Sat=6 keys
                        weekday = datetime.date.weekday(now)
                        if weekday == 6:
                            weekday = 0
                        else:
                            weekday += 1
                        rate = self.dayOfWeekRate[str(weekday)]
                        logger.debug("dayOfWeekRate for sample '%s' in app '%s' is %s" % (self.name, self.app, rate))
                        rateFactor *= rate
                    except KeyError:
                        import traceback
                        stack = traceback.format_exc()
                        logger.error("Hour of day rate failed. Stacktrace %s" % stack)
                if type(self.minuteOfHourRate) == dict:
                    try:
                        if self.backfill != None and not self._backfilldone:
                            now = self._backfillts
                        else:
                            now = datetime.datetime.now()
                        rate = self.minuteOfHourRate[str(now.minute)]
                        logger.debug("minuteOfHourRate for sample '%s' in app '%s' is %s" % (self.name, self.app, rate))
                        rateFactor *= rate
                    except KeyError:
                        import traceback
                        stack = traceback.format_exc()
                        logger.error("Minute of hour rate failed. Stacktrace %s" % stack)
                count = int(round(count * rateFactor, 0))
                if rateFactor != 1.0:
                    logger.info("Original count: %s Rated count: %s Rate factor: %s" % (self.count, count, rateFactor))
            try:
                breakerRE = re.compile(self.breaker)
            except:
                # Fall back to the default breaker, which takes the simple-fill path
                # below and never dereferences breakerRE
                logger.error("Line breaker '%s' for sample '%s' in app '%s' could not be compiled; using default breaker" \
                            % (self.breaker, self.name, self.app) )
                self.breaker = self._c.breaker
            events = []
            event = ''
            if self.breaker == self._c.breaker:
                logger.debug("Default breaker detected for sample '%s' in app '%s'; using simple event fill" \
                                % (self.name, self.app) )
                logger.debug("Filling events array for sample '%s' in app '%s'; count=%s, sampleLines=%s" \
                                % (self.name, self.app, count, len(sampleLines)) )
                # 5/8/12 CS Added randomizeEvents config to randomize items from the file
                # 5/27/12 CS Don't randomize unless we're raw
                try:
                    # 7/30/12 CS Can't remember why I wouldn't allow randomize Events for CSV so commenting
                    # this out and seeing what breaks
                    #if self.randomizeEvents and self.sampletype == 'raw':
                    if self.randomizeEvents:
                        logger.debug("Shuffling events for sample '%s' in app '%s'" \
                                        % (self.name, self.app))
                        random.shuffle(sampleLines)
                except:
                    logger.error("randomizeEvents for sample '%s' in app '%s' unparseable." \
                                    % (self.name, self.app))
                if count >= len(sampleLines):
                    events = sampleLines
                else:
                    events = sampleLines[0:count]
            else:
                logger.debug("Non-default breaker '%s' detected for sample '%s' in app '%s'; using advanced event fill" \
                                % (self.breaker, self.name, self.app) )
                ## Fill events array from breaker and sampleLines
                breakersFound = 0
                x = 0
                logger.debug("Filling events array for sample '%s' in app '%s'; count=%s, sampleLines=%s" \
                                % (self.name, self.app, count, len(sampleLines)) )
                while len(events) < count and x < len(sampleLines):
                    #logger.debug("Attempting to match regular expression '%s' with line '%s' for sample '%s' in app '%s'" % (breaker, sampleLines[x], sample, app) )
                    breakerMatch = breakerRE.search(sampleLines[x])
                    if breakerMatch:
                        #logger.debug("Match found for regular expression '%s' and line '%s' for sample '%s' in app '%s'" % (breaker, sampleLines[x], sample, app) )
                        ## If not first
                        # 5/28/12 CS This may cause a regression defect, but I can't figure out why
                        # you'd want to ignore the first breaker you find. It's certainly breaking
                        # my current use case.
                        # 6/25/12 CS Definitely caused a regression defect. I'm going to add
                        # a check for bundlelines which is where I need this to work every time
                        if breakersFound != 0 or self.bundlelines:
                            events.append(event)
                            event = ''
                        breakersFound += 1
                    # else:
                    #     logger.debug("Match not found for regular expression '%s' and line '%s' for sample '%s' in app '%s'" % (breaker, sampleLines[x], sample, app) )
                    # If we've inserted the breaker with bundlelines, don't insert the line, otherwise insert
                    if not (self.bundlelines and breakerMatch):
                        event += sampleLines[x]
                    x += 1
                ## If events < count append remaining data in samples
                if len(events) < count:
                    events.append(event + '\n')
                ## If breaker wasn't found in sample
                ## events = sample
                if breakersFound == 0:
                    logger.warn("Breaker '%s' not found for sample '%s' in app '%s'; using default breaker" % (self.breaker, self.name, self.app) )
                    if count >= len(sampleLines):
                        events = sampleLines
                    else:
                        events = sampleLines[0:count]
                else:
                    logger.debug("Found '%s' breakers for sample '%s' in app '%s'" % (breakersFound, self.name, self.app) )
            ## Continue to fill events array until len(events) == count
            if len(events) > 0 and len(events) < count:
                logger.debug("Events fill for sample '%s' in app '%s' less than count (%s vs. %s); continuing fill" % (self.name, self.app, len(events), count) )
                tempEvents = events[:]
                while len(events) < count:
                    y = 0
                    while len(events) < count and y < len(tempEvents):
                        events.append(tempEvents[y])
                        y += 1
            # logger.debug("events: %s" % pprint.pformat(events))
            logger.debug("Replacing %s tokens in %s events for sample '%s' in app '%s'" % (len(self.tokens), len(events), self.name, self.app))
            if self.sampletype == 'csv':
                # Seed output metadata from the first csv row; per-row changes are handled below
                self.index = sampleDict[0]['index']
                self.host = sampleDict[0]['host']
                self.source = sampleDict[0]['source']
                self.sourcetype = sampleDict[0]['sourcetype']
                logger.debug("Sampletype CSV. Setting self._out to CSV parameters. index: '%s' host: '%s' source: '%s' sourcetype: '%s'" \
                                % (self.index, self.host, self.source, self.sourcetype))
                self._out.refreshconfig(self)
            # Find interval before we muck with the event but after we've done event breaking
            if self.mode == 'replay':
                logger.debug("Finding timestamp to compute interval for events")
                if self._lastts == None:
                    if self.sampletype == 'csv':
                        self._lastts = self._getTSFromEvent(self._rpevents[self._currentevent]['_raw'])
                    else:
                        self._lastts = self._getTSFromEvent(self._rpevents[self._currentevent])
                if (self._currentevent+1) < len(self._rpevents):
                    if self.sampletype == 'csv':
                        nextts = self._getTSFromEvent(self._rpevents[self._currentevent+1]['_raw'])
                    else:
                        nextts = self._getTSFromEvent(self._rpevents[self._currentevent+1])
                else:
                    logger.debug("At end of _rpevents")
                    return 0
                logger.debug('Computing timeDiff nextts: "%s" lastts: "%s"' % (nextts, self._lastts))
                # Sleep for the gap between consecutive replayed events (never negative)
                timeDiff = nextts - self._lastts
                if timeDiff.days >= 0 and timeDiff.seconds >= 0 and timeDiff.microseconds >= 0:
                    partialInterval = float("%d.%06d" % (timeDiff.seconds, timeDiff.microseconds))
                else:
                    partialInterval = 0
                if self.timeMultiple > 0:
                    partialInterval *= self.timeMultiple
                logger.debug("Setting partialInterval for replay mode with timeMultiple %s: %s %s" % (self.timeMultiple, timeDiff, partialInterval))
                self._lastts = nextts
            ## Iterate events
            for x in range(0, len(events)):
                event = events[x]
                # Maintain state for every token in a given event
                # Hash contains keys for each file name which is assigned a list of values
                # picked from a random line in that file
                mvhash = { }
                ## Iterate tokens
                for token in self.tokens:
                    token.mvhash = mvhash
                    event = token.replace(event)
                if(self.hostToken):
                    # clear the host mvhash every time, because we need to re-randomize it
                    self.hostToken.mvhash = {}
                # Hack for bundle lines to work with sampletype csv
                # Basically, bundlelines allows us to create copies of a bundled set of
                # of events as one event, and this splits those back out so that we properly
                # send each line with the proper sourcetype and source if we're we're sampletype csv
                if self.bundlelines and self.sampletype == 'csv':
                    # Trim last newline so we don't end up with blank at end of the array
                    if event[-1] == '\n':
                        event = event[:-1]
                    lines = event.split('\n')
                    logger.debug("Bundlelines set and sampletype csv, breaking event back apart. %s lines." % (len(lines)))
                    for lineno in range(0, len(lines)):
                        if self.sampletype == 'csv' and (sampleDict[lineno]['index'] != self.index or \
                                                         sampleDict[lineno]['host'] != self.host or \
                                                         sampleDict[lineno]['source'] != self.source or \
                                                         sampleDict[lineno]['sourcetype'] != self.sourcetype):
                            # Flush events before we change all the various parameters
                            logger.debug("Sampletype CSV with bundlelines, parameters changed at event %s. Flushing output." % lineno)
                            self._out.flush()
                            self.index = sampleDict[lineno]['index']
                            self.host = sampleDict[lineno]['host']
                            # Allow randomizing the host:
                            if(self.hostToken):
                                self.host = self.hostToken.replace(self.host)
                            self.source = sampleDict[lineno]['source']
                            self.sourcetype = sampleDict[lineno]['sourcetype']
                            logger.debug("Sampletype CSV. Setting self._out to CSV parameters. index: '%s' host: '%s' source: '%s' sourcetype: '%s'" \
                                            % (self.index, self.host, self.source, self.sourcetype))
                            self._out.refreshconfig(self)
                        self._out.send(lines[lineno])
                    logger.debug("Completed bundlelines event. Flushing.")
                    self._out.flush()
                else:
                    # logger.debug("Sample Index: %s Host: %s Source: %s Sourcetype: %s" % (self.index, self.host, self.source, self.sourcetype))
                    # logger.debug("Event Index: %s Host: %s Source: %s Sourcetype: %s" % (sampleDict[x]['index'], sampleDict[x]['host'], sampleDict[x]['source'], sampleDict[x]['sourcetype']))
                    if self.sampletype == 'csv' and (sampleDict[x]['index'] != self.index or \
                                                     sampleDict[x]['host'] != self.host or \
                                                     sampleDict[x]['source'] != self.source or \
                                                     sampleDict[x]['sourcetype'] != self.sourcetype):
                        # Flush events before we change all the various parameters
                        logger.debug("Sampletype CSV, parameters changed at event %s. Flushing output." % x)
                        self._out.flush()
                        self.index = sampleDict[x]['index']
                        self.host = sampleDict[x]['host']
                        # Allow randomizing the host:
                        if(self.hostToken):
                            self.host = self.hostToken.replace(self.host)
                        self.source = sampleDict[x]['source']
                        self.sourcetype = sampleDict[x]['sourcetype']
                        logger.debug("Sampletype CSV. Setting self._out to CSV parameters. index: '%s' host: '%s' source: '%s' sourcetype: '%s'" \
                                        % (self.index, self.host, self.source, self.sourcetype))
                        self._out.refreshconfig(self)
                    self._out.send(event)
            ## Close file handles
            self._out.flush()
            sampleFH.close()
            endTime = datetime.datetime.now()
            timeDiff = endTime - startTime
            if self.mode == 'sample':
                # timeDiffSecs = timeDelta2secs(timeDiff)
                timeDiffSecs = float("%d.%06d" % (timeDiff.seconds, timeDiff.microseconds))
                wholeIntervals = timeDiffSecs / self.interval
                partialInterval = timeDiffSecs % self.interval
                if wholeIntervals > 1:
                    logger.warn("Generation of sample '%s' in app '%s' took longer than interval (%s seconds vs. %s seconds); consider adjusting interval" \
                                % (self.name, self.app, timeDiff, self.interval) )
                partialInterval = self.interval - partialInterval
            # No rest for the wicked! Or while we're doing backfill
            if self.backfill != None and not self._backfilldone:
                # Since we would be sleeping, increment the timestamp by the amount of time we're sleeping
                incsecs = round(partialInterval / 1, 0)
                incmicrosecs = partialInterval % 1
                self._backfillts += datetime.timedelta(seconds=incsecs, microseconds=incmicrosecs)
                partialInterval = 0
            self._timeSinceSleep += timeDiff
            if partialInterval > 0:
                timeDiffFrac = "%d.%06d" % (self._timeSinceSleep.seconds, self._timeSinceSleep.microseconds)
                logger.info("Generation of sample '%s' in app '%s' completed in %s seconds. Sleeping for %f seconds" \
                            % (self.name, self.app, timeDiffFrac, partialInterval) )
                self._timeSinceSleep = datetime.timedelta()
            return partialInterval
        else:
            logger.warn("Sample '%s' in app '%s' contains no data" % (self.name, self.app) )
  522. ## Replaces $SPLUNK_HOME w/ correct pathing
  523. def pathParser(self, path):
  524. greatgreatgrandparentdir = os.path.dirname(os.path.dirname(self._c.grandparentdir))
  525. sharedStorage = ['$SPLUNK_HOME/etc/apps', '$SPLUNK_HOME/etc/users/', '$SPLUNK_HOME/var/run/splunk']
  526. ## Replace windows os.sep w/ nix os.sep
  527. path = path.replace('\\', '/')
  528. ## Normalize path to os.sep
  529. path = os.path.normpath(path)
  530. ## Iterate special paths
  531. for x in range(0, len(sharedStorage)):
  532. sharedPath = os.path.normpath(sharedStorage[x])
  533. if path.startswith(sharedPath):
  534. path.replace('$SPLUNK_HOME', greatgreatgrandparentdir)
  535. break
  536. ## Split path
  537. path = path.split(os.sep)
  538. ## Iterate path segments
  539. for x in range(0, len(path)):
  540. segment = path[x].lstrip('$')
  541. ## If segement is an environment variable then replace
  542. if os.environ.has_key(segment):
  543. path[x] = os.environ[segment]
  544. ## Join path
  545. path = os.sep.join(path)
  546. return path
    def _getTSFromEvent(self, event):
        """Extract and return a datetime parsed from *event* using this sample's tokens.

        Tries each token's regex in order; the first whose match parses with the
        token's replacement time format wins. Raises ValueError when no token
        yields a parseable timestamp.
        """
        currentTime = None
        formats = [ ]
        # JB: 2012/11/20 - Can we optimize this by only testing tokens of type = *timestamp?
        # JB: 2012/11/20 - Alternatively, documentation should suggest putting timestamp as token.0.
        for token in self.tokens:
            try:
                # Collected only for the error message below
                formats.append(token.token)
                # logger.debug("Searching for token '%s' in event '%s'" % (token.token, event))
                results = token._search(event)
                if results:
                    timeFormat = token.replacement
                    # Use capture group 1 when the pattern defines one, else the whole match
                    group = 0 if len(results.groups()) == 0 else 1
                    timeString = results.group(group)
                    # logger.debug("Testing '%s' as a time string against '%s'" % (timeString, timeFormat))
                    if timeFormat == "%s":
                        # Epoch format: scale values longer than 10 digits
                        # (sub-second precision) down to seconds
                        ts = float(timeString) if len(timeString) < 10 else float(timeString) / (10**(len(timeString)-10))
                        currentTime = datetime.datetime.fromtimestamp(ts)
                    else:
                        currentTime = datetime.datetime.strptime(timeString, timeFormat)
                    logger.debug("Match '%s' Format '%s' result: '%s'" % (timeString, timeFormat, currentTime))
                    if type(currentTime) == datetime.datetime:
                        break
            except ValueError:
                # Regex matched but strptime/float rejected it; try the next token
                logger.debug("Match found ('%s') but time parse failed. Timeformat '%s' Event '%s'" % (timeString, timeFormat, event))
        if type(currentTime) != datetime.datetime:
            # Total fail
            logger.error("Can't find a timestamp (using patterns '%s') in this event: '%s'." % (formats, event))
            raise ValueError("Can't find a timestamp (using patterns '%s') in this event: '%s'." % (formats, event))
        # Check to make sure we parsed a year; strptime defaults to 1900 when
        # the format has no year directive, so substitute the current year
        if currentTime.year == 1900:
            currentTime = currentTime.replace(year=datetime.datetime.now().year)
        return currentTime
  580. def saveState(self):
  581. """Saves state of all integer IDs of this sample to a file so when we restart we'll pick them up"""
  582. for token in self.tokens:
  583. if token.replacementType == 'integerid':
  584. stateFile = open(os.path.join(self._c.sampleDir, 'state.'+urllib.pathname2url(token.token)), 'w')
  585. stateFile.write(token.replacement)
  586. stateFile.close()
class Token:
    """Contains data and methods for replacing a token in a given sample"""
    token = None               # regex locating the token within an event
    replacementType = None     # e.g. 'static', 'timestamp', 'replaytimestamp', 'integerid'
    replacement = None         # replacement value or time format, per replacementType
    sample = None              # back-reference to the owning Sample
    # NOTE(review): class-level mutable default — all Token instances share this
    # dict until gen() assigns a per-event hash; looks intentional but verify
    mvhash = { }
    _now = None
    _replaytd = None
    _lastts = None
    _tokenre = None            # lazily compiled re pattern for self.token
    _tokenfile = None
    _tokents = None            # cache timestamp for earliest/latest parsing
    _earliestTime = None       # (config string, parsed datetime) cache pair
    _latestTime = None
    def __init__(self, sample):
        """Bind this token to its owning *sample* and initialize caches."""
        self.sample = sample
        # Logger already setup by config, just get an instance
        logger = logging.getLogger('eventgen')
        globals()['logger'] = logger
        self._now = datetime.datetime.now()
        self._earliestTime = (None, None)
        self._latestTime = (None, None)
  610. def __str__(self):
  611. """Only used for debugging, outputs a pretty printed representation of this token"""
  612. # Eliminate recursive going back to parent
  613. temp = dict([ (key, value) for (key, value) in self.__dict__.items() if key != 'sample' ])
  614. return pprint.pformat(temp)
  615. def __repr__(self):
  616. return self.__str__()
  617. def _match(self, event):
  618. """Executes regular expression match and returns the re.Match object"""
  619. if self._tokenre == None:
  620. self._tokenre = re.compile(self.token)
  621. return self._tokenre.match(event)
  622. def _search(self, event):
  623. """Executes regular expression search and returns the re.Match object"""
  624. if self._tokenre == None:
  625. self._tokenre = re.compile(self.token)
  626. return self._tokenre.search(event)
  627. def _finditer(self, event):
  628. """Executes regular expression finditer and returns the re.Match object"""
  629. if self._tokenre == None:
  630. self._tokenre = re.compile(self.token)
  631. return self._tokenre.finditer(event)
  632. def replace(self, event):
  633. """Replaces all instances of this token in provided event and returns event"""
  634. offset = 0
  635. tokenMatch = self._finditer(event)
  636. # logger.debug("Checking for match for token: '%s'" % (self.token))
  637. if tokenMatch:
  638. # 5/28/12 Changing logic to account for needing old to match
  639. # the right token we're actually replacing
  640. # This will call getReplacement for every match which is more
  641. # expensive, but necessary.
  642. # # Find old in case of error
  643. oldMatch = self._search(event)
  644. if oldMatch:
  645. # old = event[oldMatch.start(group):oldMatch.end(group)]
  646. group = 0 if len(oldMatch.groups()) == 0 else 1
  647. old = oldMatch.group(group)
  648. else:
  649. old = ""
  650. # logger.debug("Got match for token: '%s'" % (self.token))
  651. replacement = self._getReplacement(old)
  652. if replacement is not None:
  653. logger.debug("Replacement: '%s'" % replacement)
  654. ## Iterate matches
  655. for match in tokenMatch:
  656. # logger.debug("Match: %s" % (match))
  657. try:
  658. matchStart = match.start(1) + offset
  659. matchEnd = match.end(1) + offset
  660. startEvent = event[:matchStart]
  661. endEvent = event[matchEnd:]
  662. # In order to not break legacy which might replace the same timestamp
  663. # with the same value in multiple matches, here we'll include
  664. # ones that need to be replaced for every match
  665. if self.replacementType in ('replaytimestamp'):
  666. replacement = self._getReplacement(event[matchStart:matchEnd])
  667. offset += len(replacement) - len(match.group(1))
  668. except:
  669. matchStart = match.start(0) + offset
  670. matchEnd = match.end(0) + offset
  671. startEvent = event[:matchStart]
  672. endEvent = event[matchEnd:]
  673. # In order to not break legacy which might replace the same timestamp
  674. # with the same value in multiple matches, here we'll include
  675. # ones that need to be replaced for every match
  676. if self.replacementType in ('replaytimestamp'):
  677. replacement = self._getReplacement(event[matchStart:matchEnd])
  678. offset += len(replacement) - len(match.group(0))
  679. # logger.debug("matchStart %d matchEnd %d offset %d" % (matchStart, matchEnd, offset))
  680. event = startEvent + replacement + endEvent
  681. # Reset replay internal variables for this token
  682. self._replaytd = None
  683. self._lastts = None
  684. return event
  685. def _getReplacement(self, old=None, event=None):
  686. if self.replacementType == 'static':
  687. return self.replacement
  688. elif self.replacementType in ('timestamp', 'replaytimestamp'):
  689. if self.sample.earliest and self.sample.latest:
  690. # Optimizing for parsing times during mass event generation
  691. # Cache results to prevent calls to timeParser unless the value changes
  692. # Because every second, relative times could change, we can only cache
  693. # results for at maximum one second. This seems not very effective, but we're
  694. # we're generating thousands of events per second it optimizes quite a bit.
  695. if self._tokents == None:
  696. self._tokents = datetime.datetime.now()
  697. # If we've gone more than a second, invalidate results, calculate
  698. # earliest and latest and cache new values
  699. if datetime.datetime.now() - self._tokents > datetime.timedelta(seconds=1):
  700. # logger.debug("Token Time Cache invalidated, refreshing")
  701. self._tokents = datetime.datetime.now()
  702. earliestTime = timeParser(self.sample.earliest)
  703. latestTime = timeParser(self.sample.latest)
  704. self._earliestTime = (self.sample.earliest, earliestTime)
  705. self._latestTime = (self.sample.latest, latestTime)
  706. else:
  707. # If we match the text of the earliest and latest config value
  708. # return cached value
  709. if self.sample.earliest == self._earliestTime[0] \
  710. and self.sample.latest == self._latestTime[0]:
  711. # logger.debug("Updating time from cache")
  712. earliestTime = self._earliestTime[1]
  713. latestTime = self._latestTime[1]
  714. # Otherwise calculate and update the cache
  715. else:
  716. # logger.debug("Earliest and Latest Time Cache invalidated for times '%s' & '%s', refreshing" \
  717. # % (self.sample.earliest, self.sample.latest))
  718. earliestTime = timeParser(self.sample.earliest)
  719. self._earlestTime = (self.sample.earliest, earliestTime)
  720. latestTime = timeParser(self.sample.latest)
  721. self._latestTime = (self.sample.latest, latestTime)
  722. # Don't muck with time while we're backfilling
  723. # if self.sample.backfill != None and not self.sample._backfilldone:
  724. # earliestTime = timeParser(self.sample.earliest)
  725. # latestTime = timeParser(self.sample.latest)
  726. # else:
  727. # if datetime.datetime.now() - self._tokents > datetime.timedelta(seconds=1):
  728. # self._tokents = datetime.datetime.now()
  729. # earliestTime = timeParser(self.sample.earliest)
  730. # latestTime = timeParser(self.sample.latest)
  731. # self._earliestTime = earliestTime
  732. # self._latestTime = latestTime
  733. # else:
  734. # earliestTime = self._earliestTime
  735. # latestTime = self._latestTime
  736. if earliestTime and latestTime:
  737. if latestTime>=earliestTime:
  738. minDelta = 0
  739. ## Compute timeDelta as total_seconds
  740. td = latestTime - earliestTime
  741. maxDelta = timeDelta2secs(td)
  742. ## Get random timeDelta
  743. randomDelta = datetime.timedelta(seconds=random.randint(minDelta, maxDelta))
  744. ## Compute replacmentTime
  745. replacementTime = latestTime - randomDelta
  746. if self.replacementType == 'replaytimestamp':
  747. if old != None and len(old) > 0:
  748. # Determine type of timestamp to use for this token
  749. # We can either be a string with one strptime format
  750. # or we can be a json formatted list of strptime formats
  751. currentts = None
  752. try:
  753. strptimelist = json.loads(self.replacement)
  754. for currentformat in strptimelist:
  755. try:
  756. timeformat = currentformat
  757. if timeformat == "%s":
  758. ts = float(old) if len(old) < 10 else float(old) / (10**(len(old)-10))
  759. currentts = datetime.datetime.fromtimestamp(ts)
  760. else:
  761. currentts = datetime.datetime.strptime(old, timeformat)
  762. # logger.debug("Old '%s' Timeformat '%s' currentts '%s'" % (old, timeformat, currentts))
  763. if type(currentts) == datetime.datetime:
  764. break
  765. except ValueError:
  766. pass
  767. if type(currentts) != datetime.datetime:
  768. # Total fail
  769. logger.error("Can't find strptime format for this timestamp '%s' in the list of formats. Returning original value" % old)
  770. return old
  771. except ValueError:
  772. # Not JSON, try to read as text
  773. timeformat = self.replacement
  774. try:
  775. if timeformat == "%s":
  776. ts = float(old) if len(old) < 10 else float(old) / (10**(len(old)-10))
  777. currentts = datetime.datetime.fromtimestamp(ts)
  778. else:
  779. currentts = datetime.datetime.strptime(old, timeformat)
  780. # logger.debug("Timeformat '%s' currentts '%s'" % (timeformat, currentts))
  781. except ValueError:
  782. # Total fail
  783. logger.error("Can't match strptime format ('%s') to this timestamp '%s'. Returning original value" % (timeformat, old))
  784. return old
  785. # Can't parse as strptime, try JSON
  786. # Check to make sure we parsed a year
  787. if currentts.year == 1900:
  788. currentts = currentts.replace(year=datetime.datetime.now().year)
  789. # We should now know the timeformat and currentts associated with this event
  790. # If we're the first, save those values
  791. if self._replaytd == None:
  792. self._replaytd = replacementTime - currentts
  793. # logger.debug("replaytd %s" % self._replaytd)
  794. replacementTime = currentts + self._replaytd
  795. # Randomize time a bit between last event and this one
  796. # Note that we'll always end up shortening the time between
  797. # events because we don't know when the next timestamp is going to be
  798. if self.sample.bundlelines:
  799. if self._lastts == None:
  800. self._lastts = replacementTime
  801. oldtd = replacementTime - self._lastts
  802. randomsecs = random.randint(0, oldtd.seconds)
  803. if oldtd.seconds > 0:
  804. randommicrosecs = random.randint(0, 1000000)
  805. else:
  806. randommicrosecs = random.randint(0, oldtd.microseconds)
  807. randomtd = datetime.timedelta(seconds=randomsecs, microseconds=randommicrosecs)
  808. replacementTime -= randomtd
  809. else:
  810. randomtd = datetime.timedelta()
  811. self._lastts = replacementTime
  812. replacementTime = replacementTime.strftime(timeformat)
  813. # logger.debug("Old '%s' Timeformat '%s' currentts '%s' replacementTime '%s' replaytd '%s' randomtd '%s'" \
  814. # % (old, timeformat, currentts, replacementTime, self._replaytd, randomtd))
  815. else:
  816. logger.error("Could not find old value, needed for replaytimestamp")
  817. return old
  818. else:
  819. replacementTime = replacementTime.strftime(self.replacement)
  820. ## replacementTime == replacement for invalid strptime specifiers
  821. if replacementTime != self.replacement.replace('%', ''):
  822. return replacementTime
  823. else:
  824. logger.error("Invalid strptime specifier '%s' detected; will not replace" \
  825. % (self.replacement) )
  826. return old
  827. ## earliestTime/latestTime not proper
  828. else:
  829. logger.error("Earliest specifier '%s', value '%s' is greater than latest specifier '%s', value '%s' for sample '%s'; will not replace" \
  830. % (self.sample.earliest, earliestTime, self.sample.latest, latestTime, self.sample.name) )
  831. return old
  832. ## earliest/latest not proper
  833. else:
  834. logger.error('Earliest or latest specifier were not set; will not replace')
  835. return old
  836. elif self.replacementType in ('random', 'rated'):
  837. ## Validations:
  838. integerRE = re.compile('integer\[([-]?\d+):([-]?\d+)\]', re.I)
  839. integerMatch = integerRE.match(self.replacement)
  840. floatRE = re.compile('float\[(\d+)\.(\d+):(\d+)\.(\d+)\]', re.I)
  841. floatMatch = floatRE.match(self.replacement)
  842. stringRE = re.compi

Large files files are truncated, but you can click here to view the full file