/lib/eventgensamples.py
Python | 1149 lines | 1030 code | 53 blank | 66 comment | 143 complexity | 20da1c12ed45e8c54b653dd5cda477b2 MD5 | raw file
Possible License(s): Apache-2.0
Large files are truncated, but you can click here to view the full file
- from __future__ import division, with_statement
- import os, sys
- import logging
- import pprint
- import random
- import datetime
- import re
- import csv
- import json
- import copy
- from eventgenoutput import Output
- from timeparser import timeParser, timeDelta2secs
- import httplib2, urllib
- from xml.dom import minidom
- from xml.parsers.expat import ExpatError
class Sample:
    """Configuration and runtime state for one eventgen sample.

    The class-level attributes below all default to ``None`` and act as the
    declared set of settings for a sample; ``__init__`` and ``gen`` overwrite
    the ones they manage on the instance.
    """
    # Required fields for Sample
    name = None
    app = None
    filePath = None           # opened by gen() to read the sample data

    # Options which are all valid for a sample
    disabled = None
    spoolDir = None
    spoolFile = None
    breaker = None            # regular expression used by gen() to split the sample into events
    sampletype = None         # gen() handles 'raw' and 'csv'
    mode = None               # gen() handles 'sample' and 'replay'
    interval = None           # seconds; used by gen() for backfill windows and the sleep computation
    delay = None
    count = None              # number of events per pass; 0 in 'sample' mode means "the whole file"
    bundlelines = None
    earliest = None
    latest = None
    hourOfDayRate = None      # when a dict: keyed by str(hour), multiplies count
    dayOfWeekRate = None      # when a dict: keyed by str(weekday), multiplies count
    randomizeEvents = None
    randomizeCount = None
    outputMode = None
    fileName = None
    fileMaxBytes = None
    fileBackupFiles = None
    splunkHost = None
    splunkPort = None
    splunkMethod = None
    splunkUser = None
    splunkPass = None
    index = None
    source = None
    sourcetype = None
    host = None
    hostRegex = None
    hostToken = None
    tokens = None             # list of token objects; replaced by a fresh list in __init__
    projectID = None
    accessToken = None
    backfill = None           # time expression parsed with timeParser() in gen()
    backfillSearch = None
    backfillSearchUrl = None
    minuteOfHourRate = None   # when a dict: keyed by str(minute), multiplies count
    timeMultiple = None       # replay mode: multiplier applied to the gap between events when > 0

    # Internal fields
    _c = None                 # Config instance, created in __init__
    _out = None               # Output instance, created lazily on the first gen() call
    _sampleLines = None       # cached lines of the sample file
    _sampleDict = None        # cached csv rows (csv sampletype only)
    _lockedSettings = None
    _priority = None
    _origName = None
    _lastts = None            # replay mode: timestamp of the previously replayed event
    _backfillts = None        # current position of the backfill clock
    _origEarliest = None      # earliest/latest saved before backfill overrides them
    _origLatest = None
    _timeSinceSleep = None    # generation time accumulated since the last non-zero sleep
-
def __init__(self, name):
    """Create a sample called ``name`` with empty runtime state.

    Publishes the shared 'eventgen' logger as the module-level ``logger``
    and attaches a ``Config`` instance as ``self._c``.
    """
    # The logger itself is configured elsewhere; just fetch it and make it
    # visible to every function in this module.
    globals()['logger'] = logging.getLogger('eventgen')

    self.name = name
    self.tokens = []
    self._lockedSettings = []
    self._currentevent = 0
    self._rpevents = None
    self._backfilldone = False
    self._timeSinceSleep = datetime.timedelta()

    # Imported here rather than at module level, as in the original code.
    from eventgenconfig import Config
    self._c = Config()
-
def __str__(self):
    """Only used for debugging, outputs a pretty printed representation of this sample"""
    # Skip the Config back-reference so the dump does not recurse into the parent.
    visible = {}
    for attr, val in self.__dict__.items():
        if attr != '_c':
            visible[attr] = val
    return pprint.pformat(visible)
-
def __repr__(self):
    """Return the same pretty-printed dump as ``__str__``."""
    text = self.__str__()
    return text
-
def gen(self):
    """Generate one pass of events for this sample and send them to the output.

    Steps, in order:
      1. lazily create the Output object,
      2. initialise / advance the backfill clock,
      3. load (and cache) the sample file,
      4. pick the events for this pass (replay: a single event; otherwise
         ``count`` events scaled by the rate settings),
      5. run every token over each event and send it,
      6. compute how long the caller should wait.

    Returns:
        ``partialInterval`` (reported by the log message below as the number
        of seconds to sleep); ``0`` while backfilling or when replay has
        reached the last event; ``None`` (implicitly) when the sample has no
        lines.

    NOTE(review): the nesting of this method was reconstructed from a copy
    of the file whose indentation was lost -- confirm against the real file.
    """
    logger.debug("Generating sample '%s' in app '%s'" % (self.name, self.app))
    startTime = datetime.datetime.now()

    # If this is the first time we're generating, setup out
    if self._out == None:
        logger.debug("Setting up Output class for sample '%s' in app '%s'" % (self.name, self.app))
        self._out = Output(self)
        if self.backfillSearchUrl == None:
            self.backfillSearchUrl = self._out._splunkUrl
    # Setup initial backfillts
    if self._backfillts == None and self.backfill != None and not self._backfilldone:
        try:
            self._backfillts = timeParser(self.backfill)
            logger.info("Setting up backfill of %s (%s)" % (self.backfill,self._backfillts))
        except Exception as ex:
            logger.error("Failed to parse backfill '%s': %s" % (self.backfill, ex))
            raise
        # Remember the configured window; it is restored once backfill completes
        self._origEarliest = self.earliest
        self._origLatest = self.latest
        # Optionally ask Splunk for the newest matching event so backfill resumes from there
        if self._out._outputMode == "splunkstream" and self.backfillSearch != None:
            if not self.backfillSearch.startswith('search'):
                self.backfillSearch = 'search ' + self.backfillSearch
            self.backfillSearch += '| head 1 | table _time'
            # NOTE(review): the session key is written to the debug log and TLS
            # certificate validation is disabled on the request below.
            logger.debug("Searching Splunk URL '%s/services/search/jobs' with search '%s' with sessionKey '%s'" % (self.backfillSearchUrl, self.backfillSearch, self._out._c.sessionKey))
            results = httplib2.Http(disable_ssl_certificate_validation=True).request(\
                        self.backfillSearchUrl + '/services/search/jobs',
                        'POST', headers={'Authorization': 'Splunk %s' % self._out._c.sessionKey}, \
                        body=urllib.urlencode({'search': self.backfillSearch,
                                                'earliest_time': self.backfill,
                                                'exec_mode': 'oneshot'}))[1]
            try:
                temptime = minidom.parseString(results).getElementsByTagName('text')[0].childNodes[0].nodeValue
                # logger.debug("Time returned from backfill search: %s" % temptime)
                # Results returned look like: 2013-01-16T10:59:15.411-08:00
                # But the offset in time can also be +, so make sure we strip that out first
                if len(temptime) > 0:
                    if temptime.find('+') > 0:
                        temptime = temptime.split('+')[0]
                    # Keep only the first three '-'-separated parts, dropping a negative UTC offset
                    temptime = '-'.join(temptime.split('-')[0:3])
                self._backfillts = datetime.datetime.strptime(temptime, '%Y-%m-%dT%H:%M:%S.%f')
                logger.debug("Backfill search results: '%s' value: '%s' time: '%s'" % (pprint.pformat(results), temptime, self._backfillts))
            except (ExpatError, IndexError):
                # Unparseable or empty search result: keep the backfill time parsed above
                pass
    # Override earliest and latest during backfill until we're at current time
    if self.backfill != None and not self._backfilldone:
        if self._backfillts >= datetime.datetime.now():
            logger.info("Backfill complete")
            self._backfilldone = True
            self.earliest = self._origEarliest
            self.latest = self._origLatest
        else:
            logger.debug("Still backfilling for sample '%s'.  Currently at %s" % (self.name, self._backfillts))
            self.earliest = datetime.datetime.strftime((self._backfillts - datetime.timedelta(seconds=self.interval)), \
                                                        "%Y-%m-%d %H:%M:%S.%f")
            self.latest = datetime.datetime.strftime(self._backfillts, "%Y-%m-%d %H:%M:%S.%f")
            # if not self.mode == 'replay':
            #     self._backfillts += datetime.timedelta(seconds=self.interval)

    logger.debug("Opening sample '%s' in app '%s'" % (self.name, self.app) )
    # NOTE(review): the file is opened on every call, even when the cached lines
    # are used, and is only closed on the main path near the end of this method
    # (not on the early `return 0` or the "contains no data" path).
    sampleFH = open(self.filePath, 'rU')
    if self.sampletype == 'raw':
        # 5/27/12 CS Added caching of the sample file
        if self._sampleLines == None:
            logger.debug("Reading raw sample '%s' in app '%s'" % (self.name, self.app))
            sampleLines = sampleFH.readlines()
            self._sampleLines = sampleLines
        else:
            sampleLines = self._sampleLines
    elif self.sampletype == 'csv':
        logger.debug("Reading csv sample '%s' in app '%s'" % (self.name, self.app))
        if self._sampleLines == None:
            logger.debug("Reading csv sample '%s' in app '%s'" % (self.name, self.app))
            sampleDict = [ ]
            sampleLines = [ ]
            csvReader = csv.DictReader(sampleFH)
            for line in csvReader:
                sampleDict.append(line)
                sampleLines.append(line['_raw'].decode('string_escape'))
            self._sampleDict = copy.deepcopy(sampleDict)
            self._sampleLines = copy.deepcopy(sampleLines)
        else:
            # If we're set to bundlelines, we'll modify sampleLines regularly.
            # Since lists in python are referenced rather than copied, we
            # need to make a fresh copy every time if we're bundlelines.
            # If not, just used the cached copy, we won't mess with it.
            if not self.bundlelines:
                sampleDict = self._sampleDict
                sampleLines = self._sampleLines
            else:
                sampleDict = copy.deepcopy(self._sampleDict)
                sampleLines = copy.deepcopy(self._sampleLines)
    # Check to see if this is the first time we've run, or if we're at the end of the file
    # and we're running replay.  If so, we need to parse the whole file and/or setup our counters
    if self._rpevents == None and self.mode == 'replay':
        if self.sampletype == 'csv':
            self._rpevents = sampleDict
        else:
            if self.breaker != self._c.breaker:
                # Custom breaker: split the joined text into events at each breaker match
                self._rpevents = []
                lines = '\n'.join(sampleLines)
                breaker = re.search(self.breaker, lines)
                currentchar = 0
                while breaker:
                    self._rpevents.append(lines[currentchar:breaker.start(0)])
                    lines = lines[breaker.end(0):]
                    currentchar += breaker.start(0)
                    breaker = re.search(self.breaker, lines)
            else:
                self._rpevents = sampleLines
        self._currentevent = 0

    # If we are replaying then we need to set the current sampleLines to the event
    # we're currently on
    if self.mode == 'replay':
        if self.sampletype == 'csv':
            sampleDict = [ self._rpevents[self._currentevent] ]
            sampleLines = [ self._rpevents[self._currentevent]['_raw'].decode('string_escape') ]
        else:
            sampleLines = [ self._rpevents[self._currentevent] ]
        self._currentevent += 1
        # If we roll over the max number of lines, roll over the counter and start over
        if self._currentevent >= len(self._rpevents):
            logger.debug("At end of the sample file, starting replay from the top")
            self._currentevent = 0
            self._lastts = None
    # Ensure all lines have a newline
    for i in xrange(0, len(sampleLines)):
        if sampleLines[i][-1] != '\n':
            sampleLines[i] += '\n'
    # If we've set bundlelines, then we want count copies of all of the lines in the file
    # And we'll set breaker to be a weird delimiter so that we'll end up with an events
    # array that can be rated by the hour of day and day of week rates
    # This is only for weird outside use cases like when we want to include a CSV file as the source
    # so we can't set breaker properly
    if self.bundlelines:
        logger.debug("Bundlelines set.  Creating %s copies of original sample lines and setting breaker." % (self.count-1))
        self.breaker = '\n------\n'
        origSampleLines = copy.deepcopy(sampleLines)
        sampleLines.append(self.breaker)
        for i in range(0, self.count-1):
            sampleLines.extend(origSampleLines)
            sampleLines.append(self.breaker)

    if len(sampleLines) > 0:
        count = self.count
        if self.count == 0 and self.mode == 'sample':
            logger.debug("Count %s specified as default for sample '%s' in app '%s'; adjusting count to sample length %s; using default breaker" \
                            % (self.count, self.name, self.app, len(sampleLines)) )
            count = len(sampleLines)
            self.breaker = self._c.breaker
        elif self.count > 0 or self.mode == 'replay':

            # 5/8/12 CS We've requested not the whole file, so we should adjust count based on
            # hourOfDay, dayOfWeek and randomizeCount configs
            rateFactor = 1.0
            if self.randomizeCount != 0 and self.randomizeCount != None:
                try:
                    logger.debug("randomizeCount for sample '%s' in app '%s' is %s" \
                                    % (self.name, self.app, self.randomizeCount))
                    # If we say we're going to be 20% variable, then that means we
                    # can be .1% high or .1% low.  Math below does that.
                    randBound = round(self.randomizeCount * 1000, 0)
                    rand = random.randint(0, randBound)
                    randFactor = 1+((-((randBound / 2) - rand)) / 1000)
                    logger.debug("randFactor for sample '%s' in app '%s' is %s" \
                                    % (self.name, self.app, randFactor))
                    rateFactor *= randFactor
                except:
                    import traceback
                    stack =  traceback.format_exc()
                    logger.error("Randomize count failed.  Stacktrace %s" % stack)
            if type(self.hourOfDayRate) == dict:
                try:
                    # While backfilling, rates follow the backfill clock instead of wall-clock time
                    if self.backfill != None and not self._backfilldone:
                        now = self._backfillts
                    else:
                        now = datetime.datetime.now()
                    rate = self.hourOfDayRate[str(now.hour)]
                    logger.debug("hourOfDayRate for sample '%s' in app '%s' is %s" % (self.name, self.app, rate))
                    rateFactor *= rate
                except KeyError:
                    import traceback
                    stack =  traceback.format_exc()
                    logger.error("Hour of day rate failed.  Stacktrace %s" % stack)
            if type(self.dayOfWeekRate) == dict:
                try:
                    if self.backfill != None and not self._backfilldone:
                        now = self._backfillts
                    else:
                        now = datetime.datetime.now()
                    # date.weekday() is Monday=0..Sunday=6; remap to Sunday=0..Saturday=6
                    weekday = datetime.date.weekday(now)
                    if weekday == 6:
                        weekday = 0
                    else:
                        weekday += 1
                    rate = self.dayOfWeekRate[str(weekday)]
                    logger.debug("dayOfWeekRate for sample '%s' in app '%s' is %s" % (self.name, self.app, rate))
                    rateFactor *= rate
                except KeyError:
                    import traceback
                    stack =  traceback.format_exc()
                    # NOTE(review): message says "Hour of day" but this is the day-of-week handler
                    logger.error("Hour of day rate failed.  Stacktrace %s" % stack)
            if type(self.minuteOfHourRate) == dict:
                try:
                    if self.backfill != None and not self._backfilldone:
                        now = self._backfillts
                    else:
                        now = datetime.datetime.now()
                    rate = self.minuteOfHourRate[str(now.minute)]
                    logger.debug("minuteOfHourRate for sample '%s' in app '%s' is %s" % (self.name, self.app, rate))
                    rateFactor *= rate
                except KeyError:
                    import traceback
                    stack =  traceback.format_exc()
                    logger.error("Minute of hour rate failed.  Stacktrace %s" % stack)
            count = int(round(count * rateFactor, 0))
            if rateFactor != 1.0:
                logger.info("Original count: %s Rated count: %s Rate factor: %s" % (self.count, count, rateFactor))
        try:
            breakerRE = re.compile(self.breaker)
        except:
            # breakerRE stays unbound here, but resetting the breaker to the default
            # selects the simple fill branch below, which never uses it
            logger.error("Line breaker '%s' for sample '%s' in app '%s' could not be compiled; using default breaker" \
                        % (self.breaker, self.name, self.app) )
            self.breaker = self._c.breaker
        events = []
        event = ''
        if self.breaker == self._c.breaker:
            logger.debug("Default breaker detected for sample '%s' in app '%s'; using simple event fill" \
                            % (self.name, self.app) )
            logger.debug("Filling events array for sample '%s' in app '%s'; count=%s, sampleLines=%s" \
                            % (self.name, self.app, count, len(sampleLines)) )
            # 5/8/12 CS Added randomizeEvents config to randomize items from the file
            # 5/27/12 CS Don't randomize unless we're raw
            try:
                # 7/30/12 CS Can't remember why I wouldn't allow randomize Events for CSV so commenting
                # this out and seeing what breaks
                #if self.randomizeEvents and self.sampletype == 'raw':
                if self.randomizeEvents:
                    logger.debug("Shuffling events for sample '%s' in app '%s'" \
                                    % (self.name, self.app))
                    # NOTE(review): shuffles in place; when sampleLines is the cached
                    # list this reorders the cache as well
                    random.shuffle(sampleLines)
            except:
                logger.error("randomizeEvents for sample '%s' in app '%s' unparseable." \
                                % (self.name, self.app))

            if count >= len(sampleLines):
                events = sampleLines
            else:
                events = sampleLines[0:count]
        else:
            logger.debug("Non-default breaker '%s' detected for sample '%s' in app '%s'; using advanced event fill" \
                            % (self.breaker, self.name, self.app) )
            ## Fill events array from breaker and sampleLines
            breakersFound = 0
            x = 0
            logger.debug("Filling events array for sample '%s' in app '%s'; count=%s, sampleLines=%s" \
                            % (self.name, self.app, count, len(sampleLines)) )
            while len(events) < count and x < len(sampleLines):
                #logger.debug("Attempting to match regular expression '%s' with line '%s' for sample '%s' in app '%s'" % (breaker, sampleLines[x], sample, app) )
                breakerMatch = breakerRE.search(sampleLines[x])
                if breakerMatch:
                    #logger.debug("Match found for regular expression '%s' and line '%s' for sample '%s' in app '%s'" % (breaker, sampleLines[x], sample, app) )
                    ## If not first
                    # 5/28/12 CS This may cause a regression defect, but I can't figure out why
                    # you'd want to ignore the first breaker you find.  It's certainly breaking
                    # my current use case.
                    # 6/25/12 CS Definitely caused a regression defect.  I'm going to add
                    # a check for bundlelines which is where I need this to work every time
                    if breakersFound != 0 or self.bundlelines:
                        events.append(event)
                        event = ''
                    breakersFound += 1
                # else:
                #     logger.debug("Match not found for regular expression '%s' and line '%s' for sample '%s' in app '%s'" % (breaker, sampleLines[x], sample, app) )
                # If we've inserted the breaker with bundlelines, don't insert the line, otherwise insert
                if not (self.bundlelines and breakerMatch):
                    event += sampleLines[x]
                x += 1
            ## If events < count append remaining data in samples
            if len(events) < count:
                events.append(event + '\n')
            ## If breaker wasn't found in sample
            ## events = sample
            if breakersFound == 0:
                logger.warn("Breaker '%s' not found for sample '%s' in app '%s'; using default breaker" % (self.breaker, self.name, self.app) )
                if count >= len(sampleLines):
                    events = sampleLines
                else:
                    events = sampleLines[0:count]
            else:
                logger.debug("Found '%s' breakers for sample '%s' in app '%s'" % (breakersFound, self.name, self.app) )
        ## Continue to fill events array until len(events) == count
        if len(events) > 0 and len(events) < count:
            logger.debug("Events fill for sample '%s' in app '%s' less than count (%s vs. %s); continuing fill" % (self.name, self.app, len(events), count) )
            # Cycle through a snapshot of the events gathered so far until count is reached
            tempEvents = events[:]
            while len(events) < count:
                y = 0
                while len(events) < count and y < len(tempEvents):
                    events.append(tempEvents[y])
                    y += 1
        # logger.debug("events: %s" % pprint.pformat(events))
        logger.debug("Replacing %s tokens in %s events for sample '%s' in app '%s'" % (len(self.tokens), len(events), self.name, self.app))

        if self.sampletype == 'csv':
            # Start from the metadata of the first csv row; later rows may override it
            self.index = sampleDict[0]['index']
            self.host = sampleDict[0]['host']
            self.source = sampleDict[0]['source']
            self.sourcetype = sampleDict[0]['sourcetype']
            logger.debug("Sampletype CSV.  Setting self._out to CSV parameters. index: '%s' host: '%s' source: '%s' sourcetype: '%s'" \
                        % (self.index, self.host, self.source, self.sourcetype))
            self._out.refreshconfig(self)

        # Find interval before we muck with the event but after we've done event breaking
        if self.mode == 'replay':
            logger.debug("Finding timestamp to compute interval for events")
            if self._lastts == None:
                if self.sampletype == 'csv':
                    self._lastts = self._getTSFromEvent(self._rpevents[self._currentevent]['_raw'])
                else:
                    self._lastts = self._getTSFromEvent(self._rpevents[self._currentevent])
            if (self._currentevent+1) < len(self._rpevents):
                if self.sampletype == 'csv':
                    nextts = self._getTSFromEvent(self._rpevents[self._currentevent+1]['_raw'])
                else:
                    nextts = self._getTSFromEvent(self._rpevents[self._currentevent+1])
            else:
                # NOTE(review): returns before the events are sent and before
                # sampleFH is closed -- confirm this is intended.
                logger.debug("At end of _rpevents")
                return 0
            logger.debug('Computing timeDiff nextts: "%s" lastts: "%s"' % (nextts, self._lastts))
            timeDiff = nextts - self._lastts
            # NOTE(review): only .seconds/.microseconds are used, so whole days in
            # the gap are dropped from partialInterval.
            if timeDiff.days >= 0 and timeDiff.seconds >= 0 and timeDiff.microseconds >= 0:
                partialInterval = float("%d.%06d" % (timeDiff.seconds, timeDiff.microseconds))
            else:
                partialInterval = 0
            if self.timeMultiple > 0:
                partialInterval *= self.timeMultiple
            logger.debug("Setting partialInterval for replay mode with timeMultiple %s: %s %s" % (self.timeMultiple, timeDiff, partialInterval))
            self._lastts = nextts
        ## Iterate events
        for x in range(0, len(events)):
            event = events[x]
            # Maintain state for every token in a given event
            # Hash contains keys for each file name which is assigned a list of values
            # picked from a random line in that file
            mvhash = { }
            ## Iterate tokens
            for token in self.tokens:
                token.mvhash = mvhash
                event = token.replace(event)
            if(self.hostToken):
                # clear the host mvhash every time, because we need to re-randomize it
                self.hostToken.mvhash =  {}
            # Hack for bundle lines to work with sampletype csv
            # Basically, bundlelines allows us to create copies of a bundled set of
            # of events as one event, and this splits those back out so that we properly
            # send each line with the proper sourcetype and source if we're we're sampletype csv
            if self.bundlelines and self.sampletype == 'csv':
                # Trim last newline so we don't end up with blank at end of the array
                if event[-1] == '\n':
                    event = event[:-1]
                lines = event.split('\n')
                logger.debug("Bundlelines set and sampletype csv, breaking event back apart.  %s lines." % (len(lines)))
                for lineno in range(0, len(lines)):
                    if self.sampletype == 'csv' and (sampleDict[lineno]['index'] != self.index or \
                                                     sampleDict[lineno]['host'] != self.host or \
                                                     sampleDict[lineno]['source'] != self.source or \
                                                     sampleDict[lineno]['sourcetype'] != self.sourcetype):
                        # Flush events before we change all the various parameters
                        logger.debug("Sampletype CSV with bundlelines, parameters changed at event %s.  Flushing output." % lineno)
                        self._out.flush()
                        self.index = sampleDict[lineno]['index']
                        self.host = sampleDict[lineno]['host']
                        # Allow randomizing the host:
                        if(self.hostToken):
                            self.host = self.hostToken.replace(self.host)
                        self.source = sampleDict[lineno]['source']
                        self.sourcetype = sampleDict[lineno]['sourcetype']
                        logger.debug("Sampletype CSV.  Setting self._out to CSV parameters. index: '%s' host: '%s' source: '%s' sourcetype: '%s'" \
                                    % (self.index, self.host, self.source, self.sourcetype))
                        self._out.refreshconfig(self)
                    self._out.send(lines[lineno])
                logger.debug("Completed bundlelines event.  Flushing.")
                self._out.flush()
            else:
                # logger.debug("Sample Index: %s Host: %s Source: %s Sourcetype: %s" % (self.index, self.host, self.source, self.sourcetype))
                # logger.debug("Event Index: %s Host: %s Source: %s Sourcetype: %s" % (sampleDict[x]['index'], sampleDict[x]['host'], sampleDict[x]['source'], sampleDict[x]['sourcetype']))
                if self.sampletype == 'csv' and (sampleDict[x]['index'] != self.index or \
                                                 sampleDict[x]['host'] != self.host or \
                                                 sampleDict[x]['source'] != self.source or \
                                                 sampleDict[x]['sourcetype'] != self.sourcetype):
                    # Flush events before we change all the various parameters
                    logger.debug("Sampletype CSV, parameters changed at event %s.  Flushing output." % x)
                    self._out.flush()
                    self.index = sampleDict[x]['index']
                    self.host = sampleDict[x]['host']
                    # Allow randomizing the host:
                    if(self.hostToken):
                        self.host = self.hostToken.replace(self.host)
                    self.source = sampleDict[x]['source']
                    self.sourcetype = sampleDict[x]['sourcetype']
                    logger.debug("Sampletype CSV.  Setting self._out to CSV parameters. index: '%s' host: '%s' source: '%s' sourcetype: '%s'" \
                                % (self.index, self.host, self.source, self.sourcetype))
                    self._out.refreshconfig(self)
                self._out.send(event)
        ## Close file handles
        self._out.flush()
        sampleFH.close()
        endTime = datetime.datetime.now()
        timeDiff = endTime - startTime
        # NOTE(review): partialInterval is only assigned for mode 'sample' (here) and
        # 'replay' (above); any other mode would hit an unbound name below.
        if self.mode == 'sample':
            # timeDiffSecs = timeDelta2secs(timeDiff)
            timeDiffSecs = float("%d.%06d" % (timeDiff.seconds, timeDiff.microseconds))
            wholeIntervals = timeDiffSecs / self.interval
            partialInterval = timeDiffSecs % self.interval
            if wholeIntervals > 1:
                logger.warn("Generation of sample '%s' in app '%s' took longer than interval (%s seconds vs. %s seconds); consider adjusting interval" \
                            % (self.name, self.app, timeDiff, self.interval) )
            # Sleep for whatever is left of the current interval
            partialInterval = self.interval - partialInterval

        # No rest for the wicked!  Or while we're doing backfill
        if self.backfill != None and not self._backfilldone:
            # Since we would be sleeping, increment the timestamp by the amount of time we're sleeping
            incsecs = round(partialInterval / 1, 0)
            # NOTE(review): this is the fractional part in seconds, yet it is passed as
            # `microseconds=` below -- looks like a units mismatch; confirm.
            incmicrosecs = partialInterval % 1
            self._backfillts += datetime.timedelta(seconds=incsecs, microseconds=incmicrosecs)
            partialInterval = 0
        self._timeSinceSleep += timeDiff
        if partialInterval > 0:
            timeDiffFrac = "%d.%06d" % (self._timeSinceSleep.seconds, self._timeSinceSleep.microseconds)
            logger.info("Generation of sample '%s' in app '%s' completed in %s seconds.  Sleeping for %f seconds" \
                        % (self.name, self.app, timeDiffFrac, partialInterval) )
            self._timeSinceSleep = datetime.timedelta()
        return partialInterval
    else:
        # NOTE(review): sampleFH is left open on this path and None is returned implicitly
        logger.warn("Sample '%s' in app '%s' contains no data" % (self.name, self.app) )
-
- ## Replaces $SPLUNK_HOME w/ correct pathing
def pathParser(self, path):
    """Expand ``$SPLUNK_HOME`` and environment-variable segments in ``path``.

    Args:
        path: A path string; Windows-style backslashes are accepted.

    Returns:
        The normalised path. When it starts with one of the shared Splunk
        storage prefixes, ``$SPLUNK_HOME`` is replaced by the directory two
        levels above ``self._c.grandparentdir``. Afterwards every path
        segment whose name (ignoring leading ``$``) is an environment
        variable is replaced by that variable's value.
    """
    splunkHome = os.path.dirname(os.path.dirname(self._c.grandparentdir))
    sharedStorage = ['$SPLUNK_HOME/etc/apps', '$SPLUNK_HOME/etc/users/', '$SPLUNK_HOME/var/run/splunk']

    # Turn Windows separators into '/', then normalise to the local os.sep
    path = os.path.normpath(path.replace('\\', '/'))

    for sharedPath in sharedStorage:
        if path.startswith(os.path.normpath(sharedPath)):
            # str.replace returns a new string; the result used to be thrown
            # away, so $SPLUNK_HOME was never actually substituted here.
            path = path.replace('$SPLUNK_HOME', splunkHome)
            break

    segments = path.split(os.sep)
    for position, rawSegment in enumerate(segments):
        segment = rawSegment.lstrip('$')
        # NOTE(review): a segment matches even without a leading '$', so a
        # directory literally named like an environment variable is replaced
        # too. Kept as-is for backward compatibility.
        if segment in os.environ:
            segments[position] = os.environ[segment]
    return os.sep.join(segments)
def _getTSFromEvent(self, event):
    """Return the timestamp of ``event`` using the first token that parses.

    Each token's pattern is searched in ``event``; the matched text (group 1
    when the pattern has groups, otherwise the whole match) is parsed with
    the token's ``replacement`` as a strptime format, or as an epoch value
    when that format is ``"%s"``.

    Raises:
        ValueError: when no token yields a timestamp.
    """
    parsedTime = None
    triedPatterns = []
    # JB: 2012/11/20 - Can we optimize this by only testing tokens of type = *timestamp?
    # JB: 2012/11/20 - Alternatively, documentation should suggest putting timestamp as token.0.
    for token in self.tokens:
        try:
            triedPatterns.append(token.token)
            found = token._search(event)
            if not found:
                continue
            timeFormat = token.replacement
            timeString = found.group(1 if len(found.groups()) != 0 else 0)
            if timeFormat != "%s":
                parsedTime = datetime.datetime.strptime(timeString, timeFormat)
            else:
                # Epoch values longer than 10 digits carry sub-second digits; scale them down
                if len(timeString) < 10:
                    ts = float(timeString)
                else:
                    ts = float(timeString) / (10**(len(timeString)-10))
                parsedTime = datetime.datetime.fromtimestamp(ts)
            logger.debug("Match '%s' Format '%s' result: '%s'" % (timeString, timeFormat, parsedTime))
            if type(parsedTime) == datetime.datetime:
                break
        except ValueError:
            logger.debug("Match found ('%s') but time parse failed. Timeformat '%s' Event '%s'" % (timeString, timeFormat, event))

    if type(parsedTime) != datetime.datetime:
        # Nothing parsed: report every pattern that was tried
        failure = "Can't find a timestamp (using patterns '%s') in this event: '%s'." % (triedPatterns, event)
        logger.error(failure)
        raise ValueError(failure)

    # strptime yields year 1900 when the format has no year; assume the current year
    if parsedTime.year == 1900:
        parsedTime = parsedTime.replace(year=datetime.datetime.now().year)
    return parsedTime
-
def saveState(self):
    """Saves state of all integer IDs of this sample to a file so when we restart we'll pick them up

    For every token whose ``replacementType`` is ``'integerid'`` the current
    ``replacement`` value is written to ``state.<url-quoted token>`` inside
    ``self._c.sampleDir``.
    """
    for token in self.tokens:
        if token.replacementType != 'integerid':
            continue
        statePath = os.path.join(self._c.sampleDir, 'state.'+urllib.pathname2url(token.token))
        # The context manager releases the handle even if write() raises;
        # previously the file stayed open in that case.
        with open(statePath, 'w') as stateFile:
            stateFile.write(token.replacement)
-
class Token:
    """Contains data and methods for replacing a token in a given sample"""
    token = None              # regular-expression source; compiled lazily into _tokenre
    replacementType = None    # e.g. 'static', 'timestamp', 'replaytimestamp', 'integerid'
    replacement = None        # static value or time format, depending on replacementType
    sample = None             # owning sample, set in __init__
    # NOTE(review): class-level dict shared by all instances until an instance
    # attribute is assigned; Sample.gen assigns a fresh dict per event.
    mvhash = { }

    _now = None               # creation time, set in __init__
    _replaytd = None          # reset to None at the end of replace()
    _lastts = None            # reset to None at the end of replace()
    _tokenre = None           # cached compiled pattern for `token`
    _tokenfile = None
    _tokents = None           # time the earliest/latest cache was last refreshed
    _earliestTime = None      # (config text, parsed time) cache pair
    _latestTime = None        # (config text, parsed time) cache pair
-
def __init__(self, sample):
    """Bind this token to ``sample`` and initialise its time caches.

    Also publishes the shared 'eventgen' logger as the module-level
    ``logger``.
    """
    # The logger itself is configured elsewhere; just fetch it and expose it.
    globals()['logger'] = logging.getLogger('eventgen')

    self.sample = sample
    self._now = datetime.datetime.now()
    # (config text, parsed time) pairs; empty until the first lookup
    emptyCache = (None, None)
    self._earliestTime = emptyCache
    self._latestTime = emptyCache
-
def __str__(self):
    """Only used for debugging, outputs a pretty printed representation of this token"""
    # Skip the back-reference to the owning sample so the dump does not recurse.
    visible = {}
    for attr, val in self.__dict__.items():
        if attr != 'sample':
            visible[attr] = val
    return pprint.pformat(visible)
def __repr__(self):
    """Return the same pretty-printed dump as ``__str__``."""
    text = self.__str__()
    return text
-
def _match(self, event):
    """Match the token pattern at the start of ``event``; return the match object or None."""
    pattern = self._tokenre
    if pattern is None:
        # Compile once and keep the pattern for later calls
        pattern = self._tokenre = re.compile(self.token)
    return pattern.match(event)
-
def _search(self, event):
    """Search ``event`` for the token pattern; return the first match object or None."""
    pattern = self._tokenre
    if pattern is None:
        # Compile once and keep the pattern for later calls
        pattern = self._tokenre = re.compile(self.token)
    return pattern.search(event)
-
def _finditer(self, event):
    """Return an iterator over every match of the token pattern in ``event``."""
    pattern = self._tokenre
    if pattern is None:
        # Compile once and keep the pattern for later calls
        pattern = self._tokenre = re.compile(self.token)
    return pattern.finditer(event)
-
def replace(self, event):
    """Replaces all instances of this token in provided event and returns event

    The replacement value is resolved once from the first occurrence and
    reused for every match. Only ``'replaytimestamp'`` tokens are resolved
    again for each individual match, because their value depends on the
    text being replaced.

    For each match, capture group 1 is replaced when the pattern defines it
    and it took part in the match; otherwise the whole match is replaced.

    Args:
        event: The event text to process.

    Returns:
        The event text with every occurrence replaced, or the unchanged
        text when ``_getReplacement`` returns ``None``.
    """
    # The first occurrence supplies the "old" text handed to _getReplacement
    oldMatch = self._search(event)
    if oldMatch:
        old = oldMatch.group(0 if len(oldMatch.groups()) == 0 else 1)
    else:
        old = ""

    replacement = self._getReplacement(old)

    if replacement is not None:
        logger.debug("Replacement: '%s'" % replacement)
        # This used to be `in ('replaytimestamp')`, which is a substring test
        # against a plain string and was therefore also true for 'timestamp'.
        perMatch = self.replacementType == 'replaytimestamp'
        # Matches are located in the original text; `offset` tracks how far
        # earlier replacements have shifted the positions.
        offset = 0
        for match in self._finditer(event):
            # Chosen explicitly instead of relying on a bare `except:` to
            # detect a missing or non-participating group 1.
            useGroup = 1 if match.re.groups >= 1 and match.group(1) is not None else 0
            matchStart = match.start(useGroup) + offset
            matchEnd = match.end(useGroup) + offset
            if perMatch:
                replacement = self._getReplacement(event[matchStart:matchEnd])
                if replacement is None:
                    # Nothing to substitute for this occurrence; leave it alone
                    continue
            offset += len(replacement) - len(match.group(useGroup))
            event = event[:matchStart] + replacement + event[matchEnd:]

        # Reset replay internal variables for this token
        self._replaytd = None
        self._lastts = None
    return event
-
- def _getReplacement(self, old=None, event=None):
- if self.replacementType == 'static':
- return self.replacement
- elif self.replacementType in ('timestamp', 'replaytimestamp'):
- if self.sample.earliest and self.sample.latest:
- # Optimizing for parsing times during mass event generation
- # Cache results to prevent calls to timeParser unless the value changes
- # Because every second, relative times could change, we can only cache
- # results for at maximum one second. This seems not very effective, but we're
- # we're generating thousands of events per second it optimizes quite a bit.
- if self._tokents == None:
- self._tokents = datetime.datetime.now()
- # If we've gone more than a second, invalidate results, calculate
- # earliest and latest and cache new values
- if datetime.datetime.now() - self._tokents > datetime.timedelta(seconds=1):
- # logger.debug("Token Time Cache invalidated, refreshing")
- self._tokents = datetime.datetime.now()
- earliestTime = timeParser(self.sample.earliest)
- latestTime = timeParser(self.sample.latest)
- self._earliestTime = (self.sample.earliest, earliestTime)
- self._latestTime = (self.sample.latest, latestTime)
- else:
- # If we match the text of the earliest and latest config value
- # return cached value
- if self.sample.earliest == self._earliestTime[0] \
- and self.sample.latest == self._latestTime[0]:
- # logger.debug("Updating time from cache")
- earliestTime = self._earliestTime[1]
- latestTime = self._latestTime[1]
- # Otherwise calculate and update the cache
- else:
- # logger.debug("Earliest and Latest Time Cache invalidated for times '%s' & '%s', refreshing" \
- # % (self.sample.earliest, self.sample.latest))
- earliestTime = timeParser(self.sample.earliest)
- self._earlestTime = (self.sample.earliest, earliestTime)
- latestTime = timeParser(self.sample.latest)
- self._latestTime = (self.sample.latest, latestTime)
- # Don't muck with time while we're backfilling
- # if self.sample.backfill != None and not self.sample._backfilldone:
- # earliestTime = timeParser(self.sample.earliest)
- # latestTime = timeParser(self.sample.latest)
- # else:
- # if datetime.datetime.now() - self._tokents > datetime.timedelta(seconds=1):
- # self._tokents = datetime.datetime.now()
- # earliestTime = timeParser(self.sample.earliest)
- # latestTime = timeParser(self.sample.latest)
- # self._earliestTime = earliestTime
- # self._latestTime = latestTime
- # else:
- # earliestTime = self._earliestTime
- # latestTime = self._latestTime
- if earliestTime and latestTime:
- if latestTime>=earliestTime:
- minDelta = 0
- ## Compute timeDelta as total_seconds
- td = latestTime - earliestTime
- maxDelta = timeDelta2secs(td)
- ## Get random timeDelta
- randomDelta = datetime.timedelta(seconds=random.randint(minDelta, maxDelta))
- ## Compute replacementTime
- replacementTime = latestTime - randomDelta
-
- if self.replacementType == 'replaytimestamp':
- if old != None and len(old) > 0:
- # Determine type of timestamp to use for this token
- # We can either be a string with one strptime format
- # or we can be a json formatted list of strptime formats
- currentts = None
- try:
- strptimelist = json.loads(self.replacement)
- for currentformat in strptimelist:
- try:
- timeformat = currentformat
- if timeformat == "%s":
- ts = float(old) if len(old) < 10 else float(old) / (10**(len(old)-10))
- currentts = datetime.datetime.fromtimestamp(ts)
- else:
- currentts = datetime.datetime.strptime(old, timeformat)
- # logger.debug("Old '%s' Timeformat '%s' currentts '%s'" % (old, timeformat, currentts))
- if type(currentts) == datetime.datetime:
- break
- except ValueError:
- pass
- if type(currentts) != datetime.datetime:
- # Total fail
- logger.error("Can't find strptime format for this timestamp '%s' in the list of formats. Returning original value" % old)
- return old
- except ValueError:
- # Not JSON, try to read as text
- timeformat = self.replacement
- try:
- if timeformat == "%s":
- ts = float(old) if len(old) < 10 else float(old) / (10**(len(old)-10))
- currentts = datetime.datetime.fromtimestamp(ts)
- else:
- currentts = datetime.datetime.strptime(old, timeformat)
- # logger.debug("Timeformat '%s' currentts '%s'" % (timeformat, currentts))
- except ValueError:
- # Total fail
- logger.error("Can't match strptime format ('%s') to this timestamp '%s'. Returning original value" % (timeformat, old))
- return old
-
- # Can't parse as strptime, try JSON
-
- # Check to make sure we parsed a year
- if currentts.year == 1900:
- currentts = currentts.replace(year=datetime.datetime.now().year)
- # We should now know the timeformat and currentts associated with this event
- # If we're the first, save those values
- if self._replaytd == None:
- self._replaytd = replacementTime - currentts
-
- # logger.debug("replaytd %s" % self._replaytd)
- replacementTime = currentts + self._replaytd
-
- # Randomize time a bit between last event and this one
- # Note that we'll always end up shortening the time between
- # events because we don't know when the next timestamp is going to be
- if self.sample.bundlelines:
- if self._lastts == None:
- self._lastts = replacementTime
- oldtd = replacementTime - self._lastts
- randomsecs = random.randint(0, oldtd.seconds)
- if oldtd.seconds > 0:
- randommicrosecs = random.randint(0, 1000000)
- else:
- randommicrosecs = random.randint(0, oldtd.microseconds)
- randomtd = datetime.timedelta(seconds=randomsecs, microseconds=randommicrosecs)
- replacementTime -= randomtd
- else:
- randomtd = datetime.timedelta()
- self._lastts = replacementTime
- replacementTime = replacementTime.strftime(timeformat)
- # logger.debug("Old '%s' Timeformat '%s' currentts '%s' replacementTime '%s' replaytd '%s' randomtd '%s'" \
- # % (old, timeformat, currentts, replacementTime, self._replaytd, randomtd))
- else:
- logger.error("Could not find old value, needed for replaytimestamp")
- return old
- else:
- replacementTime = replacementTime.strftime(self.replacement)
- ## replacementTime == replacement for invalid strptime specifiers
- if replacementTime != self.replacement.replace('%', ''):
- return replacementTime
- else:
- logger.error("Invalid strptime specifier '%s' detected; will not replace" \
- % (self.replacement) )
- return old
- ## earliestTime/latestTime not proper
- else:
- logger.error("Earliest specifier '%s', value '%s' is greater than latest specifier '%s', value '%s' for sample '%s'; will not replace" \
- % (self.sample.earliest, earliestTime, self.sample.latest, latestTime, self.sample.name) )
- return old
- ## earliest/latest not proper
- else:
- logger.error('Earliest or latest specifier were not set; will not replace')
- return old
- elif self.replacementType in ('random', 'rated'):
- ## Validations:
- integerRE = re.compile('integer\[([-]?\d+):([-]?\d+)\]', re.I)
- integerMatch = integerRE.match(self.replacement)
-
- floatRE = re.compile('float\[(\d+)\.(\d+):(\d+)\.(\d+)\]', re.I)
- floatMatch = floatRE.match(self.replacement)
- stringRE = re.compi…
Large files are truncated, but you can click here to view the full file