/bin/percentiles
Python | 206 lines | 202 code | 3 blank | 1 comment | 0 complexity | 4e6043e6f5860a80cd72f1e299ca6004 MD5 | raw file
Possible License(s): WTFPL
- #!/usr/bin/env python
- import argparse
- from collections import deque, namedtuple
- import re
- import sys
- import textwrap
- import time
# One input sample: `when` is the sample's timestamp, `value` its measurement.
DataPoint = namedtuple(typename='DataPoint', field_names='when value')
# Command-line interface.  The parsed options are stored in the module-level
# `args`, which the rest of the script reads directly.
parser = argparse.ArgumentParser(
    epilog=textwrap.dedent('''
        timespec:
          * If a number is suffixed with a time unit (ms, s, m or h),
            it is treated as a duration.
          * If no suffix is given, it is treated as a number of samples.

        interpolation suffixes:
          l: linearly interpolate the two closest points
          n: takes the nearest point
    '''),
    formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
    '-p', '--percentiles',
    # main() splits this value unconditionally, so it must be present.
    required=True,
    help='''
    Comma separated list of percentiles to print.
    Each percentile is comprised of a floating point number, and optionally a
    letter suffix indicating the interpolation mode.
    ''',
)
parser.add_argument(
    '-t', '--time-provided',
    # Documented as a flag, so a bare `-t` must work.  nargs='?' still accepts
    # the older `-t VALUE` spelling, where any non-empty VALUE enables it.
    nargs='?',
    const=True,
    default=False,
    help='''
    Instead of adding the time when the line is read from stdin, this flag
    indicates that each line of input has two numbers, the first being a
    timestamp (in fractional seconds), and the second being (as usual) the
    value.
    ''',
)
parser.add_argument(
    '-w', '--window',
    help='''
    The distribution will be calculated as a moving window over the data,
    using the given timespec. If empty (the default), the distribution will be
    taken over the entire dataset.
    ''',
)
parser.add_argument(
    '-u', '--update',
    help='''
    A timespec indicating how often to print out the percentiles.
    If empty (the default), the percentiles will be printed once, at the end of
    the data.
    ''',
)
args = parser.parse_args()
# A duration timespec: a decimal number immediately followed by a unit suffix.
_TIMESPEC = re.compile(r'^([0-9.]+)([a-z]+)$')

# Number of seconds in one of each supported duration unit.
MULTIPLIERS = {
    'ms': 1 / 1000.,
    's': 1,
    'm': 60,
    'h': 60 * 60,
}


def parse_timespec(value):
    """Parse a timespec string into a ``(kind, amount)`` pair.

    Args:
        value: The timespec text, or an empty/None value when the option was
            not given.

    Returns:
        ``('all', 0)`` when *value* is empty or None,
        ``('samples', n)`` when *value* consists only of digits,
        ``('seconds', s)`` when *value* is a number followed by one of the
        unit suffixes listed in ``MULTIPLIERS``.

    Raises:
        ValueError: If *value* is not a valid timespec (unknown unit,
            malformed number, or no recognisable shape at all).
    """
    if not value:
        return ('all', 0)
    if value.isdigit():
        return ('samples', int(value))

    match = _TIMESPEC.search(value)
    if match is None:
        raise ValueError('invalid timespec: %r' % (value,))

    number, unit = match.groups()
    # The pattern accepts any lowercase suffix, so reject unknown units
    # explicitly instead of letting the dictionary lookup raise KeyError.
    if unit not in MULTIPLIERS:
        raise ValueError(
            'unknown time unit %r in timespec %r (expected one of: %s)'
            % (unit, value, ', '.join(sorted(MULTIPLIERS)))
        )
    try:
        amount = float(number)
    except ValueError:
        # The pattern lets through strings such as '1.2.3' that float() rejects.
        raise ValueError('invalid number %r in timespec %r' % (number, value))
    return ('seconds', amount * MULTIPLIERS[unit])
# A percentile spec: a decimal number with an optional interpolation suffix.
_PERCENTILE = re.compile(r'^([0-9.]+)([ln]?)$')

# Maps the (possibly empty) suffix letter to the interpolation mode name.
INTERPOLATIONS = {
    '': 'linear',
    'l': 'linear',
    'n': 'nearest',
}


def parse_percentile(value):
    """Parse one percentile spec into ``(percentile, interpolation)``.

    Args:
        value: Text such as ``'50'``, ``'99.9n'`` or ``'25l'``.  Surrounding
            whitespace is ignored so that lists like ``'50, 90'`` work.

    Returns:
        A ``(float, str)`` tuple; the string is a value of ``INTERPOLATIONS``.

    Raises:
        ValueError: If the text is malformed or the number lies outside
            the range 0 to 100.
    """
    match = _PERCENTILE.search(value.strip())
    if match is None:
        raise ValueError('invalid percentile: %r' % (value,))

    number, suffix = match.groups()
    try:
        percentile = float(number)
    except ValueError:
        # The pattern lets through strings such as '1.2.3' that float() rejects.
        raise ValueError('invalid percentile: %r' % (value,))
    if not 0.0 <= percentile <= 100.0:
        raise ValueError('percentile must be between 0 and 100: %r' % (value,))
    return percentile, INTERPOLATIONS[suffix]
def read_input():
    """Yield a ``DataPoint`` for every non-blank line read from stdin.

    When ``args.time_provided`` is set, each line must hold two
    whitespace-separated numbers: a timestamp followed by the value.
    Otherwise the line holds just the value and the timestamp is the wall
    clock time at which the line was read.

    Raises:
        ValueError: If a line has the wrong number of fields or a field is
            not a valid float.
    """
    for line in sys.stdin:
        # Blank lines (for example a trailing empty line) carry no sample.
        if not line.strip():
            continue
        if args.time_provided:
            now, line = line.split()
            now = float(now)
        else:
            now = time.time()
        value = float(line)
        yield DataPoint(now, value)
def moving_window_n_samples(data, n):
    """Yield a tuple of the last *n* items for each item of *data*.

    Nothing is yielded until *n* items have been seen, so every window has
    exactly *n* entries.  If *data* holds fewer than *n* items the generator
    simply finishes without yielding.

    Args:
        data: Any iterable of samples.
        n: Window length in samples.
    """
    # A bounded deque discards the oldest item automatically on append.
    queue = deque([], n)
    for item in data:
        queue.append(item)
        # Checking the length replaces a next()-based pre-fill, which would
        # leak StopIteration out of the generator when the input is short.
        if len(queue) == n:
            yield tuple(queue)
def moving_window_n_seconds(data, n):
    """Yield, for each item, a tuple of the items from the last *n* seconds.

    Args:
        data: Iterable of objects with a ``when`` attribute, assumed to
            arrive in non-decreasing ``when`` order.
        n: Window length, in the same units as ``when``.

    Yields:
        A tuple ending with the current item and containing every earlier
        item whose ``when`` is at most *n* before it.
    """
    queue = deque()
    for item in data:
        now = item.when
        # Evict from the left until the oldest item is within the window;
        # the emptiness check covers the first item and full evictions.
        while queue and now - queue[0].when > n:
            queue.popleft()
        queue.append(item)
        yield tuple(queue)
def _single_window(data):
    """Yield exactly one window: a tuple holding every sample of *data*."""
    yield tuple(data)


def moving_window(data):
    """Turn the sample stream into an iterator of windows per ``args.window``.

    Args:
        data: Iterable of samples.

    Returns:
        An iterator of tuples of samples.  With no window option there is a
        single window covering the whole input; otherwise one window is
        produced per sample by the matching ``moving_window_n_*`` generator.

    Raises:
        NotImplementedError: If the timespec kind is not recognised.
    """
    kind, value = parse_timespec(args.window)
    if kind == 'all':
        # Return a real iterator of tuples, like the other branches, so the
        # update stage can call next() on it and index into each window.
        return _single_window(data)
    elif kind == 'samples':
        return moving_window_n_samples(data, value)
    elif kind == 'seconds':
        return moving_window_n_seconds(data, value)
    raise NotImplementedError('oops.')
def update_n_samples(windows, n):
    """Yield every *n*-th window, starting with the first one.

    Args:
        windows: Any iterable of windows.
        n: Number of windows per emitted update.  Values below 1 are
            treated as 1, so every window is emitted.
    """
    step = max(n, 1)
    # Counting with enumerate avoids calling next() inside the generator,
    # which would leak StopIteration once the input is exhausted.
    for index, window in enumerate(windows):
        if index % step == 0:
            yield window
def update_n_seconds(windows, n):
    """Yield a window at most once every *n* seconds of sample time.

    The first window is always yielded.  After that a window is yielded when
    the ``when`` of its last sample is at least *n* later than the ``when``
    of the last sample of the previously yielded window.

    Args:
        windows: Iterable of non-empty sequences of objects with a ``when``
            attribute.
        n: Minimum spacing between yielded windows, in the units of ``when``.
    """
    # None means "nothing emitted yet"; a numeric sentinel would be confused
    # with a genuine timestamp when the input's timestamps start near zero.
    last = None
    for window in windows:
        now = window[-1].when
        if last is None or now - last >= n:
            yield window
            last = now
def update(windows):
    """Thin out *windows* according to the ``args.update`` timespec.

    With no update option the windows are returned unchanged; otherwise they
    are passed, with the parsed amount, to the generator matching the
    timespec kind.  An unrecognised kind raises NotImplementedError.
    """
    kind, value = parse_timespec(args.update)
    if kind == 'all':
        return windows

    throttles = {
        'samples': update_n_samples,
        'seconds': update_n_seconds,
    }
    if kind not in throttles:
        raise NotImplementedError('oops.')
    return throttles[kind](windows, value)
def get_percentile(data, percentile, interpolation):
    """Return the requested percentile of already-sorted *data*.

    Args:
        data: Non-empty sequence of numbers in ascending order.
        percentile: Number from 0 to 100 inclusive.
        interpolation: ``'nearest'`` picks the element whose index is closest
            to the fractional rank; ``'linear'`` blends the two elements
            around the fractional rank.

    Returns:
        An element of *data*, or a weighted average of two adjacent elements.

    Raises:
        ValueError: If *percentile* is out of range or *data* is empty.
        NotImplementedError: If *interpolation* is not a known mode.
    """
    # Explicit checks rather than assert, which is stripped under -O.
    if not 0.0 <= percentile <= 100.0:
        raise ValueError(
            'percentile must be between 0 and 100, got %r' % (percentile,)
        )
    if len(data) == 0:
        raise ValueError('cannot take a percentile of an empty data set')

    n = len(data) - 1
    # Fractional index into data: 0 maps to the first element, 100 to the last.
    i = percentile * n / 100.0
    if interpolation == 'nearest':
        i = int(round(i))
        return data[i]
    elif interpolation == 'linear':
        i_down = int(i)
        alpha = i - i_down
        lower = data[i_down]
        # An exact index needs no blending; this also avoids reading past
        # the end of data when i equals the last index.
        if alpha == 0.0:
            return lower
        upper = data[i_down + 1]
        return lower * (1.0 - alpha) + upper * alpha
    raise NotImplementedError('Unknown interpolation %r' % (interpolation,))
def get_percentiles(window, percentiles):
    """Yield one percentile of *window* per entry of *percentiles*.

    The ``value`` attributes of the window's samples are sorted once, then
    each ``(percentile, interpolation)`` pair is evaluated against that
    sorted list, in the order given.
    """
    ordered = [sample.value for sample in window]
    ordered.sort()
    for rank, mode in percentiles:
        yield get_percentile(ordered, rank, mode)
def main():
    """Read samples from stdin and print the requested percentiles.

    One tab-separated line is written to stdout per emitted window, with the
    percentiles in the order they were requested.
    """
    # Fail with a usage message instead of an AttributeError on None when
    # no percentile list was supplied.
    if not args.percentiles:
        parser.error('a percentile list is required (-p/--percentiles)')

    percentiles = [
        parse_percentile(arg)
        for arg in args.percentiles.split(',')
    ]
    data = read_input()
    windows = moving_window(data)
    update_windows = update(windows)
    for window in update_windows:
        values = get_percentiles(window, percentiles)
        # A single parenthesised argument prints identically on Python 2
        # and Python 3.
        print('\t'.join('%g' % value for value in values))


if __name__ == '__main__':
    main()