#!/usr/bin/python
import sys
import subprocess
import time
import re
import math
import uuid
import paramiko
from datetime import datetime
from datetime import timedelta
from threading import Thread
from gccommon import *
from itertools import izip_longest
from bisect import *
#from pyjolokia import *  # required by getCassandraJolokiaConn()/compact_greenhints()

USERNAME = "wkatsak"
def execute_remote_command_sync(host, username, command, max_tries=5, retry_delay=10, timeout=30):
    cmd = "bash -c 'source ~/.bash_profile && %s'" % command

    for i in xrange(0, max_tries):
        try:
            sshc = paramiko.SSHClient()
            sshc.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            sshc.load_system_host_keys()
            sshc.connect(host, port=22, username=username, timeout=timeout)
            transport = sshc.get_transport()
            transport.set_keepalive(30)

            ssh_in, ssh_out, ssh_err = sshc.exec_command(cmd)
            raw_out = ssh_out.readlines()
            raw_err = ssh_err.readlines()
            sshc.close()

            # strip trailing newlines but keep any other whitespace intact
            out = [line.replace("\n", "") for line in raw_out]
            err = [line.replace("\n", "") for line in raw_err]
            return out, err
        except Exception:
            print "Host: %s, could not execute command: %s" % (host, cmd)
            time.sleep(retry_delay)

    # every attempt failed; return empty output rather than raising
    return [], []
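
# Example usage (a sketch; "node01" is a hypothetical hostname, not part of
# this module):
#
#   out, err = execute_remote_command_sync("node01", USERNAME, "uptime")
#   for line in out:
#       print line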

def execute_remote_command_wait(host, username, command, interval=30, outfile="/dev/null", errfile="/dev/null", status=True):
    # launch the command under gcdaemon.py with a unique name, then poll
    # until the daemon reports that it has exited
    name = "cmd-%s" % str(uuid.uuid1())
    remote_cmd = "%s/gcdaemon.py --start --name %s --out %s --err %s %s" % (GC_PATH, name, outfile, errfile, command)

    out, err = execute_remote_command_sync(host, username, remote_cmd)

    while True:
        check_cmd = "%s/gcdaemon.py --isrunning --name %s" % (GC_PATH, name)
        out, err = execute_remote_command_sync(host, username, check_cmd)
        if len(out) <= 0:
            return

        # we asked "is running?": 0 means finished, anything else means
        # still running
        reply = int(out[0].strip())
        if reply == 0:
            if status:
                sys.stdout.write("\n")
                sys.stdout.flush()
            break
        else:
            if status:
                sys.stdout.write(". ")
                sys.stdout.flush()

        time.sleep(interval)
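
# Example usage (hypothetical command and output path; the remote side must
# have gcdaemon.py available under GC_PATH):
#
#   execute_remote_command_wait("node01", USERNAME, "my_long_job.sh",
#                               interval=10, outfile="/tmp/job.out")
#   # prints ". " on every poll until gcdaemon.py reports the job has exited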

class ExecuteResults(object):
    """Holder for the stdout/stderr captured by a worker thread."""
    def __init__(self):
        self.out = []
        self.err = []

def execute_remote_command_thread(host, username, cmd, results):
    results.out, results.err = execute_remote_command_sync(host, username, cmd)

def execute_remote_command_multihost(hosts, username, cmd):
    # run the same command on every host concurrently, one thread per host
    out_dict = dict()
    err_dict = dict()
    thread_list = list()
    results_dict = dict()

    for host in hosts:
        results = ExecuteResults()
        results_dict[host] = results
        thread = Thread(target=execute_remote_command_thread, args=(host, username, cmd, results))
        thread_list.append(thread)
        thread.start()

    for thread in thread_list:
        thread.join()

    for host in hosts:
        out_dict[host] = results_dict[host].out
        err_dict[host] = results_dict[host].err

    return out_dict, err_dict
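
# Example usage (hypothetical host list; fans the same command out to every
# host and collects the results keyed by hostname):
#
#   hosts = ["node01", "node02", "node03"]
#   outs, errs = execute_remote_command_multihost(hosts, USERNAME, "hostname")
#   for host in hosts:
#       print host, outs[host]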

def execute_remote_commands_parallel(host, username, cmds):
    # run several commands on one host concurrently, one thread per command
    out_dict = dict()
    err_dict = dict()
    thread_list = list()
    results_dict = dict()

    for cmd in cmds:
        results = ExecuteResults()
        results_dict[cmd] = results
        thread = Thread(target=execute_remote_command_thread, args=(host, username, cmd, results))
        thread_list.append(thread)
        thread.start()
        # stagger thread startup so we do not open every SSH session at once
        time.sleep(1)

    for thread in thread_list:
        thread.join()

    for cmd in cmds:
        out_dict[cmd] = results_dict[cmd].out
        err_dict[cmd] = results_dict[cmd].err

    return out_dict, err_dict
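
# Example usage (hypothetical commands; results are keyed by the command
# string itself):
#
#   cmds = ["df -h /", "uptime"]
#   outs, errs = execute_remote_commands_parallel("node01", USERNAME, cmds)
#   print outs["uptime"]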

def find_remote_pid(host, process_name):
    # scan the remote process table and return the pid of the first process
    # whose command line matches process_name, or "" if none does
    cmd = "ps axo pid,command"
    ps_re = re.compile(".*" + process_name + ".*")
    out, err = execute_remote_command_sync(host, USERNAME, cmd)
    for line in out:
        m = ps_re.search(line)
        if not m:
            continue
        return line.strip().split(" ")[0]
    return ""

def kill_cassandra_node(host):
    pid = find_remote_pid(host, "CassandraDaemon")
    if pid != "":
        cmd = "pkill -SIGHUP -f CassandraDaemon"
        execute_remote_command_sync(host, USERNAME, cmd)
    else:
        print "...Cassandra not running on " + host

def sleep_cassandra_node(host):
    pid = find_remote_pid(host, "CassandraDaemon")
    if pid != "":
        cmd = "cassandra_sleep"
        execute_remote_command_sync(host, USERNAME, cmd)
    else:
        print "...Cassandra not running on " + host

def wake_cassandra_node(host):
    pid = find_remote_pid(host, "CassandraDaemon")
    if pid != "":
        cmd = "cassandra_wake"
        execute_remote_command_sync(host, USERNAME, cmd)
    else:
        print "...Cassandra not running on " + host

def start_cassandra_node(host):
    cmd = "cassandra"
    out, err = execute_remote_command_sync(host, USERNAME, cmd)

def kill_cassandra_cluster(control_node):
    cmd = "cassandra_cluster_kill"
    execute_remote_command_sync(control_node, USERNAME, cmd)

def start_cassandra_cluster(control_node):
    cmd = "cassandra_cluster_quickstart"
    execute_remote_command_sync(control_node, USERNAME, cmd)

def parasol_s3(control_node, host):
    print "Sending " + host + " to S3..."
    cmd = "sudo parasol --s3 %s" % host
    execute_remote_command_sync(control_node, USERNAME, cmd)

def parasol_wake(control_node, host):
    print "Waking up " + host + "..."
    cmd = "sudo parasol --wake %s" % host
    execute_remote_command_sync(control_node, USERNAME, cmd)

def cassandra_nodetool(control_node, action_node, params):
    cmd = "nodetool -host %s %s" % (action_node, params)
    execute_remote_command_sync(control_node, USERNAME, cmd)

def cassandra_nodetool_direct(action_node, params):
    cmd = "nodetool %s" % params
    execute_remote_command_sync(action_node, USERNAME, cmd)

def cassandra_nodetool_parallel(control_node, action_nodes, params):
    # build one nodetool command per target node and run them all in
    # parallel from the control node
    params_str = " ".join(params)
    cmds = ["nodetool -host %s %s" % (node, params_str) for node in action_nodes]
    execute_remote_commands_parallel(control_node, USERNAME, cmds)
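
# Example usage (hypothetical node names; "ring" and "repair" are standard
# nodetool commands):
#
#   cassandra_nodetool("control01", "node01", "ring")
#   cassandra_nodetool_parallel("control01", ["node01", "node02"], ["repair"])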

def interpolate(p1, p2, x):
    # linear interpolation between two (datetime, value) points, evaluated
    # at datetime x
    x1, y1 = p1
    x2, y2 = p2
    x1 = dt_to_unix(x1)
    x2 = dt_to_unix(x2)
    x = dt_to_unix(x)
    m = float(y2 - y1) / float(x2 - x1)
    b = y1 - m*x1
    return m*x + b

def interpolate_dt(p1, p2, x):
    return interpolate(p1, p2, x)

def extrapolate_dt(p1, p2, x):
    x1, y1 = p1
    x2, y2 = p2
    x1 = dt_to_unix(x1)
    x2 = dt_to_unix(x2)
    x = dt_to_unix(x)
    y = y1 + ((1.0*x - x1) / (x2 - x1)) * (y2 - y1)
    return y

def dt_to_unix(dt):
    # seconds since the epoch, assuming dt is a naive UTC datetime
    return (dt - datetime(1970, 1, 1)).total_seconds()

def normalize(value, maximum):
    return float(value) / float(maximum)

def denormalize(value, maximum):
    return int(math.ceil(float(value) * float(maximum)))
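
# Worked example (hypothetical values): with p1 = (t0, 0.0) and
# p2 = (t0 + 10s, 100.0), the points define the line y = 10*x + b in
# unix-seconds space, so interpolate(p1, p2, t0 + 5s) == 50.0. Likewise
# normalize(5, 20) == 0.25 and denormalize(0.25, 20) == 5 (ceil rounds
# back up to a whole unit count).
#
#   t0 = datetime(2013, 1, 1)
#   y = interpolate((t0, 0.0), (t0 + timedelta(seconds=10), 100.0),
#                   t0 + timedelta(seconds=5))
#   assert y == 50.0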

def sleep_delta(delta):
    # sleep for the duration of a timedelta
    sleeptime = 60*60*24*delta.days + delta.seconds + 0.000001*delta.microseconds
    if sleeptime > 0:
        time.sleep(sleeptime)

def delta_total_microseconds(delta):
    return delta.microseconds + (delta.seconds + delta.days * 24 * 3600) * 10**6

def parse_timestamp(timestamp):
    return datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f")

def wait_with_status(sec_to_wait, status_interval=5):
    # sleep for sec_to_wait seconds, printing ". " every status_interval
    # seconds; measure the time actually slept rather than trusting sleep()
    total_waited_ms = 0

    while (total_waited_ms / 1000) < sec_to_wait:
        start_dt = datetime.now()
        time.sleep(status_interval)
        sys.stdout.write(". ")
        sys.stdout.flush()
        end_dt = datetime.now()
        diff = end_dt - start_dt
        diff_ms = diff.seconds*1000 + diff.microseconds/1000
        total_waited_ms += diff_ms

    sys.stdout.write("\n")

def calcOffNodes(total_nodes, off_skip):
    # choose which OPTIONAL_NODES to power off so that total_nodes stay on,
    # stepping through the optional list with a stride of off_skip
    remaining = len(MINIMUM_NODES + OPTIONAL_NODES) - total_nodes
    nodes_off = []
    i = 0

    while remaining > 0:
        nodes_off.append(OPTIONAL_NODES[i])
        remaining -= 1
        i += off_skip

        if i > len(OPTIONAL_NODES) - 1:
            # with a stride of 2, wrap around and take the odd indices next
            if off_skip == 2:
                i = 1
            else:
                break

    return nodes_off

def optionalSortKey(node):
    # sort key that orders even-indexed optional nodes before odd-indexed
    # ones, preserving index order within each group
    index = OPTIONAL_NODES.index(node)

    if index % 2 == 1:
        key = index + len(OPTIONAL_NODES)
    else:
        key = index

    return key
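
# Worked example (assumes a hypothetical gccommon with 4 MINIMUM_NODES and
# OPTIONAL_NODES = ["opt0", "opt1", "opt2", "opt3"]; the real lists live in
# gccommon):
#
#   calcOffNodes(6, 1)   # -> ["opt0", "opt1"]  (turn off 8 - 6 = 2 nodes)
#   calcOffNodes(6, 2)   # -> ["opt0", "opt2"]  (skip every other node)
#   sorted(OPTIONAL_NODES, key=optionalSortKey)
#   # -> ["opt0", "opt2", "opt1", "opt3"]  (even indices first, then odd)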

CASSANDRA_JOLOKIA_URL = "http://%s:8778/jolokia/"

def getCassandraJolokiaConn(host, timeout=None):
    url = CASSANDRA_JOLOKIA_URL % host
    if timeout is None:
        jolokia_conn = Jolokia(url)
    else:
        jolokia_conn = Jolokia(url, timeout=timeout)
    return jolokia_conn

def compact_greenhints(node):
    mbean = "org.apache.cassandra.db:type=StorageService"
    operation = "forceTableCompaction"
    try:
        jolokia_conn = getCassandraJolokiaConn(node, timeout=10000000000)
        jolokia_conn.request(type="exec", mbean=mbean, operation=operation, arguments=["system", "GreenHintsColumnFamily"])
    except JolokiaError:
        raise Exception("Could not compact greenhints on %s" % node)

def compact_greenhints_ssh(node):
    cmd = "nodetool compact system GreenHintsColumnFamily"
    execute_remote_command_sync(node, USERNAME, cmd)

# from http://docs.python.org/2/library/itertools.html#recipes
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)
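
# Example: handy for batching hosts into fixed-size chunks; the final batch
# is padded with the fillvalue (None by default):
#
#   for batch in grouper(["node01", "node02", "node03"], 2):
#       print [h for h in batch if h is not None]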

# from http://docs.python.org/2/library/bisect.html
def index(a, x):
    'Locate the leftmost value exactly equal to x'
    i = bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    raise ValueError

def find_lt(a, x):
    'Find rightmost value less than x'
    i = bisect_left(a, x)
    if i:
        return a[i-1]
    raise ValueError

def find_le(a, x):
    'Find rightmost value less than or equal to x'
    i = bisect_right(a, x)
    if i:
        return a[i-1]
    raise ValueError

def find_gt(a, x):
    'Find leftmost value greater than x'
    i = bisect_right(a, x)
    if i != len(a):
        return a[i]
    raise ValueError

def find_ge(a, x):
    'Find leftmost item greater than or equal to x'
    i = bisect_left(a, x)
    if i != len(a):
        return a[i]
    raise ValueError
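
# Example (a must already be sorted, as the bisect recipes require):
#
#   a = [1, 3, 3, 7]
#   index(a, 3)    # -> 1 (leftmost exact match)
#   find_lt(a, 3)  # -> 1
#   find_le(a, 3)  # -> 3
#   find_gt(a, 3)  # -> 7
#   find_ge(a, 7)  # -> 7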

class Transition(object):
    def __init__(self, tid, node, offDt, offIndex, transDt, transIndex, onDt, onIndex):
        self.tid = tid
        self.node = node
        self.offDt = offDt
        self.offIndex = offIndex
        self.transDt = transDt
        self.transIndex = transIndex
        self.onDt = onDt
        self.onIndex = onIndex

        # filled in later by TransitionAnalyzer.postprocessTransitions()
        self.nodesReady = -1
        self.integratedOffWorkload = -1
        self.maxTransitionLatency = -1
        self.avgTransitionWorkload = -1
        self.schedDt = transDt
        self.schedIndex = transIndex

    def __str__(self):
        return "Transition(tid=%d, node=%s, offDt=%s, transDt=%s, onDt=%s, nodesReady=%d, integratedOffWorkload=%0.2f, maxTransitionLatency=%0.2f, avgTransitionWorkload=%0.2f)" % (self.tid, self.node, str(self.offDt), str(self.transDt), str(self.onDt), self.nodesReady, self.integratedOffWorkload, self.maxTransitionLatency, self.avgTransitionWorkload)

class TransitionAnalyzer(object):

    def __init__(self, results):
        self.results = results
        self.extractTransitions()

    def integrateOffWorkload(self, offIndex, transIndex, timestamps):
        # integrate workload over the interval the node spent off (left
        # Riemann sum of workload * elapsed seconds), scaled down by 100000
        integral = 0.0
        for i in xrange(offIndex+1, transIndex):
            thisDt = timestamps[i]
            lastDt = timestamps[i-1]
            delta = thisDt - lastDt
            seconds = delta.total_seconds()

            workload = self.results.getCommonValue(timestamps[i], "actual_workload")
            integral += workload*seconds

        return integral / 100000

    def maxTransitionLatency(self, transIndex, onIndex, timestamps):
        maxLatency = 0.0
        for i in xrange(transIndex, onIndex):
            value = self.results.getCommonValue(timestamps[i], "readlatency_99_window")
            if value > maxLatency:
                maxLatency = value
        return maxLatency

    def avgTransitionWorkload(self, transIndex, onIndex, timestamps):
        accum = 0.0
        count = 0
        for i in xrange(transIndex, onIndex):
            value = self.results.getCommonValue(timestamps[i], "actual_workload")
            if not math.isnan(value):
                accum += value
                count += 1
        try:
            return accum / count
        except ZeroDivisionError:
            return 0.0

    def postprocessTransitions(self, transitions, timestamps):
        # walk the transitions latest-first and clamp each transition's start
        # index so it does not precede the end of the previous (earlier)
        # transition; then fill in the derived metrics
        transitions.sort(key=lambda x: x.onDt)
        transitions.reverse()

        for i in xrange(0, len(transitions)-1):
            transitions[i].transIndex = max(transitions[i].transIndex, transitions[i+1].onIndex)
            transitions[i].transDt = timestamps[transitions[i].transIndex]

        for transition in transitions:
            transition.nodesReady = self.results.getCommonValue(transition.transDt, "nodes_ready")
            transition.integratedOffWorkload = self.integrateOffWorkload(transition.offIndex, transition.transIndex, timestamps)
            transition.maxTransitionLatency = self.maxTransitionLatency(transition.transIndex, transition.onIndex, timestamps)
            transition.avgTransitionWorkload = self.avgTransitionWorkload(transition.transIndex, transition.onIndex, timestamps)

    def extractTransitions(self):
        # scan each optional node's state series for off (0) -> transitioning
        # (1) -> on (2) cycles and record each complete cycle as a Transition
        tids = 0
        transitions = []
        timestamps = self.results.availableTimestamps()
        max_i = len(timestamps)

        for node in OPTIONAL_NODES:
            current_i = 0
            while current_i < max_i:
                # find the first time the node is off
                offIndex = next((i for i in xrange(current_i, max_i) if self.results.getNodeValue(timestamps[i], node, "state") == 0), None)
                if offIndex is None:
                    break
                off_dt = timestamps[offIndex]

                # then the first time it starts transitioning back on
                transIndex = next((i for i in xrange(offIndex, max_i) if self.results.getNodeValue(timestamps[i], node, "state") == 1), None)
                if transIndex is None:
                    break
                trans_dt = timestamps[transIndex]

                # then the first time it is fully on again
                onIndex = next((i for i in xrange(transIndex, max_i) if self.results.getNodeValue(timestamps[i], node, "state") == 2), None)
                if onIndex is None:
                    # the trace ends mid-transition; drop the incomplete cycle
                    onIndex = max_i-1
                else:
                    on_dt = timestamps[onIndex]
                    transitions.append(Transition(tids, node, off_dt, offIndex, trans_dt, transIndex, on_dt, onIndex))
                    tids += 1

                current_i = onIndex + 1

        self.postprocessTransitions(transitions, timestamps)

        transitions.sort(key=lambda x: x.transDt)
        self.transitions = transitions

    def getTransition(self, dt):
        for transition in self.transitions:
            if transition.transDt <= dt and dt <= transition.onDt:
                return transition
        raise Exception("No match for dt %s..." % str(dt))

    def getTransitionLength(self, dt):
        transition = self.getTransition(dt)
        delta = transition.onDt - transition.transDt
        return delta.total_seconds()

    def getOffLength(self, dt):
        transition = self.getTransition(dt)
        delta = transition.transDt - transition.offDt
        return delta.total_seconds()

    def getIntegratedOffWorkload(self, dt):
        transition = self.getTransition(dt)
        return transition.integratedOffWorkload

    def getMaxTransitionLatency(self, dt):
        transition = self.getTransition(dt)
        return transition.maxTransitionLatency

    def getAvgTransitionWorkload(self, dt):
        transition = self.getTransition(dt)
        return transition.avgTransitionWorkload
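
# Example usage (a sketch; `results` is assumed to be a results object from
# elsewhere in this codebase exposing availableTimestamps(), getNodeValue()
# and getCommonValue(), none of which this module defines):
#
#   analyzer = TransitionAnalyzer(results)
#   for t in analyzer.transitions:
#       print t
#   dt = analyzer.transitions[0].transDt
#   print analyzer.getTransitionLength(dt), analyzer.getMaxTransitionLatency(dt)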