PageRenderTime 58ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/gchelpers.py

https://gitlab.com/abushoeb/gc-v1
Python | 539 lines | 529 code | 7 blank | 3 comment | 6 complexity | 041b2ac8467f22f5b95763b9813b268e MD5 | raw file
  1. #!/usr/bin/python
  2. import paramiko
  3. import subprocess
  4. import time
  5. import re
  6. import math
  7. import uuid
  8. from datetime import datetime
  9. from datetime import timedelta
  10. from threading import Thread
  11. from gccommon import *
  12. from itertools import izip_longest
  13. from bisect import *
  14. #from pyjolokia import *
  15. USERNAME="wkatsak"
  16. def execute_remote_command_sync(host, username, command, max_tries=5, retry_delay=10, timeout=30):
  17. for i in xrange(0, max_tries):
  18. try:
  19. cmd = "bash -c 'source ~/.bash_profile && %s'" % command
  20. sshc = paramiko.SSHClient()
  21. sshc.set_missing_host_key_policy(paramiko.AutoAddPolicy())
  22. sshc.load_system_host_keys()
  23. sshc.connect(host, port = 22, username = username, timeout=timeout)
  24. #sshc.get_pty()
  25. transport = sshc.get_transport()
  26. transport.set_keepalive(30)
  27. ssh_in, ssh_out, ssh_err = sshc.exec_command(cmd)
  28. raw_out = ssh_out.readlines()
  29. raw_err = ssh_err.readlines()
  30. sshc.close()
  31. out = list()
  32. err = list()
  33. for line in raw_out:
  34. line = line.replace("\n", "")
  35. #line = line.strip()
  36. out.append(line)
  37. for line in raw_err:
  38. line = line.replace("\n", "")
  39. #line = line.strip()
  40. err.append(line)
  41. return out, err
  42. except:
  43. print "Host: %s, Could not execute command: %s" % (host, cmd)
  44. time.sleep(retry_delay)
  45. continue
  46. #raise Exception("Host: %s, Could not execute command: %s" % (host, cmd))
  47. return [], []
  48. def execute_remote_command_wait(host, username, command, interval=30, outfile="/dev/null", errfile="/dev/null", status=True):
  49. name = "cmd-%s" % str(uuid.uuid1())
  50. remote_cmd = "%s/gcdaemon.py --start --name %s --out %s --err %s %s" % (GC_PATH, name, outfile, errfile, command)
  51. #print remote_cmd
  52. out, err = execute_remote_command_sync(host, username, remote_cmd)
  53. while True:
  54. check_cmd = "%s/gcdaemon.py --isrunning --name %s" % (GC_PATH, name)
  55. #print check_cmd
  56. out, err = execute_remote_command_sync(host, username, check_cmd)
  57. #print out, err
  58. if len(out) <= 0:
  59. return
  60. reply = int(out[0].strip())
  61. #print "reply", reply
  62. # we asked "is running?"
  63. # if not running, we are done
  64. if reply == 0:
  65. if status:
  66. sys.stdout.write("\n")
  67. sys.stdout.flush()
  68. break
  69. # if still running
  70. else:
  71. if status:
  72. sys.stdout.write(". ")
  73. sys.stdout.flush()
  74. time.sleep(interval)
  75. class ExecuteResults:
  76. out = ""
  77. err = ""
  78. def execute_remote_command_thread(host, username, cmd, results):
  79. results.out, results.err = execute_remote_command_sync(host, username, cmd)
  80. #print results.out
  81. def execute_remote_command_multihost(hosts, username, cmd):
  82. out_dict = dict()
  83. err_dict = dict()
  84. thread_list = list()
  85. results_dict = dict()
  86. for host in hosts:
  87. results = ExecuteResults()
  88. results_dict[host] = results
  89. thread = Thread(target=execute_remote_command_thread, args=(host, username, cmd, results))
  90. thread_list.append(thread)
  91. thread.start()
  92. for thread in thread_list:
  93. thread.join()
  94. for host in hosts:
  95. out_dict[host] = results_dict[host].out
  96. err_dict[host] = results_dict[host].err
  97. return out_dict, err_dict
  98. def execute_remote_commands_parallel(host, username, cmds):
  99. out_dict = dict()
  100. err_dict = dict()
  101. thread_list = list()
  102. results_dict = dict()
  103. for cmd in cmds:
  104. results = ExecuteResults()
  105. results_dict[cmd] = results
  106. thread = Thread(target=execute_remote_command_thread, args=(host, username, cmd, results))
  107. thread_list.append(thread)
  108. thread.start()
  109. time.sleep(1)
  110. for thread in thread_list:
  111. thread.join()
  112. for cmd in cmds:
  113. out_dict[cmd] = results_dict[cmd].out
  114. err_dict[cmd] = results_dict[cmd].err
  115. return out_dict, err_dict
  116. def find_remote_pid(host, process_name):
  117. cmd = "ps axo pid,command"
  118. ps_re = re.compile(".*" + process_name + ".*")
  119. out, err = execute_remote_command_sync(host, USERNAME, cmd)
  120. for line in out:
  121. m = ps_re.search(line)
  122. if not m:
  123. continue
  124. return line.strip().split(" ")[0]
  125. return ""
  126. def kill_cassandra_node(host):
  127. pid = find_remote_pid(host, "CassandraDaemon")
  128. if pid != "":
  129. cmd = "pkill -SIGHUP -f CassandraDaemon"
  130. execute_remote_command_sync(host, USERNAME, cmd)
  131. #print "...killed Cassandra, was pid " + pid
  132. else:
  133. print "...Cassandra not running on " + host
  134. def sleep_cassandra_node(host):
  135. pid = find_remote_pid(host, "CassandraDaemon")
  136. if pid != "":
  137. cmd = "cassandra_sleep"
  138. execute_remote_command_sync(host, USERNAME, cmd)
  139. else:
  140. print "...Cassandra not running on " + host
  141. def wake_cassandra_node(host):
  142. pid = find_remote_pid(host, "CassandraDaemon")
  143. if pid != "":
  144. cmd = "cassandra_wake"
  145. execute_remote_command_sync(host, USERNAME, cmd)
  146. else:
  147. print "...Cassandra not running on " + host
  148. def start_cassandra_node(host):
  149. cmd = "cassandra"
  150. #print "Starting cassandra on " + host
  151. out, err = execute_remote_command_sync(host, USERNAME, cmd)
  152. def kill_cassandra_cluster(control_node):
  153. cmd = "cassandra_cluster_kill"
  154. execute_remote_command_sync(control_node, USERNAME, cmd)
  155. def start_cassandra_cluster(control_node):
  156. cmd = "cassandra_cluster_quickstart"
  157. execute_remote_command_sync(control_node, USERNAME, cmd)
  158. def parasol_s3(control_node, host):
  159. print "Sending " + host + " to S3..."
  160. cmd = "sudo parasol --s3 %s" % host
  161. execute_remote_command_sync(control_node, USERNAME, cmd)
  162. def parasol_wake(control_node, host):
  163. print "Waking up " + host + "..."
  164. cmd = "sudo parasol --wake %s" % host
  165. execute_remote_command_sync(control_node, USERNAME, cmd)
  166. def cassandra_nodetool(control_node, action_node, params):
  167. params_str = params
  168. cmd = "nodetool -host %s %s" % (action_node, params_str)
  169. #print "Cmd: %s" % cmd
  170. execute_remote_command_sync(control_node, USERNAME, cmd)
  171. def cassandra_nodetool_direct(action_node, params):
  172. cmd = "nodetool %s" % params
  173. #print "Cmd: %s" % cmd
  174. execute_remote_command_sync(action_node, USERNAME, cmd)
  175. def cassandra_nodetool_parallel(control_node, action_nodes, params):
  176. params_str = ""
  177. for p in params:
  178. params_str += p + " "
  179. cmd = "nodetool -host %s %s" % (action_node, params_str)
  180. #print "Cmd: %s" % cmds
  181. execute_remote_command_sync(control_node, USERNAME, cmd)
  182. def interpolate(p1, p2, x):
  183. x1, y1 = p1
  184. x2, y2 = p2
  185. x1 = dt_to_unix(x1)
  186. x2 = dt_to_unix(x2)
  187. x = dt_to_unix(x)
  188. m = float(y2 - y1)/float(x2-x1)
  189. b = y1 - m*x1
  190. return m*x + b
  191. def interpolate_dt(p1, p2, x):
  192. return interpolate(p1, p2, x)
  193. def extrapolate_dt(p1, p2, x):
  194. x1, y1 = p1
  195. x2, y2 = p2
  196. x1 = dt_to_unix(x1)
  197. x2 = dt_to_unix(x2)
  198. x = dt_to_unix(x)
  199. y = y1 + ((1.0*x - x1) / (x2 - x1)) * (y2 - y1)
  200. return y
  201. def dt_to_unix(dt):
  202. return (dt - datetime(1970, 1, 1)).total_seconds()
  203. def normalize(value, maximum):
  204. return float(value) / float(maximum)
  205. def denormalize(value, maximum):
  206. return int(math.ceil(float(value) * float(maximum)))
  207. def sleep_delta(delta):
  208. sleeptime = 0
  209. sleeptime += 60*60*24*delta.days + 1*delta.seconds + 0.000001*delta.microseconds
  210. if sleeptime > 0:
  211. time.sleep(sleeptime)
  212. def delta_total_microseconds(delta):
  213. return (delta.microseconds + (delta.seconds + delta.days * 24 * 3600) * 10**6)
  214. def parse_timestamp(timestamp):
  215. return datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f")
  216. def wait_with_status(sec_to_wait, status_interval=5):
  217. total_waited_ms = 0
  218. while (total_waited_ms / 1000) < sec_to_wait:
  219. start_dt = datetime.now()
  220. time.sleep(status_interval)
  221. sys.stdout.write(". ")
  222. sys.stdout.flush()
  223. end_dt = datetime.now()
  224. diff = end_dt - start_dt
  225. diff_ms = diff.seconds*1000 + diff.microseconds/1000
  226. total_waited_ms += diff_ms
  227. sys.stdout.write("\n")
  228. def calcOffNodes(total_nodes, off_skip):
  229. remaining = len(MINIMUM_NODES + OPTIONAL_NODES) - total_nodes
  230. nodes_off = []
  231. i = 0
  232. while remaining > 0:
  233. nodes_off.append(OPTIONAL_NODES[i])
  234. remaining -= 1
  235. i += off_skip
  236. if (i > len(OPTIONAL_NODES) - 1):
  237. if off_skip == 2:
  238. i = 1
  239. else:
  240. break
  241. return nodes_off
  242. def optionalSortKey(node):
  243. index = OPTIONAL_NODES.index(node)
  244. if index % 2 == 1:
  245. key = index + len(OPTIONAL_NODES)
  246. else:
  247. key = index
  248. return key
  249. CASSANDRA_JOLOKIA_URL = "http://%s:8778/jolokia/"
  250. def getCassandraJolokiaConn(host, timeout=None):
  251. url = CASSANDRA_JOLOKIA_URL % host
  252. if timeout == None:
  253. jolokia_conn = Jolokia(url, timeout)
  254. else:
  255. jolokia_conn = Jolokia(url, timeout=timeout)
  256. return jolokia_conn
  257. def compact_greenhints(node):
  258. mbean = "org.apache.cassandra.db:type=StorageService"
  259. operation = "forceTableCompaction"
  260. try:
  261. jolokia_conn = getCassandraJolokiaConn(node, timeout=10000000000)
  262. jolokia_conn.request(type="exec", mbean=mbean, operation=operation, arguments=["system", "GreenHintsColumnFamily"])
  263. except JolokiaError:
  264. raise Exception("Could not compact greenhints on %s" % node)
  265. def compact_greenhints_ssh(node):
  266. cmd = "nodetool compact system GreenHintsColumnFamily"
  267. execute_remote_command_sync(node, USERNAME, cmd)
  268. # from http://docs.python.org/2/library/itertools.html#recipes
  269. def grouper(iterable, n, fillvalue=None):
  270. "Collect data into fixed-length chunks or blocks"
  271. # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
  272. args = [iter(iterable)] * n
  273. return izip_longest(fillvalue=fillvalue, *args)
  274. # from http://docs.python.org/2/library/bisect.html
  275. def index(a, x):
  276. 'Locate the leftmost value exactly equal to x'
  277. i = bisect_left(a, x)
  278. if i != len(a) and a[i] == x:
  279. return i
  280. raise ValueError
  281. def find_lt(a, x):
  282. 'Find rightmost value less than x'
  283. i = bisect_left(a, x)
  284. if i:
  285. return a[i-1]
  286. raise ValueError
  287. def find_le(a, x):
  288. 'Find rightmost value less than or equal to x'
  289. i = bisect_right(a, x)
  290. if i:
  291. return a[i-1]
  292. raise ValueError
  293. def find_gt(a, x):
  294. 'Find leftmost value greater than x'
  295. i = bisect_right(a, x)
  296. if i != len(a):
  297. return a[i]
  298. raise ValueError
  299. def find_ge(a, x):
  300. 'Find leftmost item greater than or equal to x'
  301. i = bisect_left(a, x)
  302. if i != len(a):
  303. return a[i]
  304. raise ValueError
  305. class Transition(object):
  306. def __init__(self, tid, node, offDt, offIndex, transDt, transIndex, onDt, onIndex): #, nodesReady, integratedWorkload, maxTransitionLatency, avgTransitionWorkload):
  307. self.tid = tid
  308. self.node = node
  309. self.offDt = offDt
  310. self.offIndex = offIndex
  311. self.transDt = transDt
  312. self.transIndex = transIndex
  313. self.onDt = onDt
  314. self.onIndex = onIndex
  315. self.nodesReady = -1
  316. self.integratedOffWorkload = -1
  317. self.maxTransitionLatency = -1
  318. self.avgTransitionWorkload = -1
  319. self.schedDt = transDt
  320. self.schedIndex = transIndex
  321. def __str__(self):
  322. return "Transition(tid=%d, node=%s, offDt=%s, transDt=%s, onDt=%s, nodesReady=%d, integratedOffWorkload=%0.2f, maxTransitionLatency=%0.2f, avgTransitionWorkload=%0.2f" % (self.tid, self.node, str(self.offDt), str(self.transDt), str(self.onDt), self.nodesReady, self.integratedOffWorkload, self.maxTransitionLatency, self.avgTransitionWorkload)
class TransitionAnalyzer(object):
    """Extracts Transition objects from a results timeline and computes
    per-transition statistics.

    `results` must provide availableTimestamps(), getCommonValue(ts, key)
    and getNodeValue(ts, node, key) -- inferred from the calls below;
    confirm against the results class.
    """

    def __init__(self, results):
        self.results = results
        # populates self.transitions immediately
        self.extractTransitions()

    def integrateOffWorkload(self, offIndex, transIndex, timestamps):
        """Integrate "actual_workload" over the off window (offIndex, transIndex),
        weighting each sample by the seconds since the previous timestamp.
        The result is scaled down by 100000 (unit choice -- purpose not
        evident from this file)."""
        integral = 0.0
        for i in xrange(offIndex+1, transIndex):
            thisDt = timestamps[i]
            lastDt = timestamps[i-1]
            delta = thisDt - lastDt
            seconds = delta.total_seconds()
            workload = self.results.getCommonValue(timestamps[i], "actual_workload")
            integral += workload*seconds
        return integral / 100000

    def maxTransitionLatency(self, transIndex, onIndex, timestamps):
        """Maximum "readlatency_99_window" sample over [transIndex, onIndex)."""
        maxLatency = 0.0
        for i in xrange(transIndex, onIndex):
            value = self.results.getCommonValue(timestamps[i], "readlatency_99_window")
            if value > maxLatency:
                maxLatency = value
        return maxLatency

    def avgTransitionWorkload(self, transIndex, onIndex, timestamps):
        """Mean of non-NaN "actual_workload" samples over [transIndex, onIndex);
        returns 0.0 when there are no usable samples (division by zero)."""
        accum = 0.0
        count = 0
        for i in xrange(transIndex, onIndex):
            value = self.results.getCommonValue(timestamps[i], "actual_workload")
            if not math.isnan(value):
                accum += value
                count += 1
        try:
            # ZeroDivisionError when count == 0 -> treated as "no workload"
            return accum / count
        except:
            return 0.0

    def postprocessTransitions(self, transitions, timestamps):
        """Adjust overlapping transitions and fill in each Transition's stats.

        Sorting by onDt and reversing puts the latest-ending transition first;
        each transition's transIndex is then pushed forward to at least the
        next (earlier-ending) transition's onIndex, so off-workload windows do
        not overlap an ongoing transition.
        """
        transitions.sort(key=lambda(x):x.onDt)
        transitions.reverse()
        for i in xrange(0, len(transitions)-1):
            transitions[i].transIndex = max(transitions[i].transIndex, transitions[i+1].onIndex)
            transitions[i].transDt = timestamps[transitions[i].transIndex]
        for transition in transitions:
            transition.nodesReady = self.results.getCommonValue(transition.transDt, "nodes_ready")
            transition.integratedOffWorkload = self.integrateOffWorkload(transition.offIndex, transition.transIndex, timestamps)
            transition.maxTransitionLatency = self.maxTransitionLatency(transition.transIndex, transition.onIndex, timestamps)
            transition.avgTransitionWorkload = self.avgTransitionWorkload(transition.transIndex, transition.onIndex, timestamps)

    def extractTransitions(self):
        """Scan each OPTIONAL_NODES member's "state" series for off->trans->on
        cycles and build Transition objects; result stored in self.transitions
        sorted by transDt. State values 0/1/2 appear to mean off/transitioning/on
        -- inferred from usage; confirm against the results producer."""
        tids = 0
        transitions = []
        timestamps = self.results.availableTimestamps()
        max_i = len(timestamps)
        for node in OPTIONAL_NODES:
            current_i = 0
            while current_i < max_i:
                # first index at/after current_i where the node is off (state 0)
                offIndex = next((i for i in xrange(current_i, max_i) if self.results.getNodeValue(timestamps[i], node, "state") == 0), None)
                if offIndex == None:
                    break
                off_dt = timestamps[offIndex]
                # first index after going off where it starts transitioning (state 1)
                transIndex = next((i for i in xrange(offIndex, max_i) if self.results.getNodeValue(timestamps[i], node, "state") == 1), None)
                if transIndex == None:
                    break
                trans_dt = timestamps[transIndex]
                # first index after transitioning where it is back on (state 2)
                onIndex = next((i for i in xrange(transIndex, max_i) if self.results.getNodeValue(timestamps[i], node, "state") == 2), None)
                if onIndex == None:
                    # node never observed back on: clamp to the last sample.
                    # NOTE(review): on_dt is NOT assigned on this path -- it
                    # reuses the previous iteration's value, or raises
                    # NameError if this is the first cycle. Likely intended:
                    # on_dt = timestamps[onIndex] here as well.
                    onIndex = max_i-1
                else:
                    on_dt = timestamps[onIndex]
                transitions.append(Transition(tids, node, off_dt, offIndex, trans_dt, transIndex, on_dt, onIndex))
                tids += 1
                current_i = onIndex + 1
        self.postprocessTransitions(transitions, timestamps)
        transitions.sort(key=lambda(x):x.transDt)
        self.transitions = transitions

    def getTransition(self, dt):
        """Return the Transition whose [transDt, onDt] window contains dt;
        raises Exception if none matches."""
        for transition in self.transitions:
            if transition.transDt <= dt and dt <= transition.onDt:
                return transition
        raise(Exception("No match for dt %s..." % str(dt)))

    def getTransitionLength(self, dt):
        """Seconds from transition start to back-on for the transition covering dt."""
        transition = self.getTransition(dt)
        delta = transition.onDt - transition.transDt
        return delta.total_seconds()

    def getOffLength(self, dt):
        """Seconds the node spent off before the transition covering dt began."""
        transition = self.getTransition(dt)
        delta = transition.transDt - transition.offDt
        return delta.total_seconds()

    def getIntegratedOffWorkload(self, dt):
        """Integrated off-window workload for the transition covering dt."""
        transition = self.getTransition(dt)
        return transition.integratedOffWorkload

    def getMaxTransitionLatency(self, dt):
        """Max 99th-percentile read latency during the transition covering dt."""
        transition = self.getTransition(dt)
        return transition.maxTransitionLatency

    def getAvgTransitionWorkload(self, dt):
        """Average workload during the transition covering dt."""
        transition = self.getTransition(dt)
        return transition.avgTransitionWorkload