
/ghadoopmonitor.py

https://gitlab.com/abushoeb/gc-v1
Python | 960 lines | 897 code | 24 blank | 39 comment | 14 complexity
  1. #!/usr/bin/env python2.7
  2. """
  3. GreenHadoop makes Hadoop aware of solar energy availability.
  4. http://www.research.rutgers.edu/~goiri/
  5. Copyright (C) 2012 Inigo Goiri, Rutgers University
  6. This program is free software: you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation, either version 3 of the License, or
  9. (at your option) any later version.
  10. This program is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with this program. If not, see <http://www.gnu.org/licenses/>
  16. """
  17. import threading
  18. import os
  19. import time
  20. import signal
  21. from subprocess import PIPE, Popen
  22. from optparse import OptionParser
  23. from operator import itemgetter, attrgetter
  24. from datetime import datetime, timedelta
  25. from ghadoopcommons import *
  26. class MonitorMapred(threading.Thread):
  27. def __init__(self, parseOld=False, logfile=None):
  28. threading.Thread.__init__(self)
  29. self.running = True
  30. # Store info
  31. self.jobs = {}
  32. self.nodes = {}
  33. self.debug = 0
  34. # Counter for the attempts not in the system anymore
  35. self.attemptsToClean = {}
  36. # JobTracker log
  37. self.logfileMapr = HADOOP_HOME+"/logs/hadoop-"+USER+"-jobtracker-"+MASTER_NODE+".log"
  38. if logfile != None:
  39. self.logfileMapr = logfile
  40. # http://forums.devshed.com/python-programming-11/how-to-monitor-a-file-for-changes-85767.html
  41. self.fileMapr = open(self.logfileMapr, 'r')
  42. self.watcherMapr = os.stat(self.logfileMapr)
  43. self.this_modifiedMapr = self.last_modifiedMapr = self.watcherMapr.st_mtime
  44. self.this_sizeMapr = self.last_sizeMapr = self.watcherMapr.st_size
  45. # Read previous state of the system
  46. if parseOld:
  47. self.parseOld()
  48. # Go to the end of the file
  49. self.fileMapr.seek(0,2)
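# How the log is tailed: run() takes fresh os.stat() snapshots of
# st_mtime/st_size and compares them against the values cached above on
# every iteration; new lines are read only when the mtime advances, and
# the file is reopened from scratch when the size shrinks (log rotation).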
  50. def run(self):
  51. previous = datetime.now()
  52. # Get nodes from files (masters and slaves)
  53. #for nodeId in readHostFile(CONF_MASTERS):
  54. #node = self.getNode(nodeId)
  55. for nodeId in readHostFile(CONF_SLAVES):
  56. node = self.getNode(nodeId)
  57. # Get nodes currently running in the system
  58. for nodeId in self.getActiveTaskTrackers():
  59. node = self.getNode(nodeId)
  60. node.status = 'UP'
  61. # Monitor loop
  62. while self.running:
  63. try:
  64. # Update from log: JobTracker
  65. if self.this_modifiedMapr > self.last_modifiedMapr:
  66. self.last_modifiedMapr = self.this_modifiedMapr
  67. self.last_sizeMapr = self.this_sizeMapr
  68. # File was modified, so read and parse the new lines
  69. while True:
  70. line = self.fileMapr.readline()
  71. if not line:
  72. break
  73. date = self.parseLine(line)
  74. self.watcherMapr = os.stat(self.logfileMapr)
  75. self.this_modifiedMapr = self.watcherMapr.st_mtime
  76. self.this_sizeMapr = self.watcherMapr.st_size
  77. # Check if the file changed
  78. if self.this_sizeMapr < self.last_sizeMapr:
  79. print 'Size is shorter. New file!'
  80. self.fileMapr = open(self.logfileMapr, 'r')
  81. self.watcherMapr = os.stat(self.logfileMapr)
  82. self.this_modifiedMapr = self.last_modifiedMapr = self.watcherMapr.st_mtime
  83. self.this_sizeMapr = self.last_sizeMapr = self.watcherMapr.st_size
  84. # Check status
  85. self.checkStatus()
  86. # Print output (every 3 seconds)
  87. if (datetime.now()-previous) > timedelta(seconds=3):
  88. if self.debug > 0:
  89. self.printOutput()
  90. previous = datetime.now()
  91. except Exception, e:
  92. print 'Error reading log file:', e
  93. # Wait a little bit for new lines
  94. time.sleep(0.5) # 500ms
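# Any parsing problem inside the loop above is caught and reported without
# stopping the monitor; the thread keeps polling every 500ms until kill()
# flips self.running.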
  95. def kill(self):
  96. self.running = False
  97. def reset(self):
  98. self.jobs = {}
  99. self.nodes = {}
  100. self.attemptsToClean = {}
  101. # Get task trackers
  102. def getActiveTaskTrackers(self):
  103. ret = []
  104. # Query Hadoop
  105. p = Popen([HADOOP_HOME+'/bin/hadoop', 'job', '-list-active-trackers'], stdout=PIPE, stderr=open('/dev/null', 'w'))
  106. p.wait()
  107. line = p.stdout.readline()
  108. while line:
  109. # Parse node
  110. trackerId = line.replace('\n', '')
  111. if trackerId.startswith('tracker_'):
  112. trackerId = trackerId.replace('tracker_', '')
  113. if trackerId.find(':') >= 0:
  114. trackerId = trackerId[:trackerId.find(':')]
  115. if trackerId not in ret:
  116. ret.append(trackerId)
  117. # Read next line
  118. line = p.stdout.readline()
  119. return sorted(ret)
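# Illustrative example (id format taken from the sample log lines further
# below): a tracker reported as
#   tracker_sol047:localhost/127.0.0.1:55786
# would be reduced to the plain host name 'sol047' before being returned.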
  120. def getJob(self, jobId):
  121. if jobId not in self.jobs:
  122. job = HadoopJob(jobId)
  123. self.jobs[jobId] = job
  124. return self.jobs[jobId]
  125. def getTask(self, taskId):
  126. jobId, taskId = self.parseTaskId(taskId)
  127. job = self.getJob(jobId)
  128. if taskId not in job.tasks:
  129. task = HadoopTask(taskId)
  130. job.tasks[taskId] = task
  131. return self.jobs[jobId].tasks[taskId]
  132. def getAttempt(self, attemptId):
  133. jobId, taskId, attemptId = self.parseAttemptId(attemptId)
  134. task = self.getTask(taskId)
  135. if attemptId not in task.attempts:
  136. attempt = HadoopAttempt(attemptId)
  137. task.attempts[attemptId] = attempt
  138. return self.jobs[jobId].tasks[taskId].attempts[attemptId]
  139. def getNode(self, nodeId):
  140. node = None
  141. if nodeId != None and nodeId not in self.nodes:
  142. self.nodes[nodeId] = HadoopTaskTracker(nodeId)
  143. if nodeId in self.nodes:
  144. node = self.nodes[nodeId]
  145. return node
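# getJob/getTask/getAttempt/getNode are all get-or-create accessors:
# parsing a log line about an object the monitor has not seen yet silently
# adds it to the jobs/nodes dictionaries, so state can be rebuilt from any
# point in the log (e.g. when replaying an old file with --parse).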
  146. # Parsing functions
  147. def parseOld(self):
  148. prev = None
  149. while True:
  150. line = self.fileMapr.readline()
  151. if not line:
  152. break
  153. date = self.parseLine(line)
  154. if date != None and prev != dateToSeconds(date):
  155. prev = dateToSeconds(date)
  156. print dateToSeconds(date), date, self.getMapReduceSummary()
  157. def parseLine(self, line):
  158. date = None
  159. try:
  160. date = datetime.strptime(line.split(",")[0], "%Y-%m-%d %H:%M:%S")
  161. #if self.startTime == None:
  162. #self.startTime = date
  163. line = line.replace("\n", "")
  164. lineSplit = line.split(" ")
  165. # Parse lines
  166. if lineSplit[3].startswith("org.apache.hadoop.mapred.JobTracker"):
  167. self.parseLineJobTracker(date, lineSplit[4:])
  168. elif lineSplit[3].startswith("org.apache.hadoop.mapred.JobInProgress"):
  169. self.parseLineJobInProgress(date, lineSplit[4:])
  170. elif lineSplit[3].startswith("org.apache.hadoop.mapred.JobHistory"):
  171. #self.parseLineJobHistory(date, lineSplit[4:])
  172. pass
  173. elif lineSplit[3].startswith("org.apache.hadoop.mapred.TaskInProgress"):
  174. self.parseLineTaskInProgress(date, lineSplit[4:])
  175. elif lineSplit[3].startswith("org.apache.hadoop.mapred.TaskStatus"):
  176. self.parseLineTaskStatus(date, lineSplit[4:])
  177. elif lineSplit[3].startswith("org.apache.hadoop.net.NetworkTopology"):
  178. self.parseLineNetworkTopology(date, lineSplit[4:])
  179. elif lineSplit[3].startswith("org.apache.hadoop.mapred.AuditLogger"):
  180. self.parseLineAuditLogger(date, lineSplit[4:])
  181. elif lineSplit[3].startswith("org.apache.hadoop.mapred.DisableNodesTaskScheduler"):
  182. self.parseLineDisableNode(date, lineSplit[4:])
  183. elif lineSplit[3].startswith("org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager:"):
  184. pass
  185. elif lineSplit[3].startswith("org.apache.hadoop.hdfs.DFSClient"):
  186. pass
  187. elif lineSplit[3].startswith("org.apache.hadoop.ipc.Server"):
  188. pass
  189. elif lineSplit[3].startswith("org.apache.hadoop.metrics2.impl.Metrics"):
  190. pass
  191. elif lineSplit[3].startswith("org.apache.hadoop.util.NativeCodeLoader:"):
  192. pass
  193. elif lineSplit[3].startswith("org.apache.hadoop.mapred.CompletedJobStatusStore:"):
  194. pass
  195. elif lineSplit[3].startswith("org.apache.hadoop.http.HttpServer:"):
  196. pass
  197. elif lineSplit[3].startswith("org.mortbay.log:"):
  198. pass
  199. elif lineSplit[3].startswith("org.apache.hadoop.ipc.Client:"):
  200. pass
  201. elif lineSplit[3].startswith("org.apache.hadoop.util.HostsFileReader"):
  202. #TODO
  203. pass
  204. else:
  205. #print date, ' '.join(lineSplit)
  206. pass
  207. except ValueError, e:
  208. #print "Error line: "+line
  209. #print e
  210. # TODO
  211. pass
  212. except Exception, e:
  213. #print "Error line: "+line
  214. print e
  215. return date
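# Expected line layout (inferred from the parsing above), e.g.:
#   2012-04-29 01:41:00,123 INFO org.apache.hadoop.mapred.JobTracker: Adding task ...
# split(",")[0] yields the timestamp, lineSplit[3] the logging class, and
# lineSplit[4:] the message that the per-class handlers below receive.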
  216. def parseLineJobInProgress(self, date, lineSplit):
  217. # job_201204290141_0118: nMaps=2 nReduces=1 max=-1
  218. # Initializing job_201204290141_0118
  219. # jobToken generated and stored with users keys in /opt/hadoop-goiri/mapred/system/job_201204290141_0118/jobToken
  220. # Input size for job job_201204290141_0118 = 66880960. Number of splits = 2
  221. # tip:task_201204290141_0118_m_000000 has split on node:/default-rack/sol054
  222. # tip:task_201204290141_0118_m_000000 has split on node:/default-rack/sol039
  223. # tip:task_201204290141_0118_m_000000 has split on node:/default-rack/sol048
  224. # tip:task_201204290141_0118_m_000001 has split on node:/default-rack/sol054
  225. # tip:task_201204290141_0118_m_000001 has split on node:/default-rack/sol039
  226. # tip:task_201204290141_0118_m_000001 has split on node:/default-rack/sol048
  227. # job_201204290141_0118 LOCALITY_WAIT_FACTOR=0.05882353
  228. # Job job_201204290141_0118 initialized successfully with 2 map tasks and 1 reduce tasks.
  229. # Task 'attempt_201204290141_0118_m_000003_0' has completed task_201204290141_0118_m_000003 successfully.
  230. # Choosing rack-local task task_201204290141_0118_m_000000
  231. # Choosing rack-local task task_201204290141_0118_m_000001
  232. # Task 'attempt_201204290141_0118_m_000001_0' has completed task_201204290141_0118_m_000001 successfully.
  233. # Task 'attempt_201204290141_0118_m_000000_0' has completed task_201204290141_0118_m_000000 successfully.
  234. # Task 'attempt_201204290141_0118_r_000000_0' has completed task_201204290141_0118_r_000000 successfully.
  235. # Task 'attempt_201204290141_0118_m_000002_0' has completed task_201204290141_0118_m_000002 successfully.
  236. # Job job_201204290141_0118 has completed successfully.
  237. # jobId=job_201204290141_0118,submitTime=1335682931160,launchTime=1335682931390,firstMapTaskLaunchTime=1335682942774,firstReduceTaskLaunchTime=1335682957818,firstJobSetupTaskLaunchTime=1335682933749,firstJobCleanupTaskLaunchTime=1335682972861,finishTime=1335682981886,numMaps=2,numSlotsPerMap=1,numReduces=1,numSlotsPerReduce=1,user=goiri,queue=default,status=SUCCEEDED,mapSlotSeconds=40,reduceSlotsSeconds=12,clusterMapCapacity=204,clusterReduceCapacity=102
  238. if len(lineSplit)==4 and lineSplit[1].startswith('nMaps=') and lineSplit[2].startswith('nReduces='):
  239. jobId = self.parseJobId(lineSplit[0])
  240. nmap = int(lineSplit[1].split('=')[1])
  241. nred = int(lineSplit[2].split('=')[1])
  242. # Update information
  243. job = self.getJob(jobId)
  244. job.nmap=nmap
  245. job.nred=nred
  246. job.state = HadoopState.SUBMIT
  247. elif len(lineSplit)==2 and lineSplit[0] == 'Initializing':
  248. jobId = self.parseJobId(lineSplit[1])
  249. # Update information
  250. job = self.getJob(jobId)
  251. job.state = HadoopState.INIT
  252. elif len(lineSplit)==9 and lineSplit[0] == 'jobToken':
  253. pass
  254. elif len(lineSplit)==12 and lineSplit[0] == 'Input' and lineSplit[1] == 'size':
  255. jobId = self.parseJobId(lineSplit[4])
  256. inputSize = int(lineSplit[6].replace('.',''))
  257. nsplits = int(lineSplit[11])
  258. # Update information
  259. job = self.getJob(jobId)
  260. job.inputSize = inputSize
  261. job.nsplits = nsplits
  262. job.state = HadoopState.INIT
  263. elif len(lineSplit)==5 and lineSplit[1] == 'has' and lineSplit[2] == 'split' and lineSplit[3] == 'on':
  264. jobId, taskId = self.parseTaskId(lineSplit[0].replace('tip:', ''))
  265. nodeId = self.parseNodeId(lineSplit[4])
  266. # Update information
  267. task = self.getTask(taskId)
  268. task.state = HadoopState.INIT
  269. task.addSplit(nodeId)
  270. node = self.getNode(nodeId)
  271. elif len(lineSplit)==2 and lineSplit[1].startswith('LOCALITY_WAIT_FACTOR'):
  272. jobId = self.parseJobId(lineSplit[0])
  273. # Update information
  274. job = self.getJob(jobId)
  275. elif len(lineSplit)==12 and lineSplit[0] == 'Job' and lineSplit[2] == 'initialized' and lineSplit[3] == 'successfully':
  276. jobId = self.parseJobId(lineSplit[1])
  277. nmap = int(lineSplit[5])
  278. nred = int(lineSplit[9])
  279. # Update information
  280. job = self.getJob(jobId)
  281. job.nmap = nmap
  282. job.nred = nred
  283. job.state = HadoopState.PREP
  284. elif len(lineSplit)==4 and lineSplit[0] == 'Choosing' and (lineSplit[1] == 'rack-local' or lineSplit[1] == 'data-local') and lineSplit[2] == 'task':
  285. jobId, taskId = self.parseTaskId(lineSplit[3])
  286. locality = lineSplit[1]
  287. # Update information
  288. task = self.getTask(taskId)
  289. if lineSplit[1] == 'data-local':
  290. task.dataLocal = True
  291. elif len(lineSplit)>=5 and lineSplit[0] == 'Choosing' and lineSplit[1] == 'a' and lineSplit[2] == 'non-local' and lineSplit[3] == 'task':
  292. jobId, taskId = self.parseTaskId(lineSplit[4])
  293. # Update information
  294. task = self.getTask(taskId)
  295. if len(lineSplit)>=7 and lineSplit[6] == 'speculation':
  296. task.speculation += 1
  297. elif len(lineSplit)==6 and lineSplit[2] == 'has' and lineSplit[3] == 'completed' and lineSplit[5] == 'successfully.':
  298. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[1])
  299. # Update information
  300. attempt = self.getAttempt(attemptId)
  301. attempt.state = HadoopState.SUCCEEDED
  302. node = self.getNode(attempt.nodeId)
  303. if node != None and attemptId in node.attempts:
  304. node.status = 'UP'
  305. node.attempts.remove(attemptId)
  306. elif len(lineSplit)==5 and lineSplit[2] == 'has' and lineSplit[3] == 'completed' and lineSplit[4] == 'successfully.':
  307. jobId = self.parseJobId(lineSplit[1])
  308. # Update information
  309. job = self.getJob(jobId)
  310. job.state = HadoopState.SUCCEEDED
  311. elif len(lineSplit)>=1 and lineSplit[0].find('jobId')>=0 and lineSplit[0].find('submitTime')>=0 and lineSplit[0].find('launchTime')>=0 and lineSplit[0].find('firstMapTaskLaunchTime')>=0:
  312. jobId = None
  313. for keyvalue in lineSplit[0].split(','):
  314. key, value = keyvalue.split('=')
  315. if jobId != None:
  316. job = self.getJob(jobId)
  317. if key == 'jobId':
  318. jobId = self.parseJobId(value)
  319. elif key == 'status':
  320. if value == 'SUCCEEDED':
  321. job.state = HadoopState.SUCCEEDED
  322. elif value == 'FAILED':
  323. job.state = HadoopState.FAILED
  324. elif value == 'KILLED':
  325. job.state = HadoopState.KILLED
  326. else:
  327. job.state = HadoopState.UNKNOWN_FINISHED
  328. elif key == 'numMaps':
  329. job.nmap = int(value)
  330. elif key == 'numReduces':
  331. job.nred = int(value)
  332. elif key == 'submitTime':
  333. job.attrs['submitTime'] = int(value)/1000.0
  334. elif key == 'launchTime':
  335. job.attrs['launchTime'] = int(value)/1000.0
  336. elif key == 'firstMapTaskLaunchTime':
  337. job.attrs['firstMapTaskLaunchTime'] = int(value)/1000.0
  338. elif key == 'firstReduceTaskLaunchTime':
  339. job.attrs['firstReduceTaskLaunchTime'] = int(value)/1000.0
  340. elif key == 'firstJobSetupTaskLaunchTime':
  341. job.attrs['firstJobSetupTaskLaunchTime'] = int(value)/1000.0
  342. elif key == 'firstJobCleanupTaskLaunchTime':
  343. job.attrs['firstJobCleanupTaskLaunchTime'] = int(value)/1000.0
  344. elif key == 'finishTime':
  345. job.attrs['finishTime'] = int(value)/1000.0
  346. elif key == 'user':
  347. job.attrs['user'] = value
  348. elif key == 'queue':
  349. job.attrs['queue'] = value
  350. elif key == 'numSlotsPerMap':
  351. job.attrs['slotpermap'] = int(value)
  352. elif key == 'numSlotsPerReduce':
  353. job.attrs['slotperred'] = int(value)
  354. elif key == 'mapSlotSeconds':
  355. job.attrs['mapseconds'] = int(value)
  356. elif key == 'reduceSlotsSeconds':
  357. job.attrs['redseconds'] = int(value)
  358. elif key == 'clusterMapCapacity':
  359. job.attrs['mapcap'] = int(value)
  360. elif key == 'clusterReduceCapacity':
  361. job.attrs['redcap'] = int(value)
  362. #else:
  363. #print '\t', key, '->', value
  364. elif len(lineSplit)==5 and lineSplit[0] == 'Choosing' and lineSplit[1] == 'a' and lineSplit[2] == 'failed':
  365. jobId, taskId = self.parseTaskId(lineSplit[4])
  366. elif lineSplit[0] == 'Failed' and lineSplit[1] == 'fetch' and lineSplit[2] == 'notification':
  367. # TODO
  368. pass
  369. elif len(lineSplit)==3 and lineSplit[0] == 'Aborting' and lineSplit[1] == 'job':
  370. jobId = self.parseJobId(lineSplit[2])
  371. # Update information
  372. job = self.getJob(jobId)
  373. job.state = HadoopState.FAILED
  374. elif len(lineSplit)==3 and lineSplit[0] == 'Killing' and lineSplit[1] == 'job':
  375. jobId = self.parseJobId(lineSplit[2])
  376. # Update information
  377. job = self.getJob(jobId)
  378. job.state = HadoopState.KILLED
  379. else:
  380. #print 'JobInProgress', date, ' '.join(lineSplit)
  381. pass
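# This handler tracks per-job progress: the 'has split on node' lines feed
# task.addSplit(), 'Choosing data-local ...' records data-locality, task and
# job completion lines set the SUCCEEDED/FAILED/KILLED states, and the final
# comma-separated jobId=...,status=... line fills in the timing attributes
# later used by printOutput().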
  382. def parseLineJobTracker(self, date, lineSplit):
  383. # Job job_201204290141_0118 added successfully for user 'goiri' to queue 'default'
  384. # Initializing job_201204290141_0118
  385. # Adding task (JOB_SETUP) 'attempt_201204290141_0118_m_000003_0' to tip task_201204290141_0118_m_000003, for tracker 'tracker_sol047:localhost/127.0.0.1:55786'
  386. # Adding task (MAP) 'attempt_201204290141_0118_m_000000_0' to tip task_201204290141_0118_m_000000, for tracker 'tracker_sol047:localhost/127.0.0.1:55786'
  387. # Adding task (MAP) 'attempt_201204290141_0118_m_000001_0' to tip task_201204290141_0118_m_000001, for tracker 'tracker_sol003:localhost/127.0.0.1:53891'
  388. # Adding task (REDUCE) 'attempt_201204290141_0118_r_000000_0' to tip task_201204290141_0118_r_000000, for tracker 'tracker_sol003:localhost/127.0.0.1:53891'
  389. # Adding task (JOB_CLEANUP) 'attempt_201204290141_0118_m_000002_0' to tip task_201204290141_0118_m_000002, for tracker 'tracker_sol003:localhost/127.0.0.1:53891'
  390. # Removing task 'attempt_201204290141_0118_m_000001_0'
  391. # Removing task 'attempt_201204290141_0118_m_000002_0'
  392. # Removing task 'attempt_201204290141_0118_r_000000_0'
  393. # Removing task 'attempt_201204290141_0118_m_000000_0'
  394. # Removing task 'attempt_201204290141_0118_m_000003_0'
  395. # User limit exceeded. Marking job: job_201204290141_0118 for retire.
  396. # Retired job with id: 'job_201204290141_0118' of user 'goiri'
  397. # Retired job removed from cache job_201204290141_0118
  398. if lineSplit[0] == 'STARTUP_MSG:' or lineSplit[0] == 'SHUTDOWN_MSG:':
  399. self.reset()
  400. elif len(lineSplit)==10 and lineSplit[0] == 'Job' and lineSplit[2] == 'added' and lineSplit[3] == 'successfully':
  401. jobId = self.parseJobId(lineSplit[1])
  402. user = lineSplit[6].replace("'", '')
  403. queue = lineSplit[9].replace("'", '')
  404. # Update information
  405. job = self.getJob(jobId)
  406. job.attrs['user'] = user
  407. job.attrs['queue'] = queue
  408. job.state = HadoopState.SUBMIT
  409. elif len(lineSplit)==2 and lineSplit[0] == 'Initializing':
  410. jobId = self.parseJobId(lineSplit[1])
  411. # Update information
  412. job = self.getJob(jobId)
  413. job.state = HadoopState.INIT
  414. elif len(lineSplit)==10 and lineSplit[0]=='Adding' and lineSplit[1]=='task':
  415. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[3])
  416. nodeId = self.parseNodeId(lineSplit[9])
  417. taskType = self.parseTaskType(lineSplit[2])
  418. # Update information
  419. job = self.getJob(jobId)
  420. job.state = HadoopState.RUNNING
  421. task = self.getTask(taskId)
  422. task.setType(taskType)
  423. attempt = self.getAttempt(attemptId)
  424. attempt.nodeId = nodeId
  425. attempt.state = HadoopState.RUNNING
  426. node = self.getNode(nodeId)
  427. if node != None and attemptId not in node.attempts:
  428. node.status = 'UP'
  429. node.attempts.append(attemptId)
  430. elif len(lineSplit)==3 and lineSplit[0]=='Removing' and lineSplit[1]=='task':
  431. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
  432. # Update information
  433. attempt = self.getAttempt(attemptId)
  434. node = self.getNode(attempt.nodeId)
  435. if node != None and attemptId in node.attempts:
  436. node.attempts.remove(attemptId)
  437. if HadoopState.isRunning(attempt.state):
  438. attempt.state = HadoopState.UNKNOWN_FINISHED
  439. elif len(lineSplit)==8 and lineSplit[0]=='User' and lineSplit[1]=='limit':
  440. jobId = self.parseJobId(lineSplit[5])
  441. # Update information
  442. job = self.getJob(jobId)
  443. elif len(lineSplit)==8 and lineSplit[0]=='Retired' and lineSplit[1]=='job' and lineSplit[2]=='with' and lineSplit[3]=='id:':
  444. jobId = self.parseJobId(lineSplit[4])
  445. # Update information
  446. job = self.getJob(jobId)
  447. if job.state != HadoopState.SUCCEEDED and job.state != HadoopState.FAILED:
  448. job.state = HadoopState.UNKNOWN_FINISHED
  449. elif len(lineSplit)==6 and lineSplit[0]=='Retired' and lineSplit[1]=='job' and lineSplit[2]=='removed' and lineSplit[3]=='from' and lineSplit[4]=='cache':
  450. jobId = self.parseJobId(lineSplit[5])
  451. # Update information
  452. job = self.getJob(jobId)
  453. if job.state != HadoopState.SUCCEEDED and job.state != HadoopState.FAILED:
  454. job.state = HadoopState.UNKNOWN_FINISHED
  455. elif len(lineSplit)==5 and lineSplit[1]=='is' and lineSplit[3]=='ms' and lineSplit[4]=='debug.':
  456. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[0])
  457. # Update information
  458. attempt = self.getAttempt(attemptId)
  459. elif len(lineSplit)==5 and lineSplit[0]=='Launching' and lineSplit[1]=='task' and lineSplit[3]=='timed':
  460. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
  461. # Update information
  462. attempt = self.getAttempt(attemptId)
  463. attempt.state = HadoopState.FAILED
  464. elif len(lineSplit)==6 and lineSplit[0]=='Adding' and lineSplit[1]=='tracker':
  465. nodeId = self.parseNodeId(lineSplit[2])
  466. #nodeId = self.parseNodeId(lineSplit[5])
  467. # Update information
  468. node = self.getNode(nodeId)
  469. node.status = 'UP'
  470. elif len(lineSplit)==3 and lineSplit[0]=='Lost' and lineSplit[1]=='tracker':
  471. nodeId = self.parseNodeId(lineSplit[2])
  472. # Update information
  473. node = self.getNode(nodeId)
  474. node.status = 'DOWN'
  475. elif len(lineSplit)==6 and lineSplit[0]=='Status' and lineSplit[1]=='from' and lineSplit[2]=='unknown' and lineSplit[3]=='Tracker':
  476. nodeId = self.parseNodeId(lineSplit[5])
  477. # Update information
  478. node = self.getNode(nodeId)
  479. elif len(lineSplit)==3 and lineSplit[0]=='Refreshing' and lineSplit[1]=='host' and lineSplit[2]=='information':
  480. pass
  481. elif len(lineSplit)==3 and lineSplit[0]=='Decommisioning' and lineSplit[2]=='nodes':
  482. numnodes = int(lineSplit[1])
  483. pass
  484. elif len(lineSplit)==2 and lineSplit[0]=='Starting' and lineSplit[1]=='RUNNING':
  485. pass
  486. elif len(lineSplit)==10 and lineSplit[0]=='Serious' and lineSplit[1]=='problem.':
  487. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[9])
  488. # Update information
  489. attempt = self.getAttempt(attemptId)
  490. attempt.state = HadoopState.FAILED
  491. elif lineSplit[0]=='Cleaning' or (lineSplit[0]=='problem' and lineSplit[1]=='cleaning'):
  492. pass
  493. elif len(lineSplit)==4 and lineSplit[0]=='JobTracker' and lineSplit[1]=='up':
  494. pass
  495. else:
  496. #print 'JobTracker', date, ' '.join(lineSplit)
  497. pass
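# State transitions driven by this handler: a job moves SUBMIT -> INIT ->
# RUNNING as it is added, initialized, and gets tasks; 'Adding task' attaches
# an attempt to its tracker node and 'Removing task' detaches it, while
# 'Adding tracker' / 'Lost tracker' flip a node between UP and DOWN.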
  498. def parseLineTaskInProgress(self, date, lineSplit):
  499. # Error from attempt_201205092227_0248_m_000001_0: Lost task tracker: tracker_sol026:localhost/127.0.0.1:43321
  500. if len(lineSplit)==7 and lineSplit[0] == 'Error' and lineSplit[1] == 'from' and lineSplit[3] == 'Lost':
  501. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
  502. nodeId = self.parseNodeId(lineSplit[6])
  503. # Update information
  504. attempt = self.getAttempt(attemptId)
  505. attempt.state = HadoopState.FAILED
  506. node = self.getNode(nodeId)
  507. node.status = 'DOWN'
  508. elif len(lineSplit)==6 and lineSplit[0] == 'Error' and lineSplit[1] == 'from' and lineSplit[3] == 'Error':
  509. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
  510. # Update information
  511. attempt = self.getAttempt(attemptId)
  512. attempt.state = HadoopState.FAILED
  513. elif len(lineSplit)>=2 and lineSplit[0] == 'Error' and lineSplit[1] == 'from':
  514. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
  515. # Update information
  516. attempt = self.getAttempt(attemptId)
  517. attempt.state = HadoopState.FAILED
  518. elif len(lineSplit)==6 and lineSplit[0] == 'TaskInProgress' and lineSplit[2] == 'has' and lineSplit[3] == 'failed':
  519. jobId, taskId = self.parseTaskId(lineSplit[1])
  520. # Update information
  521. task = self.getTask(taskId)
  522. task.state = HadoopState.FAILED
  523. else:
  524. print 'TaskInProgress', date, ' '.join(lineSplit)
  525. def parseLineTaskStatus(self, date, lineSplit):
  526. if len(lineSplit)==13 and lineSplit[0] == 'Trying' and lineSplit[1] == 'to' and lineSplit[2] == 'set' and lineSplit[3] == 'illegal':
  527. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[8])
  528. # Update information
  529. attempt = self.getAttempt(attemptId)
  530. elif len(lineSplit)==18 and lineSplit[0] == 'Trying' and lineSplit[1] == 'to' and lineSplit[2] == 'set' and lineSplit[3] == 'finish':
  531. jobId, taskId, attemptId = self.parseAttemptId(lineSplit[7])
  532. # Update information
  533. attempt = self.getAttempt(attemptId)
  534. else:
  535. print 'TaskStatus', date, ' '.join(lineSplit)
  536. def parseLineNetworkTopology(self, date, lineSplit):
  537. if len(lineSplit)==5 and lineSplit[0] == 'Adding' and lineSplit[1] == 'a' and lineSplit[2] == 'new':
  538. nodeId = self.parseNodeId(lineSplit[4])
  539. else:
  540. print 'NetworkTopology', date, ' '.join(lineSplit)
  541. def parseLineJobHistory(self, date, lineSplit):
  542. # Moving file:/auto/home/goiri/hadoop-1.0.2/logs/history/job_201204290141_0118_1335682931160_goiri_workGen to file:/auto/home/goiri/hadoop-1.0.2/logs/history/done/version-1/sol000_1335678069919_/2012/04/29/000000
  543. # Moving file:/auto/home/goiri/hadoop-1.0.2/logs/history/job_201204290141_0118_conf.xml to file:/auto/home/goiri/hadoop-1.0.2/logs/history/done/version-1/sol000_1335678069919_/2012/04/29/000000
  544. # Deleting localized job conf at /auto/home/goiri/hadoop-1.0.2/libexec/../logs/job_201204290141_0118_conf.xml
  545. #print '*** JobHistory', date, lineSplit
  546. pass
  547. def parseLineDisableNode(self, date, lineSplit):
  548. #print date, "Disable node:", lineSplit
  549. pass
  550. def parseLineAuditLogger(self, date, lineSplit):
  551. # USER=goiri IP=172.16.28.128 OPERATION=SUBMIT_JOB TARGET=job_201204290141_0118 RESULT=SUCCESS
  552. #print '*** AuditLogger', date, lineSplit
  553. pass
  554. # Id parsing functions
  555. def parseJobId(self, intext):
  556. intext = intext.replace("'", '')
  557. intext = intext.replace(':', '')
  558. jobId = intext
  559. return jobId
  560. def parseTaskId(self, intext):
  561. intext = intext.replace("'", "")
  562. intext = intext.replace(":", "")
  563. split = intext.split('_')
  564. jobId = "_".join(['job'] + split[1:3])
  565. taskId = "_".join(['task'] + split[1:5])
  566. return (jobId, taskId)
  567. def parseAttemptId(self, intext):
  568. intext = intext.replace("'", "")
  569. intext = intext.replace(":", "")
  570. split = intext.split('_')
  571. jobId = "_".join(['job'] + split[1:3])
  572. taskId = "_".join(['task'] + split[1:5])
  573. attemptId = "_".join(['attempt'] + split[1:6])
  574. return (jobId, taskId, attemptId)
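# Worked example (id format taken from the sample log lines above):
#   parseAttemptId("'attempt_201204290141_0118_m_000000_0'") ->
#     ('job_201204290141_0118',
#      'task_201204290141_0118_m_000000',
#      'attempt_201204290141_0118_m_000000_0')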
  575. def parseNodeId(self, intext):
  576. nodeId = intext.replace("'", '')
  577. if nodeId.startswith('node:/default-rack/'):
  578. nodeId = nodeId.replace('node:/default-rack/', '')
  579. if nodeId.startswith('node:/rack-default/'):
  580. nodeId = nodeId.replace('node:/rack-default/', '')
  581. if nodeId.startswith('node:/rack-covering/'):
  582. nodeId = nodeId.replace('node:/rack-covering/', '')
  583. if nodeId.startswith('node:/covering-rack/'):
  584. nodeId = nodeId.replace('node:/covering-rack/', '')
  585. if nodeId.startswith('tracker_'):
  586. nodeId = nodeId.replace('tracker_', '')
  587. if nodeId.find(':') >= 0:
  588. nodeId = nodeId.split(':')[0]
  589. return nodeId
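# Examples drawn from the sample log lines above:
#   'node:/default-rack/sol054'                -> 'sol054'
#   'tracker_sol026:localhost/127.0.0.1:43321' -> 'sol026'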
  590. def parseTaskType(self, intext):
  591. taskType = intext.replace('(', '').replace(')', '')
  592. return taskType
  593. def checkStatus(self):
  594. # Check from job perspective
  595. '''
  596. for job in self.jobs:
  597. for task in job.tasks:
  598. for attempt in task.attempts:
  599. if not HadoopState.isRunning(job.state):
  600. # Check if the node is running the task
  601. nodeId = attempt.nodeId
  602. node = self.getNode(nodeId)
  603. if attempt.attemptId in node.attempts:
  604. node.attempts.remove(attempt.attemptId)
  605. '''
  606. # Check from node perspective
  607. for nodeId in self.nodes:
  608. node = self.nodes[nodeId]
  609. # Remove attempts from finished nodes
  610. for attemptId in list(node.attempts):
  611. jobId, taskId, attemptId = self.parseAttemptId(attemptId)
  612. job = self.getJob(jobId)
  613. # If the job is already done, nothing is running anymore
  614. if not job.isRunning():
  615. if attemptId not in self.attemptsToClean:
  616. self.attemptsToClean[attemptId] = 0
  617. self.attemptsToClean[attemptId] += 1
  618. if self.attemptsToClean[attemptId] >= 5:
  619. del self.attemptsToClean[attemptId]
  620. node.attempts.remove(attemptId)
  621. # Remove attempts from down nodes
  622. if node.status == 'DOWN':
  623. node.attempts = []
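# Cleanup policy: an attempt that still appears on a node after its job has
# finished is only dropped once checkStatus() has counted it five times in
# attemptsToClean, presumably to give late 'Removing task' log lines a chance
# to arrive first; attempts on DOWN nodes are cleared immediately.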
  624. def printOutput(self, jobs=True, nodes=True):
  625. if jobs:
  626. runJobs = 0
  627. print 'Jobs (%d)' % (len(self.jobs))
  628. print '================='
  629. for jobId in sorted(self.jobs.keys()):
  630. job = self.jobs[jobId]
  631. if job.nmap == None or job.nred == None:
  632. print "%s %s" % (jobId, HadoopState.toString(job.state))
  633. elif 'submitTime' not in job.attrs or 'launchTime' not in job.attrs or 'finishTime' not in job.attrs or 'firstMapTaskLaunchTime' not in job.attrs:
  634. print "%s %s map=%d red=%d" % (jobId, HadoopState.toString(job.state).ljust(10), job.nmap, job.nred)
  635. else:
  636. #print "%s %s %.1fs %.1fs" % (jobId, HadoopState.toString(job.state), job.launchTime-job.submitTime, job.finishTime-job.launchTime)
  637. print "%s %s map=%d red=%d %.1fs %.1fs" % (jobId, HadoopState.toString(job.state).ljust(10), job.nmap, job.nred, job.attrs['firstMapTaskLaunchTime']-job.attrs['submitTime'], job.attrs['finishTime']-job.attrs['launchTime'])
  638. if job.isRunning():
  639. runJobs += 1
  640. for task in sorted(sorted(job.tasks.values(), key=attrgetter('taskId')), key=attrgetter('type')):
  641. taskId = task.taskId
  642. task = job.tasks[taskId]
  643. if len(task.attempts) == 0:
  644. print '\t', taskId, task.getType().ljust(14), ','.join(task.splits).ljust(30), HadoopState.toString(task.state)
  645. else:
  646. for attemptId in sorted(task.attempts.keys()):
  647. attempt = task.attempts[attemptId]
  648. local = ' '
  649. if attempt.nodeId != None and task.splits != None and attempt.nodeId in task.splits:
  650. local = '*'
  651. print '\t', taskId, task.getType().ljust(14), ','.join(task.splits).ljust(30), attemptId, attempt.nodeId, local, HadoopState.toString(attempt.state)
  652. print "%d/%d" % (runJobs, len(self.jobs))
  653. if nodes:
  654. # Count number of attempts
  655. numAttempts = {}
  656. for job in self.jobs.values():
  657. if HadoopState.isRunning(job.state):
  658. # Add those servers running tasks
  659. for task in job.tasks.values():
  660. for attempt in task.attempts.values():
  661. # Check in which node it was running
  662. if attempt.nodeId != None:
  663. if attempt.nodeId not in numAttempts:
  664. numAttempts[attempt.nodeId] = 1
  665. else:
  666. numAttempts[attempt.nodeId] += 1
  667. print 'Nodes (%d)' % (len(self.nodes))
  668. print '================='
  669. out = ''
  670. for node in sorted(sorted(self.nodes.values(), key=attrgetter('nodeId')), key=attrgetter('status'), reverse=True):
  671. if node.nodeId not in numAttempts:
  672. numAttempts[node.nodeId] = 0
  673. #node = self.getNode(nodeId)
  674. print node.nodeId, node.status.ljust(4), ("%d/%d" % (len(node.attempts), numAttempts[node.nodeId])).ljust(5), ' '.join(node.attempts)
  675. if node.status == 'UP':
  676. out += bcolors.GREENBG+' '+bcolors.ENDC
  677. elif node.status == 'DOWN':
  678. out += bcolors.REDBG+' '+bcolors.ENDC
  679. else:
  680. out += ' '
  681. print out
  682. def printNode(self, nodeId):
  683. print nodeId
  684. for job in self.jobs.values():
  685. if HadoopState.isRunning(job.state):
  686. # Add those servers running tasks
  687. for task in job.tasks.values():
  688. for attempt in task.attempts.values():
  689. # Check in which node it was running
  690. if attempt.nodeId != None:
  691. if attempt.nodeId == nodeId:
  692. local = ' '
  693. if attempt.nodeId != None and task.splits != None and attempt.nodeId in task.splits:
  694. local = '*'
  695. print '\t', task.taskId, task.getType().ljust(14), ','.join(task.splits).ljust(30), attempt.attemptId, local, HadoopState.toString(attempt.state)
  696. def printNodeManager(self):
  697. # Node management
  698. # ===============
  699. # Get nodes information
  700. offNodes = []
  701. onNodes = []
  702. for node in self.nodes.values():
  703. if node.status == 'DOWN':
  704. offNodes.append(node.nodeId)
  705. elif node.status == 'UP':
  706. onNodes.append(node.nodeId)
  707. offNodes = sorted(offNodes)
  708. onNodes = sorted(onNodes)
  709. # Active nodes: nodeId -> tasks running
  710. activeNodes = {}
  711. activeTasks = 0
  712. # Get nodes running tasks
  713. for node in self.nodes.values():
  714. if node.status == 'UP' and len(node.attempts)>0:
  715. if node.nodeId not in activeNodes:
  716. #activeNodes[node.nodeId] = len(node.attempts)
  717. activeNodes[node.nodeId] = 0
  718. # Get nodes that were running an active job
  719. numRunMap = 0
  720. numRunRed = 0
  721. for job in self.jobs.values():
  722. if HadoopState.isRunning(job.state):
  723. # Check if not all the tasks have been initialized
  724. jobRunMap = 0
  725. jobRunRed = 0
  726. jobQueueMap = job.nmap
  727. jobQueueRed = job.nred
  728. if jobQueueMap == None:
  729. jobQueueMap = 0
  730. if jobQueueRed == None:
  731. jobQueueRed = 0
  732. # Add those servers running tasks
  733. for task in job.tasks.values():
  734. # Only if the task is useful
  735. if task.getType() != 'JOB_SETUP':
  736. for attempt in task.attempts.values():
  737. # Check in which node it was running
  738. if attempt.nodeId != None and attempt.state!=HadoopState.FAILED:
  739. if attempt.nodeId not in activeNodes:
  740. activeNodes[attempt.nodeId] = 1
  741. activeTasks += 1
  742. else:
  743. activeNodes[attempt.nodeId] += 1
  744. activeTasks += 1
  745. # Check if the task is running
  746. if HadoopState.isRunning(attempt.state):
  747. if attempt.attemptId.find('_m_') >= 0:
  748. jobRunMap += 1
  749. elif attempt.attemptId.find('_r_') >= 0:
  750. jobRunRed += 1
  751. else:
  752. if attempt.attemptId.find('_m_') >= 0:
  753. jobQueueMap -= 1
  754. elif attempt.attemptId.find('_r_') >= 0:
  755. jobQueueRed -= 1
  756. # Task maximum
  757. if jobRunMap < jobQueueMap:
  758. jobRunMap = jobQueueMap
  759. if jobRunRed < jobQueueRed:
  760. jobRunRed = jobQueueRed
  761. numRunMap += jobRunMap
  762. numRunRed += jobRunRed
  763. # Get inactive nodes
  764. inactiveNodes = list(onNodes)
  765. for nodeId in activeNodes:
  766. if nodeId in inactiveNodes:
  767. inactiveNodes.remove(nodeId)
  768. nodesToOff = []
  769. for nodeId in sorted(onNodes, reverse=True):
  770. # Account for the node's inactivity
  771. inactivity = 0
  772. #if nodeId in inactivityAccount:
  773. # inactivity = inactivityAccount[nodeId]
  774. # Account for the number of tasks running
  775. if nodeId in activeNodes:
  776. inactivity -= activeNodes[nodeId]
  777. # Save value to sort
  778. nodesToOff.append((nodeId, inactivity))
  779. if len(nodesToOff)>0:
  780. nodesToOff = sorted(nodesToOff, key=itemgetter(1), reverse=True)
  781. nodesToOff = [nodeId for nodeId, inactivity in nodesToOff]
  782. if len(onNodes)>0:
  783. print 'On nodes:', ",".join(onNodes)
  784. if len(offNodes)>0:
  785. print 'Off nodes:', ",".join(offNodes)
  786. if len(activeNodes)>0:
  787. print 'Active nodes:'
  788. for nodeId,v in sorted(activeNodes.iteritems(), key=itemgetter(1)):
  789. print '\t',nodeId, v
  790. if len(inactiveNodes)>0:
  791. print 'Inactive nodes:', ",".join(inactiveNodes)
  792. if len(nodesToOff)>0:
  793. print 'Nodes to off:', ",".join(nodesToOff)
  794. def getMapReduceSummary(self):
  795. out = ''
  796. # Jobs
  797. jobsRunning = 0
  798. jobsInit = 0
  799. jobsPrep = 0
  800. jobsSucceeded = 0
  801. jobsFailed = 0
  802. for job in self.jobs.values():
  803. if job.state == HadoopState.INIT:
  804. jobsInit += 1
  805. elif job.state == HadoopState.PREP:
  806. running = False
  807. for task in job.tasks.values():
  808. for attempt in task.attempts.values():
  809. if attempt.state == HadoopState.RUNNING:
  810. running = True
  811. if running:
  812. jobsRunning += 1
  813. else:
  814. jobsPrep += 1
  815. elif job.state == HadoopState.RUNNING:
  816. jobsRunning += 1
  817. elif job.state == HadoopState.SUCCEEDED:
  818. jobsSucceeded += 1
  819. elif job.state == HadoopState.FAILED:
  820. jobsFailed += 1
  821. out += 'jobs '
  822. out += str(len(self.jobs))+' '
  823. out += str(jobsInit)+' '
  824. out += str(jobsPrep)+' '
  825. out += str(jobsRunning)+' '
  826. out += str(jobsSucceeded)+' '
  827. out += str(jobsFailed)+' '
  828. # Nodes
  829. nodesTotal = 0
  830. nodesRunning = 0
  831. nodesAttempts = 0
  832. for node in self.nodes.values():
  833. nodesTotal += 1
  834. if len(node.attempts) > 0:
  835. nodesRunning += 1
  836. for attemptId in node.attempts:
  837. nodesAttempts += 1
  838. out += 'nodes '
  839. out += str(nodesTotal)+' '
  840. out += str(nodesRunning)+' '
  841. out += str(nodesAttempts)+' '
  842. return out
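# Summary format produced above (space separated):
#   jobs <total> <init> <prep> <running> <succeeded> <failed> nodes <total> <with attempts> <attempts>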
  843. if __name__=='__main__':
  844. parser = OptionParser()
  845. parser.add_option("-p", "--parse", dest="parseFile", help="specify file to parse", type="string", default=None)
  846. #parser.add_option("-p", "--parse", action="store_true", dest="parse", help="specify if the scheduler supports brown pricing")
  847. (options, args) = parser.parse_args()
  848. if options.parseFile != None:
  849. # Parse a given file
  850. # HADOOP_HOME+"/logs/hadoop-"+USER+"-jobtracker-"+MASTER_NODE+".log.2012-04-29"
  851. monitor = MonitorMapred(parseOld=True, logfile=options.parseFile)
  852. else:
  853. monitor = MonitorMapred()
  854. monitor.start()
  855. signal.signal(signal.SIGINT, signal_handler)
  856. # Console input
  857. while True:
  858. cmd = raw_input("$ ")
  859. if cmd.lower() == "debug":
  860. if monitor.debug == 0:
  861. monitor.debug = 1
  862. else:
  863. monitor.debug = 0
  864. elif cmd.lower() == "jobs":
  865. monitor.printOutput(jobs=True, nodes=False)
  866. elif cmd.lower() == "nodes":
  867. monitor.printOutput(jobs=False, nodes=True)
  868. elif cmd.lower().startswith('node'):
  869. cmdSplit = cmd.split(' ')
  870. if len(cmdSplit) >= 2:
  871. monitor.printNode(cmdSplit[1])
  872. elif cmd.lower().startswith('checkoff'):
  873. monitor.printNodeManager()
  874. elif cmd.lower() == "show":
  875. monitor.printOutput()
  876. elif cmd.lower() == "summary":
  877. print monitor.getMapReduceSummary()
  878. elif cmd.lower() == "exit":
  879. monitor.kill()
  880. break
  881. elif cmd.lower() == "help" or cmd.lower() == "h" or cmd.lower() == "?":
  882. print "debug"
  883. print "jobs"
  884. print "nodes"
  885. print "exit"
  886. #while True:
  887. #time.sleep(10.0)
  888. monitor.join()
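# Typical invocations (a sketch; the live log path comes from ghadoopcommons
# via HADOOP_HOME/USER/MASTER_NODE):
#   ./ghadoopmonitor.py               # tail the running JobTracker log and open the console
#   ./ghadoopmonitor.py -p <old-log>  # replay an old JobTracker log and print per-timestamp summaries
# Console commands handled above: debug, jobs, nodes, node <id>, checkoff,
# show, summary, help, exit.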