/ghadoopmonitor.py
#!/usr/bin/env python2.7
"""
GreenHadoop makes Hadoop aware of solar energy availability.
http://www.research.rutgers.edu/~goiri/

Copyright (C) 2012 Inigo Goiri, Rutgers University

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
"""
import threading
import os
import time
import signal
from subprocess import PIPE, Popen
from optparse import OptionParser
from operator import itemgetter, attrgetter
from datetime import datetime, timedelta

from ghadoopcommons import *
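# ghadoopcommons (imported with *) is assumed to provide everything this
# script uses but does not define: the HADOOP_HOME, USER, MASTER_NODE and
# CONF_SLAVES settings, the HadoopJob/HadoopTask/HadoopAttempt/
# HadoopTaskTracker and HadoopState classes, and the readHostFile,
# dateToSeconds, bcolors and signal_handler helpers.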


class MonitorMapred(threading.Thread):
    def __init__(self, parseOld=False, logfile=None):
        threading.Thread.__init__(self)
        self.running = True

        # Store info
        self.jobs = {}
        self.nodes = {}

        self.debug = 0

        # Counter for the attempts not in the system anymore
        self.attemptsToClean = {}
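        # checkStatus() bumps these counters and only removes an attempt from
        # its node after five consecutive checks, so a transient inconsistency
        # in the log does not drop a still-running attempt.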

        # JobTracker log
        self.logfileMapr = HADOOP_HOME + "/logs/hadoop-" + USER + "-jobtracker-" + MASTER_NODE + ".log"
        if logfile is not None:
            self.logfileMapr = logfile

        # http://forums.devshed.com/python-programming-11/how-to-monitor-a-file-for-changes-85767.html
        self.fileMapr = open(self.logfileMapr, 'r')
        self.watcherMapr = os.stat(self.logfileMapr)
        self.this_modifiedMapr = self.last_modifiedMapr = self.watcherMapr.st_mtime
        self.this_sizeMapr = self.last_sizeMapr = self.watcherMapr.st_size
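        # Tracking both mtime and size lets run() detect appended lines
        # (mtime grows) and log rotation (size shrinks); on rotation the
        # file is reopened from the beginning.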
        # Read previous state of the system
        if parseOld:
            self.parseOld()
        # Go to the end of the file
        self.fileMapr.seek(0, 2)

    def run(self):
        previous = datetime.now()

        # Get nodes from files (masters and slaves)
        #for nodeId in readHostFile(CONF_MASTERS):
            #node = self.getNode(nodeId)
        for nodeId in readHostFile(CONF_SLAVES):
            node = self.getNode(nodeId)
        # Get nodes currently running in the system
        for nodeId in self.getActiveTaskTrackers():
            node = self.getNode(nodeId)
            node.status = 'UP'

        # Monitor loop
        while self.running:
            try:
                # Update from log: JobTracker
                if self.this_modifiedMapr > self.last_modifiedMapr:
                    self.last_modifiedMapr = self.this_modifiedMapr
                    self.last_sizeMapr = self.this_sizeMapr
                    # File was modified, so read new lines, look for error keywords
                    while True:
                        line = self.fileMapr.readline()
                        if not line:
                            break
                        date = self.parseLine(line)
                self.watcherMapr = os.stat(self.logfileMapr)
                self.this_modifiedMapr = self.watcherMapr.st_mtime
                self.this_sizeMapr = self.watcherMapr.st_size

                # Check if the file changed
                if self.this_sizeMapr < self.last_sizeMapr:
                    print 'Size is shorter. New file!'
                    self.fileMapr = open(self.logfileMapr, 'r')
                    self.watcherMapr = os.stat(self.logfileMapr)
                    self.this_modifiedMapr = self.last_modifiedMapr = self.watcherMapr.st_mtime
                    self.this_sizeMapr = self.last_sizeMapr = self.watcherMapr.st_size

                # Check status
                self.checkStatus()

                # Print output (every 3 seconds)
                if (datetime.now() - previous) > timedelta(seconds=3):
                    if self.debug > 0:
                        self.printOutput()
                    previous = datetime.now()
            except Exception, e:
                print 'Error reading log file:', e
            # Wait a little bit for new lines
            time.sleep(0.5)  # 500 ms

    def kill(self):
        self.running = False

    def reset(self):
        self.jobs = {}
        self.nodes = {}
        self.attemptsToClean = {}

    # Get task trackers
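    # "hadoop job -list-active-trackers" prints one tracker per line, e.g.
    # "tracker_sol047:localhost/127.0.0.1:55786" (see the samples further
    # down); the "tracker_" prefix and the ":port" suffix are stripped to
    # leave the bare host name ("sol047").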
    def getActiveTaskTrackers(self):
        ret = []
        # Query Hadoop
        p = Popen([HADOOP_HOME + '/bin/hadoop', 'job', '-list-active-trackers'], stdout=PIPE, stderr=open('/dev/null', 'w'))
        p.wait()

        line = p.stdout.readline()
        while line:
            # Parse node
            trackerId = line.replace('\n', '')
            if trackerId.startswith('tracker_'):
                trackerId = trackerId.replace('tracker_', '')
                if trackerId.find(':') >= 0:
                    trackerId = trackerId[:trackerId.find(':')]
                if trackerId not in ret:
                    ret.append(trackerId)
            # Read next line
            line = p.stdout.readline()
        return sorted(ret)

    def getJob(self, jobId):
        if jobId not in self.jobs:
            job = HadoopJob(jobId)
            self.jobs[jobId] = job
        return self.jobs[jobId]

    def getTask(self, taskId):
        jobId, taskId = self.parseTaskId(taskId)
        job = self.getJob(jobId)
        if taskId not in job.tasks:
            task = HadoopTask(taskId)
            job.tasks[taskId] = task
        return self.jobs[jobId].tasks[taskId]

    def getAttempt(self, attemptId):
        jobId, taskId, attemptId = self.parseAttemptId(attemptId)
        task = self.getTask(taskId)
        if attemptId not in task.attempts:
            attempt = HadoopAttempt(attemptId)
            task.attempts[attemptId] = attempt
        return self.jobs[jobId].tasks[taskId].attempts[attemptId]

    def getNode(self, nodeId):
        node = None
        if nodeId is not None and nodeId not in self.nodes:
            self.nodes[nodeId] = HadoopTaskTracker(nodeId)
        if nodeId in self.nodes:
            node = self.nodes[nodeId]
        return node

    # Parsing functions
    def parseOld(self):
        prev = None
        while True:
            line = self.fileMapr.readline()
            if not line:
                break
            date = self.parseLine(line)
            if date is not None and prev != dateToSeconds(date):
                prev = dateToSeconds(date)
                print dateToSeconds(date), date, self.getMapReduceSummary()

    def parseLine(self, line):
        date = None
        try:
            date = datetime.strptime(line.split(",")[0], "%Y-%m-%d %H:%M:%S")
            #if self.startTime == None:
                #self.startTime = date

            line = line.replace("\n", "")
            lineSplit = line.split(" ")

            # Parse lines
            if lineSplit[3].startswith("org.apache.hadoop.mapred.JobTracker"):
                self.parseLineJobTracker(date, lineSplit[4:])
            elif lineSplit[3].startswith("org.apache.hadoop.mapred.JobInProgress"):
                self.parseLineJobInProgress(date, lineSplit[4:])
            elif lineSplit[3].startswith("org.apache.hadoop.mapred.JobHistory"):
                #self.parseLineJobHistory(date, lineSplit[4:])
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.mapred.TaskInProgress"):
                self.parseLineTaskInProgress(date, lineSplit[4:])
            elif lineSplit[3].startswith("org.apache.hadoop.mapred.TaskStatus"):
                self.parseLineTaskStatus(date, lineSplit[4:])
            elif lineSplit[3].startswith("org.apache.hadoop.net.NetworkTopology"):
                self.parseLineNetworkTopology(date, lineSplit[4:])
            elif lineSplit[3].startswith("org.apache.hadoop.mapred.AuditLogger"):
                self.parseLineAuditLogger(date, lineSplit[4:])
            elif lineSplit[3].startswith("org.apache.hadoop.mapred.DisableNodesTaskScheduler"):
                self.parseLineDisableNode(date, lineSplit[4:])
            elif lineSplit[3].startswith("org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager:"):
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.hdfs.DFSClient"):
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.ipc.Server"):
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.metrics2.impl.Metrics"):
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.util.NativeCodeLoader:"):
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.mapred.CompletedJobStatusStore:"):
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.http.HttpServer:"):
                pass
            elif lineSplit[3].startswith("org.mortbay.log:"):
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.ipc.Client:"):
                pass
            elif lineSplit[3].startswith("org.apache.hadoop.util.HostsFileReader"):
                # TODO
                pass
            else:
                #print date, ' '.join(lineSplit)
                pass
        except ValueError, e:
            #print "Error line: " + line
            #print e
            # TODO
            pass
        except Exception, e:
            #print "Error line: " + line
            print e
        return date

    def parseLineJobInProgress(self, date, lineSplit):
        # job_201204290141_0118: nMaps=2 nReduces=1 max=-1
        # Initializing job_201204290141_0118
        # jobToken generated and stored with users keys in /opt/hadoop-goiri/mapred/system/job_201204290141_0118/jobToken
        # Input size for job job_201204290141_0118 = 66880960. Number of splits = 2
        # tip:task_201204290141_0118_m_000000 has split on node:/default-rack/sol054
        # tip:task_201204290141_0118_m_000000 has split on node:/default-rack/sol039
        # tip:task_201204290141_0118_m_000000 has split on node:/default-rack/sol048
        # tip:task_201204290141_0118_m_000001 has split on node:/default-rack/sol054
        # tip:task_201204290141_0118_m_000001 has split on node:/default-rack/sol039
        # tip:task_201204290141_0118_m_000001 has split on node:/default-rack/sol048
        # job_201204290141_0118 LOCALITY_WAIT_FACTOR=0.05882353
        # Job job_201204290141_0118 initialized successfully with 2 map tasks and 1 reduce tasks.
        # Task 'attempt_201204290141_0118_m_000003_0' has completed task_201204290141_0118_m_000003 successfully.
        # Choosing rack-local task task_201204290141_0118_m_000000
        # Choosing rack-local task task_201204290141_0118_m_000001
        # Task 'attempt_201204290141_0118_m_000001_0' has completed task_201204290141_0118_m_000001 successfully.
        # Task 'attempt_201204290141_0118_m_000000_0' has completed task_201204290141_0118_m_000000 successfully.
        # Task 'attempt_201204290141_0118_r_000000_0' has completed task_201204290141_0118_r_000000 successfully.
        # Task 'attempt_201204290141_0118_m_000002_0' has completed task_201204290141_0118_m_000002 successfully.
        # Job job_201204290141_0118 has completed successfully.
        # jobId=job_201204290141_0118,submitTime=1335682931160,launchTime=1335682931390,firstMapTaskLaunchTime=1335682942774,firstReduceTaskLaunchTime=1335682957818,firstJobSetupTaskLaunchTime=1335682933749,firstJobCleanupTaskLaunchTime=1335682972861,finishTime=1335682981886,numMaps=2,numSlotsPerMap=1,numReduces=1,numSlotsPerReduce=1,user=goiri,queue=default,status=SUCCEEDED,mapSlotSeconds=40,reduceSlotsSeconds=12,clusterMapCapacity=204,clusterReduceCapacity=102

        if len(lineSplit) == 4 and lineSplit[1].startswith('nMaps=') and lineSplit[2].startswith('nReduces='):
            jobId = self.parseJobId(lineSplit[0])
            nmap = int(lineSplit[1].split('=')[1])
            nred = int(lineSplit[2].split('=')[1])
            # Update information
            job = self.getJob(jobId)
            job.nmap = nmap
            job.nred = nred
            job.state = HadoopState.SUBMIT
        elif len(lineSplit) == 2 and lineSplit[0] == 'Initializing':
            jobId = self.parseJobId(lineSplit[1])
            # Update information
            job = self.getJob(jobId)
            job.state = HadoopState.INIT
        elif len(lineSplit) == 9 and lineSplit[0] == 'jobToken':
            pass
        elif len(lineSplit) == 12 and lineSplit[0] == 'Input' and lineSplit[1] == 'size':
            jobId = self.parseJobId(lineSplit[4])
            inputSize = int(lineSplit[6].replace('.', ''))
            nsplits = int(lineSplit[11])
            # Update information
            job = self.getJob(jobId)
            job.inputSize = inputSize
            job.nsplits = nsplits
            job.state = HadoopState.INIT
        elif len(lineSplit) == 5 and lineSplit[1] == 'has' and lineSplit[2] == 'split' and lineSplit[3] == 'on':
            jobId, taskId = self.parseTaskId(lineSplit[0].replace('tip:', ''))
            nodeId = self.parseNodeId(lineSplit[4])
            # Update information
            task = self.getTask(taskId)
            task.state = HadoopState.INIT
            task.addSplit(nodeId)
            node = self.getNode(nodeId)
        elif len(lineSplit) == 2 and lineSplit[1].startswith('LOCALITY_WAIT_FACTOR'):
            jobId = self.parseJobId(lineSplit[0])
            # Update information
            job = self.getJob(jobId)
        elif len(lineSplit) == 12 and lineSplit[0] == 'Job' and lineSplit[2] == 'initialized' and lineSplit[3] == 'successfully':
            jobId = self.parseJobId(lineSplit[1])
            nmap = int(lineSplit[5])
            nred = int(lineSplit[9])
            # Update information
            job = self.getJob(jobId)
            job.nmap = nmap
            job.nred = nred
            job.state = HadoopState.PREP
        elif len(lineSplit) == 4 and lineSplit[0] == 'Choosing' and (lineSplit[1] == 'rack-local' or lineSplit[1] == 'data-local') and lineSplit[2] == 'task':
            jobId, taskId = self.parseTaskId(lineSplit[3])
            locality = lineSplit[1]
            # Update information
            task = self.getTask(taskId)
            if lineSplit[1] == 'data-local':
                task.dataLocal = True
        elif len(lineSplit) >= 5 and lineSplit[0] == 'Choosing' and lineSplit[1] == 'a' and lineSplit[2] == 'non-local' and lineSplit[3] == 'task':
            jobId, taskId = self.parseTaskId(lineSplit[4])
            # Update information
            task = self.getTask(taskId)
            if len(lineSplit) >= 7 and lineSplit[6] == 'speculation':
                task.speculation += 1
        elif len(lineSplit) == 6 and lineSplit[2] == 'has' and lineSplit[3] == 'completed' and lineSplit[5] == 'successfully.':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[1])
            # Update information
            attempt = self.getAttempt(attemptId)
            attempt.state = HadoopState.SUCCEEDED
            node = self.getNode(attempt.nodeId)
            if node is not None and attemptId in node.attempts:
                node.status = 'UP'
                node.attempts.remove(attemptId)
        elif len(lineSplit) == 5 and lineSplit[2] == 'has' and lineSplit[3] == 'completed' and lineSplit[4] == 'successfully.':
            jobId = self.parseJobId(lineSplit[1])
            # Update information
            job = self.getJob(jobId)
            job.state = HadoopState.SUCCEEDED
        elif len(lineSplit) >= 1 and lineSplit[0].find('jobId') >= 0 and lineSplit[0].find('submitTime') >= 0 and lineSplit[0].find('launchTime') >= 0 and lineSplit[0].find('firstMapTaskLaunchTime') >= 0:
            jobId = None
            for keyvalue in lineSplit[0].split(','):
                key, value = keyvalue.split('=')
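                # "jobId" comes first in this summary line, so "job" is
                # looked up at the top of later iterations before any of the
                # other keys below dereference it.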
                if jobId is not None:
                    job = self.getJob(jobId)
                if key == 'jobId':
                    jobId = self.parseJobId(value)
                elif key == 'status':
                    if value == 'SUCCEEDED':
                        job.state = HadoopState.SUCCEEDED
                    elif value == 'FAILED':
                        job.state = HadoopState.FAILED
                    elif value == 'KILLED':
                        job.state = HadoopState.KILLED
                    else:
                        job.state = HadoopState.UNKNOWN_FINISHED
                elif key == 'numMaps':
                    job.nmap = int(value)
                elif key == 'numReduces':
                    job.nred = int(value)
                elif key == 'submitTime':
                    job.attrs['submitTime'] = int(value) / 1000.0
                elif key == 'launchTime':
                    job.attrs['launchTime'] = int(value) / 1000.0
                elif key == 'firstMapTaskLaunchTime':
                    job.attrs['firstMapTaskLaunchTime'] = int(value) / 1000.0
                elif key == 'firstReduceTaskLaunchTime':
                    job.attrs['firstReduceTaskLaunchTime'] = int(value) / 1000.0
                elif key == 'firstJobSetupTaskLaunchTime':
                    job.attrs['firstJobSetupTaskLaunchTime'] = int(value) / 1000.0
                elif key == 'firstJobCleanupTaskLaunchTime':
                    job.attrs['firstJobCleanupTaskLaunchTime'] = int(value) / 1000.0
                elif key == 'finishTime':
                    job.attrs['finishTime'] = int(value) / 1000.0
                elif key == 'user':
                    job.attrs['user'] = value
                elif key == 'queue':
                    job.attrs['queue'] = value
                elif key == 'numSlotsPerMap':
                    job.attrs['slotpermap'] = int(value)
                elif key == 'numSlotsPerReduce':
                    job.attrs['slotperred'] = int(value)
                elif key == 'mapSlotSeconds':
                    job.attrs['mapseconds'] = int(value)
                elif key == 'reduceSlotsSeconds':
                    job.attrs['redseconds'] = int(value)
                elif key == 'clusterMapCapacity':
                    job.attrs['mapcap'] = int(value)
                elif key == 'clusterReduceCapacity':
                    job.attrs['redcap'] = int(value)
                #else:
                    #print '\t', key, '->', value
        elif len(lineSplit) == 5 and lineSplit[0] == 'Choosing' and lineSplit[1] == 'a' and lineSplit[2] == 'failed':
            jobId, taskId = self.parseTaskId(lineSplit[4])
        elif lineSplit[0] == 'Failed' and lineSplit[1] == 'fetch' and lineSplit[2] == 'notification':
            # TODO
            pass
        elif len(lineSplit) == 3 and lineSplit[0] == 'Aborting' and lineSplit[1] == 'job':
            jobId = self.parseJobId(lineSplit[2])
            # Update information
            job = self.getJob(jobId)
            job.state = HadoopState.FAILED
        elif len(lineSplit) == 3 and lineSplit[0] == 'Killing' and lineSplit[1] == 'job':
            jobId = self.parseJobId(lineSplit[2])
            # Update information
            job = self.getJob(jobId)
            job.state = HadoopState.KILLED
        else:
            #print 'JobInProgress', date, ' '.join(lineSplit)
            pass

    def parseLineJobTracker(self, date, lineSplit):
        # Job job_201204290141_0118 added successfully for user 'goiri' to queue 'default'
        # Initializing job_201204290141_0118
        # Adding task (JOB_SETUP) 'attempt_201204290141_0118_m_000003_0' to tip task_201204290141_0118_m_000003, for tracker 'tracker_sol047:localhost/127.0.0.1:55786'
        # Adding task (MAP) 'attempt_201204290141_0118_m_000000_0' to tip task_201204290141_0118_m_000000, for tracker 'tracker_sol047:localhost/127.0.0.1:55786'
        # Adding task (MAP) 'attempt_201204290141_0118_m_000001_0' to tip task_201204290141_0118_m_000001, for tracker 'tracker_sol003:localhost/127.0.0.1:53891'
        # Adding task (REDUCE) 'attempt_201204290141_0118_r_000000_0' to tip task_201204290141_0118_r_000000, for tracker 'tracker_sol003:localhost/127.0.0.1:53891'
        # Adding task (JOB_CLEANUP) 'attempt_201204290141_0118_m_000002_0' to tip task_201204290141_0118_m_000002, for tracker 'tracker_sol003:localhost/127.0.0.1:53891'
        # Removing task 'attempt_201204290141_0118_m_000001_0'
        # Removing task 'attempt_201204290141_0118_m_000002_0'
        # Removing task 'attempt_201204290141_0118_r_000000_0'
        # Removing task 'attempt_201204290141_0118_m_000000_0'
        # Removing task 'attempt_201204290141_0118_m_000003_0'
        # User limit exceeded. Marking job: job_201204290141_0118 for retire.
        # Retired job with id: 'job_201204290141_0118' of user 'goiri'
        # Retired job removed from cache job_201204290141_0118
        if lineSplit[0] == 'STARTUP_MSG:' or lineSplit[0] == 'SHUTDOWN_MSG:':
            self.reset()
        elif len(lineSplit) == 10 and lineSplit[0] == 'Job' and lineSplit[2] == 'added' and lineSplit[3] == 'successfully':
            jobId = self.parseJobId(lineSplit[1])
            user = lineSplit[6].replace("'", '')
            queue = lineSplit[9].replace("'", '')
            # Update information
            job = self.getJob(jobId)
            job.attrs['user'] = user
            job.attrs['queue'] = queue
            job.state = HadoopState.SUBMIT
        elif len(lineSplit) == 2 and lineSplit[0] == 'Initializing':
            jobId = self.parseJobId(lineSplit[1])
            # Update information
            job = self.getJob(jobId)
            job.state = HadoopState.INIT
        elif len(lineSplit) == 10 and lineSplit[0] == 'Adding' and lineSplit[1] == 'task':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[3])
            nodeId = self.parseNodeId(lineSplit[9])
            taskType = self.parseTaskType(lineSplit[2])
            # Update information
            job = self.getJob(jobId)
            job.state = HadoopState.RUNNING
            task = self.getTask(taskId)
            task.setType(taskType)
            attempt = self.getAttempt(attemptId)
            attempt.nodeId = nodeId
            attempt.state = HadoopState.RUNNING
            node = self.getNode(nodeId)
            if node is not None and attemptId not in node.attempts:
                node.status = 'UP'
                node.attempts.append(attemptId)
        elif len(lineSplit) == 3 and lineSplit[0] == 'Removing' and lineSplit[1] == 'task':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
            # Update information
            attempt = self.getAttempt(attemptId)
            node = self.getNode(attempt.nodeId)
            if node is not None and attemptId in node.attempts:
                node.attempts.remove(attemptId)
            if HadoopState.isRunning(attempt.state):
                attempt.state = HadoopState.UNKNOWN_FINISHED
        elif len(lineSplit) == 8 and lineSplit[0] == 'User' and lineSplit[1] == 'limit':
            jobId = self.parseJobId(lineSplit[5])
            # Update information
            job = self.getJob(jobId)
        elif len(lineSplit) == 8 and lineSplit[0] == 'Retired' and lineSplit[1] == 'job' and lineSplit[2] == 'with' and lineSplit[3] == 'id:':
            jobId = self.parseJobId(lineSplit[4])
            # Update information
            job = self.getJob(jobId)
            if job.state != HadoopState.SUCCEEDED and job.state != HadoopState.FAILED:
                job.state = HadoopState.UNKNOWN_FINISHED
        elif len(lineSplit) == 6 and lineSplit[0] == 'Retired' and lineSplit[1] == 'job' and lineSplit[2] == 'removed' and lineSplit[3] == 'from' and lineSplit[4] == 'cache':
            jobId = self.parseJobId(lineSplit[5])
            # Update information
            job = self.getJob(jobId)
            if job.state != HadoopState.SUCCEEDED and job.state != HadoopState.FAILED:
                job.state = HadoopState.UNKNOWN_FINISHED
        elif len(lineSplit) == 5 and lineSplit[1] == 'is' and lineSplit[3] == 'ms' and lineSplit[4] == 'debug.':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[0])
            # Update information
            attempt = self.getAttempt(attemptId)
        elif len(lineSplit) == 5 and lineSplit[0] == 'Launching' and lineSplit[1] == 'task' and lineSplit[3] == 'timed':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
            # Update information
            attempt = self.getAttempt(attemptId)
            attempt.state = HadoopState.FAILED
        elif len(lineSplit) == 6 and lineSplit[0] == 'Adding' and lineSplit[1] == 'tracker':
            nodeId = self.parseNodeId(lineSplit[2])
            #nodeId = self.parseNodeId(lineSplit[5])
            # Update information
            node = self.getNode(nodeId)
            node.status = 'UP'
        elif len(lineSplit) == 3 and lineSplit[0] == 'Lost' and lineSplit[1] == 'tracker':
            nodeId = self.parseNodeId(lineSplit[2])
            # Update information
            node = self.getNode(nodeId)
            node.status = 'DOWN'
        elif len(lineSplit) == 6 and lineSplit[0] == 'Status' and lineSplit[1] == 'from' and lineSplit[2] == 'unknown' and lineSplit[3] == 'Tracker':
            nodeId = self.parseNodeId(lineSplit[5])
            # Update information
            node = self.getNode(nodeId)
        elif len(lineSplit) == 3 and lineSplit[0] == 'Refreshing' and lineSplit[1] == 'host' and lineSplit[2] == 'information':
            pass
        elif len(lineSplit) == 3 and lineSplit[0] == 'Decommisioning' and lineSplit[2] == 'nodes':
            numnodes = int(lineSplit[1])
        elif len(lineSplit) == 2 and lineSplit[0] == 'Starting' and lineSplit[1] == 'RUNNING':
            pass
        elif len(lineSplit) == 10 and lineSplit[0] == 'Serious' and lineSplit[1] == 'problem.':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[9])
            # Update information
            attempt = self.getAttempt(attemptId)
            attempt.state = HadoopState.FAILED
        elif lineSplit[0] == 'Cleaning' or (lineSplit[0] == 'problem' and lineSplit[1] == 'cleaning'):
            pass
        elif len(lineSplit) == 4 and lineSplit[0] == 'JobTracker' and lineSplit[1] == 'up':
            pass
        else:
            #print 'JobTracker', date, ' '.join(lineSplit)
            pass

    def parseLineTaskInProgress(self, date, lineSplit):
        # Error from attempt_201205092227_0248_m_000001_0: Lost task tracker: tracker_sol026:localhost/127.0.0.1:43321
        if len(lineSplit) == 7 and lineSplit[0] == 'Error' and lineSplit[1] == 'from' and lineSplit[3] == 'Lost':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
            nodeId = self.parseNodeId(lineSplit[6])
            # Update information
            attempt = self.getAttempt(attemptId)
            attempt.state = HadoopState.FAILED
            node = self.getNode(nodeId)
            node.status = 'DOWN'
        elif len(lineSplit) == 6 and lineSplit[0] == 'Error' and lineSplit[1] == 'from' and lineSplit[3] == 'Error':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
            # Update information
            attempt = self.getAttempt(attemptId)
            attempt.state = HadoopState.FAILED
        elif len(lineSplit) >= 3 and lineSplit[0] == 'Error' and lineSplit[1] == 'from':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[2])
            # Update information
            attempt = self.getAttempt(attemptId)
            attempt.state = HadoopState.FAILED
        elif len(lineSplit) == 6 and lineSplit[0] == 'TaskInProgress' and lineSplit[2] == 'has' and lineSplit[3] == 'failed':
            jobId, taskId = self.parseTaskId(lineSplit[1])
            # Update information
            task = self.getTask(taskId)
            task.state = HadoopState.FAILED
        else:
            print 'TaskInProgress', date, ' '.join(lineSplit)

    def parseLineTaskStatus(self, date, lineSplit):
        if len(lineSplit) == 13 and lineSplit[0] == 'Trying' and lineSplit[1] == 'to' and lineSplit[2] == 'set' and lineSplit[3] == 'illegal':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[8])
            # Update information
            attempt = self.getAttempt(attemptId)
        elif len(lineSplit) == 18 and lineSplit[0] == 'Trying' and lineSplit[1] == 'to' and lineSplit[2] == 'set' and lineSplit[3] == 'finish':
            jobId, taskId, attemptId = self.parseAttemptId(lineSplit[7])
            # Update information
            attempt = self.getAttempt(attemptId)
        else:
            print 'TaskStatus', date, ' '.join(lineSplit)

    def parseLineNetworkTopology(self, date, lineSplit):
        if len(lineSplit) == 5 and lineSplit[0] == 'Adding' and lineSplit[1] == 'a' and lineSplit[2] == 'new':
            nodeId = self.parseNodeId(lineSplit[4])
        else:
            print 'NetworkTopology', date, ' '.join(lineSplit)

    def parseLineJobHistory(self, date, lineSplit):
        # Moving file:/auto/home/goiri/hadoop-1.0.2/logs/history/job_201204290141_0118_1335682931160_goiri_workGen to file:/auto/home/goiri/hadoop-1.0.2/logs/history/done/version-1/sol000_1335678069919_/2012/04/29/000000
        # Moving file:/auto/home/goiri/hadoop-1.0.2/logs/history/job_201204290141_0118_conf.xml to file:/auto/home/goiri/hadoop-1.0.2/logs/history/done/version-1/sol000_1335678069919_/2012/04/29/000000
        # Deleting localized job conf at /auto/home/goiri/hadoop-1.0.2/libexec/../logs/job_201204290141_0118_conf.xml
        #print '*** JobHistory', date, lineSplit
        pass

    def parseLineDisableNode(self, date, lineSplit):
        #print date, "Disable node:", lineSplit
        pass

    def parseLineAuditLogger(self, date, lineSplit):
        # USER=goiri IP=172.16.28.128 OPERATION=SUBMIT_JOB TARGET=job_201204290141_0118 RESULT=SUCCESS
        #print '*** AuditLogger', date, lineSplit
        pass

    # Id parsing functions
    def parseJobId(self, intext):
        intext = intext.replace("'", '')
        intext = intext.replace(':', '')
        jobId = intext
        return jobId

    def parseTaskId(self, intext):
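        # e.g. "task_201204290141_0118_m_000000" ->
        # ("job_201204290141_0118", "task_201204290141_0118_m_000000")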
        intext = intext.replace("'", "")
        intext = intext.replace(":", "")
        split = intext.split('_')
        jobId = "_".join(['job'] + split[1:3])
        taskId = "_".join(['task'] + split[1:5])
        return (jobId, taskId)

    def parseAttemptId(self, intext):
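        # e.g. "attempt_201204290141_0118_m_000000_0" ->
        # ("job_201204290141_0118", "task_201204290141_0118_m_000000",
        #  "attempt_201204290141_0118_m_000000_0")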
        intext = intext.replace("'", "")
        intext = intext.replace(":", "")
        split = intext.split('_')
        jobId = "_".join(['job'] + split[1:3])
        taskId = "_".join(['task'] + split[1:5])
        attemptId = "_".join(['attempt'] + split[1:6])
        return (jobId, taskId, attemptId)

    def parseNodeId(self, intext):
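        # e.g. "node:/default-rack/sol054" -> "sol054" and
        # "tracker_sol047:localhost/127.0.0.1:55786" -> "sol047"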
        nodeId = intext.replace("'", '')
        if nodeId.startswith('node:/default-rack/'):
            nodeId = nodeId.replace('node:/default-rack/', '')
        if nodeId.startswith('node:/rack-default/'):
            nodeId = nodeId.replace('node:/rack-default/', '')
        if nodeId.startswith('node:/rack-covering/'):
            nodeId = nodeId.replace('node:/rack-covering/', '')
        if nodeId.startswith('node:/covering-rack/'):
            nodeId = nodeId.replace('node:/covering-rack/', '')
        if nodeId.startswith('tracker_'):
            nodeId = nodeId.replace('tracker_', '')
        if nodeId.find(':') >= 0:
            nodeId = nodeId.split(':')[0]
        return nodeId

    def parseTaskType(self, intext):
        taskType = intext.replace('(', '').replace(')', '')
        return taskType

    def checkStatus(self):
        # Check from job perspective
        '''
        for job in self.jobs:
            for task in job.tasks:
                for attempt in task.attempts:
                    if not HadoopState.isRunning(job.state):
                        # Check if the node is running the task
                        nodeId = attempt.nodeId
                        node = self.getNode(nodeId)
                        if attempt.attemptId in node.attempts:
                            node.attempts.remove(attempt.attemptId)
        '''
        # Check from node perspective
        for nodeId in self.nodes:
            node = self.nodes[nodeId]
            # Remove attempts from finished nodes
            for attemptId in list(node.attempts):
                jobId, taskId, attemptId = self.parseAttemptId(attemptId)
                job = self.getJob(jobId)
                # If the job is already done, nothing is running anymore
                if not job.isRunning():
                    if attemptId not in self.attemptsToClean:
                        self.attemptsToClean[attemptId] = 0
                    self.attemptsToClean[attemptId] += 1
                    if self.attemptsToClean[attemptId] >= 5:
                        del self.attemptsToClean[attemptId]
                        node.attempts.remove(attemptId)
            # Remove attempts from down nodes
            if node.status == 'DOWN':
                node.attempts = []

    def printOutput(self, jobs=True, nodes=True):
        if jobs:
            runJobs = 0
            print 'Jobs (%d)' % (len(self.jobs))
            print '================='
            for jobId in sorted(self.jobs.keys()):
                job = self.jobs[jobId]
                if job.nmap is None or job.nred is None:
                    print "%s %s" % (jobId, HadoopState.toString(job.state))
                elif 'submitTime' not in job.attrs or 'launchTime' not in job.attrs or 'finishTime' not in job.attrs or 'firstMapTaskLaunchTime' not in job.attrs:
                    print "%s %s map=%d red=%d" % (jobId, HadoopState.toString(job.state).ljust(10), job.nmap, job.nred)
                else:
                    #print "%s %s %.1fs %.1fs" % (jobId, HadoopState.toString(job.state), job.launchTime-job.submitTime, job.finishTime-job.launchTime)
                    print "%s %s map=%d red=%d %.1fs %.1fs" % (jobId, HadoopState.toString(job.state).ljust(10), job.nmap, job.nred, job.attrs['firstMapTaskLaunchTime'] - job.attrs['submitTime'], job.attrs['finishTime'] - job.attrs['launchTime'])
                if job.isRunning():
                    runJobs += 1
                for task in sorted(sorted(job.tasks.values(), key=attrgetter('taskId')), key=attrgetter('type')):
                    taskId = task.taskId
                    task = job.tasks[taskId]
                    if len(task.attempts) == 0:
                        print '\t', taskId, task.getType().ljust(14), ','.join(task.splits).ljust(30), HadoopState.toString(task.state)
                    else:
                        for attemptId in sorted(task.attempts.keys()):
                            attempt = task.attempts[attemptId]
                            local = ' '
                            if attempt.nodeId is not None and task.splits is not None and attempt.nodeId in task.splits:
                                local = '*'
                            print '\t', taskId, task.getType().ljust(14), ','.join(task.splits).ljust(30), attemptId, attempt.nodeId, local, HadoopState.toString(attempt.state)
            print "%d/%d" % (runJobs, len(self.jobs))
        if nodes:
            # Count number of attempts
            numAttempts = {}
            for job in self.jobs.values():
                if HadoopState.isRunning(job.state):
                    # Add those servers running tasks
                    for task in job.tasks.values():
                        for attempt in task.attempts.values():
                            # Check in which node it was running
                            if attempt.nodeId is not None:
                                if attempt.nodeId not in numAttempts:
                                    numAttempts[attempt.nodeId] = 1
                                else:
                                    numAttempts[attempt.nodeId] += 1
            print 'Nodes (%d)' % (len(self.nodes))
            print '================='
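            # The strip printed at the end has one colored cell per node:
            # green background = UP, red background = DOWN, blank = unknown
            # (using the bcolors codes from ghadoopcommons).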
            out = ''
            for node in sorted(sorted(self.nodes.values(), key=attrgetter('nodeId')), key=attrgetter('status'), reverse=True):
                if node.nodeId not in numAttempts:
                    numAttempts[node.nodeId] = 0
                #node = self.getNode(nodeId)
                print node.nodeId, node.status.ljust(4), ("%d/%d" % (len(node.attempts), numAttempts[node.nodeId])).ljust(5), ' '.join(node.attempts)
                if node.status == 'UP':
                    out += bcolors.GREENBG + ' ' + bcolors.ENDC
                elif node.status == 'DOWN':
                    out += bcolors.REDBG + ' ' + bcolors.ENDC
                else:
                    out += ' '
            print out

    def printNode(self, nodeId):
        print nodeId
        for job in self.jobs.values():
            if HadoopState.isRunning(job.state):
                # Add those servers running tasks
                for task in job.tasks.values():
                    for attempt in task.attempts.values():
                        # Check in which node it was running
                        if attempt.nodeId is not None and attempt.nodeId == nodeId:
                            local = ' '
                            if task.splits is not None and attempt.nodeId in task.splits:
                                local = '*'
                            print '\t', task.taskId, task.getType().ljust(14), ','.join(task.splits).ljust(30), attempt.attemptId, local, HadoopState.toString(attempt.state)

    def printNodeManager(self):
        # Node management
        # ===============
        # Get nodes information
        offNodes = []
        onNodes = []
        for node in self.nodes.values():
            if node.status == 'DOWN':
                offNodes.append(node.nodeId)
            elif node.status == 'UP':
                onNodes.append(node.nodeId)
        offNodes = sorted(offNodes)
        onNodes = sorted(onNodes)

        # Active nodes: nodeId -> tasks running
        activeNodes = {}
        activeTasks = 0
        # Get nodes running tasks
        for node in self.nodes.values():
            if node.status == 'UP' and len(node.attempts) > 0:
                if node.nodeId not in activeNodes:
                    #activeNodes[node.nodeId] = len(node.attempts)
                    activeNodes[node.nodeId] = 0

        # Get nodes that were running an active job
        numRunMap = 0
        numRunRed = 0
        for job in self.jobs.values():
            if HadoopState.isRunning(job.state):
                # Check if not all the tasks have been initialized
                jobRunMap = 0
                jobRunRed = 0
                jobQueueMap = job.nmap
                jobQueueRed = job.nred
                if jobQueueMap is None:
                    jobQueueMap = 0
                if jobQueueRed is None:
                    jobQueueRed = 0
                # Add those servers running tasks
                for task in job.tasks.values():
                    # Only if the task is useful
                    if task.getType() != 'JOB_SETUP':
                        for attempt in task.attempts.values():
                            # Check in which node it was running
                            if attempt.nodeId is not None and attempt.state != HadoopState.FAILED:
                                if attempt.nodeId not in activeNodes:
                                    activeNodes[attempt.nodeId] = 1
                                else:
                                    activeNodes[attempt.nodeId] += 1
                                activeTasks += 1
                                # Check if the task is running
                                if HadoopState.isRunning(attempt.state):
                                    if attempt.attemptId.find('_m_') >= 0:
                                        jobRunMap += 1
                                    elif attempt.attemptId.find('_r_') >= 0:
                                        jobRunRed += 1
                                else:
                                    if attempt.attemptId.find('_m_') >= 0:
                                        jobQueueMap -= 1
                                    elif attempt.attemptId.find('_r_') >= 0:
                                        jobQueueRed -= 1
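                # Estimate this job's demand as the larger of the attempts
                # already running and the tasks still queued.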
                # Task maximum
                if jobRunMap < jobQueueMap:
                    jobRunMap = jobQueueMap
                if jobRunRed < jobQueueRed:
                    jobRunRed = jobQueueRed
                numRunMap += jobRunMap
                numRunRed += jobRunRed

        # Get inactive nodes
        inactiveNodes = list(onNodes)
        for nodeId in activeNodes:
            if nodeId in inactiveNodes:
                inactiveNodes.remove(nodeId)

        nodesToOff = []
        for nodeId in sorted(onNodes, reverse=True):
            # Account the inactivity
            inactivity = 0
            #if nodeId in inactivityAccount:
                #inactivity = inactivityAccount[nodeId]
            # Account the number of tasks running
            if nodeId in activeNodes:
                inactivity -= activeNodes[nodeId]
            # Save value to sort
            nodesToOff.append((nodeId, inactivity))
        if len(nodesToOff) > 0:
            nodesToOff = sorted(nodesToOff, key=itemgetter(1), reverse=True)
            nodesToOff = [nodeId for nodeId, inactivity in nodesToOff]

        if len(onNodes) > 0:
            print 'On nodes:', ",".join(onNodes)
        if len(offNodes) > 0:
            print 'Off nodes:', ",".join(offNodes)
        if len(activeNodes) > 0:
            print 'Active nodes:'
            for nodeId, v in sorted(activeNodes.iteritems(), key=itemgetter(1)):
                print '\t', nodeId, v
        if len(inactiveNodes) > 0:
            print 'Inactive nodes:', ",".join(inactiveNodes)
        if len(nodesToOff) > 0:
            print 'Nodes to off:', ",".join(nodesToOff)

    def getMapReduceSummary(self):
        out = ''

        # Jobs
        jobsRunning = 0
        jobsInit = 0
        jobsPrep = 0
        jobsSucceeded = 0
        jobsFailed = 0
        for job in self.jobs.values():
            if job.state == HadoopState.INIT:
                jobsInit += 1
            elif job.state == HadoopState.PREP:
                running = False
                for task in job.tasks.values():
                    for attempt in task.attempts.values():
                        if attempt.state == HadoopState.RUNNING:
                            running = True
                if running:
                    jobsRunning += 1
                else:
                    jobsPrep += 1
            elif job.state == HadoopState.RUNNING:
                jobsRunning += 1
            elif job.state == HadoopState.SUCCEEDED:
                jobsSucceeded += 1
            elif job.state == HadoopState.FAILED:
                jobsFailed += 1
        out += 'jobs '
        out += str(len(self.jobs)) + ' '
        out += str(jobsInit) + ' '
        out += str(jobsPrep) + ' '
        out += str(jobsRunning) + ' '
        out += str(jobsSucceeded) + ' '
        out += str(jobsFailed) + ' '

        # Nodes
        nodesTotal = 0
        nodesRunning = 0
        nodesAttempts = 0
        for node in self.nodes.values():
            nodesTotal += 1
            if len(node.attempts) > 0:
                nodesRunning += 1
            nodesAttempts += len(node.attempts)
        out += 'nodes '
        out += str(nodesTotal) + ' '
        out += str(nodesRunning) + ' '
        out += str(nodesAttempts) + ' '

        return out


if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-p", "--parse", dest="parseFile", help="specify file to parse", type="string", default=None)
    #parser.add_option("-p", "--parse", action="store_true", dest="parse", help="specify if the scheduler supports brown pricing")
    (options, args) = parser.parse_args()

    if options.parseFile is not None:
        # Parse a given file, e.g.
        # HADOOP_HOME+"/logs/hadoop-"+USER+"-jobtracker-"+MASTER_NODE+".log.2012-04-29"
        monitor = MonitorMapred(parseOld=True, logfile=options.parseFile)
    else:
        monitor = MonitorMapred()
    monitor.start()

    signal.signal(signal.SIGINT, signal_handler)

    # Console input
    while True:
        cmd = raw_input("$ ")
        if cmd.lower() == "debug":
            if monitor.debug == 0:
                monitor.debug = 1
            else:
                monitor.debug = 0
        elif cmd.lower() == "jobs":
            monitor.printOutput(jobs=True, nodes=False)
        elif cmd.lower() == "nodes":
            monitor.printOutput(jobs=False, nodes=True)
        elif cmd.lower().startswith('node'):
            cmdSplit = cmd.split(' ')
            if len(cmdSplit) >= 2:
                monitor.printNode(cmdSplit[1])
        elif cmd.lower().startswith('checkoff'):
            monitor.printNodeManager()
        elif cmd.lower() == "show":
            monitor.printOutput()
        elif cmd.lower() == "summary":
            print monitor.getMapReduceSummary()
        elif cmd.lower() == "exit":
            monitor.kill()
            break
        elif cmd.lower() == "help" or cmd.lower() == "h" or cmd.lower() == "?":
            print "debug"
            print "jobs"
            print "nodes"
            print "node <nodeId>"
            print "checkoff"
            print "show"
            print "summary"
            print "exit"

    #while True:
        #time.sleep(10.0)

    monitor.join()