
/ghadoopmonitor.py

https://gitlab.com/abushoeb/ghadoop-v1
#!/usr/bin/env python2.5
"""
GreenHadoop makes Hadoop aware of solar energy availability.
http://www.research.rutgers.edu/~goiri/
Copyright (C) 2012 Inigo Goiri, Rutgers University

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
"""
import math
import time
import threading
import os
import sys
import signal
from datetime import datetime, timedelta
from subprocess import call, PIPE, Popen

from ghadoopcommons import *
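
# Shared state and helpers come from ghadoopcommons via the wildcard import.
# Judging from their use below, this module assumes (not verified here):
#   jobs, tasks         - dicts mapping job/task ids to Job/Task objects
#   nodeTasks, nodeJobs - dicts mapping node ids to lists of task/job ids
#   getNodes(), getNodesHdfsReady(), getJobsHadoop() - cluster state accessors
#   Job, Task, bcolors, signal_handler - helper classes and functions
#   HADOOP_HOME, USER, MASTER_NODE, DEBUG - configuration constants
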
class MonitorMapred(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.running = True
        # JobTracker log to tail
        self.logfileMapr = HADOOP_HOME+"/logs/hadoop-"+USER+"-jobtracker-"+MASTER_NODE+".log"
        #self.logfileMapr = "/scratch/muhammed/hadoop_log/hadoop-muhammed-jobtracker-crypt10.log"
        # http://forums.devshed.com/python-programming-11/how-to-monitor-a-file-for-changes-85767.html
        self.fileMapr = open(self.logfileMapr, 'r')
        self.watcherMapr = os.stat(self.logfileMapr)
        self.this_modifiedMapr = self.last_modifiedMapr = self.watcherMapr.st_mtime
        # Initialize per-node structures
        nodes = getNodes()
        for nodeId in nodes:
            nodeTasks[nodeId] = []
            nodeJobs[nodeId] = []
            if nodes[nodeId][1] == "UP" or nodes[nodeId][1] == "DEC":
                if nodeId not in getNodesHdfsReady():
                    getNodesHdfsReady().append(nodeId)
        # Read previous state of the system
        self.startTime = None
        # TODO read previous history
        if False:
            while True:
                line = self.fileMapr.readline()
                if not line:
                    break
                #print line
                change = self.parseLine(line)
            #print "Tasks "+str(len(tasks))
            #print "Jobs "+str(len(jobs))
        print "Ready!"
        # Go to the end of the file
        self.fileMapr.seek(0, 2)
        # Start helper threads
        self.checkstatus = MonitorMapredCheckStatus(self)
        self.checkstatus.start()
        self.nodestatus = MonitorNodeCheckStatus()
        self.nodestatus.start()
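
    # __init__ leaves self.fileMapr at EOF; run() then tails the log by
    # polling st_mtime. A minimal standalone sketch of the same pattern
    # (hypothetical file name and handler, not GreenHadoop code):
    #
    #   f = open("jobtracker.log", "r")
    #   f.seek(0, 2)                               # jump to EOF
    #   last = os.stat("jobtracker.log").st_mtime
    #   while True:
    #       now = os.stat("jobtracker.log").st_mtime
    #       if now > last:
    #           last = now
    #           for l in iter(f.readline, ""):     # consume newly appended lines
    #               handle(l)                      # hypothetical per-line handler
    #       time.sleep(1.0)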

    def kill(self):
        self.running = False
        self.checkstatus.kill()
        self.nodestatus.kill()

    def run(self):
        # Monitor loop: poll the JobTracker log for modifications
        lastUpdate = 0
        change = True
        while self.running:
            # Update from log: JobTracker
            if self.this_modifiedMapr > self.last_modifiedMapr:
                self.last_modifiedMapr = self.this_modifiedMapr
                # File was modified, so parse the newly appended lines
                while True:
                    line = self.fileMapr.readline()
                    if not line:
                        break
                    auxChange = self.parseLine(line)
                    if auxChange:
                        change = True
            self.watcherMapr = os.stat(self.logfileMapr)
            self.this_modifiedMapr = self.watcherMapr.st_mtime
            # Periodically force a refresh of the status output
            lastUpdate -= 1
            if lastUpdate < 0:
                lastUpdate = 5
                change = True
            if DEBUG > 3 and change:
                self.printOutput()
                change = False
            time.sleep(1.0)
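
    # parseLine() keeps the in-memory job/task model in sync with the
    # JobTracker log. It recognizes these message families:
    #   JobTracker:    "... added successfully", "Adding task ...",
    #                  "Removing task ...", "Retired job with id ..."
    #   JobInProgress: "Task ... has completed ...", "... has split on ...",
    #                  "Job ... initialized successfully with ..."
    # It returns True when the parsed line changed the model.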
    def parseLine(self, line):
        change = False
        try:
            date = datetime.strptime(line.split(",")[0], "%Y-%m-%d %H:%M:%S")
            if self.startTime == None:
                self.startTime = date
            line = line.replace("\n", "")
            lineSplit = line.split(" ")
            if lineSplit[3].startswith("org.apache.hadoop.mapred.JobTracker") and len(lineSplit) > 5:
                if line.find("added successfully") >= 0:
                    # Add job ("Job job_... added successfully ..." lines)
                    jobId = lineSplit[5]
                    jobId = jobId[jobId.find("_")+1:]
                    if jobId not in jobs:
                        jobs[jobId] = Job(jobId, "PREP", date)
                    else:
                        jobs[jobId].state = "PREP"
                        if jobs[jobId].submit == None:
                            jobs[jobId].submit = date
                    #job = addMonitorJob(id, jobId)
                    #job.submit = date
                    #job.state = "PREP"
                    ## Store in data structures
                    #runningJobs.append(id)
                    if DEBUG > 3:
                        print str(date)+" Job "+jobId+" started."
                    change = True
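                # "Adding task" lines look roughly like (hypothetical sample):
                #   ... JobTracker: Adding task (MAP) 'attempt_..._m_000126_0' to tip
                #   task_..._m_000126, for tracker 'tracker_crypt11:localhost/127.0.0.1:44331'
                # so lineSplit[10] is the task (tip) id and lineSplit[13] the tracker/node.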
                elif line.find("Adding task") >= 0 and len(lineSplit) > 13:
                    # Add task to job
                    taskId = lineSplit[10]
                    if taskId.endswith(","):
                        taskId = taskId[0:len(taskId)-1]
                    taskIdSplit = taskId.split("_")
                    jobId = taskIdSplit[1]+"_"+taskIdSplit[2]
                    nodeId = lineSplit[13]
                    nodeId = nodeId.replace("'tracker_", "")
                    nodeId = nodeId[0:nodeId.find(":")]
                    if taskId not in tasks:
                        task = Task(taskId, jobId, "RUNNING", date)
                        task.start = date
                        if task.submit == None:
                            task.submit = date
                        tasks[taskId] = task
                    else:
                        task = tasks[taskId]
                        task.state = "RUNNING"
                        task.start = date
                        if task.submit == None:
                            task.submit = date
                    task.node = nodeId
                    # Check whether this is an actual task or just setup/cleanup
                    if line.find("JOB_SETUP") >= 0 or line.find("TASK_CLEANUP") >= 0 or line.find("JOB_CLEANUP") >= 0:
                        task.jobsetup = True
                    # Check if the job existed and update its status
                    if jobId not in jobs:
                        if not task.jobsetup:
                            job = Job(jobId, "RUNNING", date, date)
                        else:
                            job = Job(jobId, "PREP", date)
                        jobs[jobId] = job  # register the new job
                    else:
                        job = jobs[jobId]
                        if not task.jobsetup:
                            job.state = "RUNNING"
                    if job.submit == None:
                        job.submit = date
                    if not task.jobsetup and job.start == None:
                        job.start = date
                    if taskId not in job.tasks:
                        job.tasks.append(taskId)
                    # Remove the task from any other node
                    for otherNodeId in nodeTasks:
                        if taskId in nodeTasks[otherNodeId]:
                            nodeTasks[otherNodeId].remove(taskId)
                    # Assign task to node
                    if nodeId not in nodeTasks:
                        nodeTasks[nodeId] = []
                    if taskId not in nodeTasks[nodeId]:
                        nodeTasks[nodeId].append(taskId)
                    # Assign job to node
                    if nodeId not in nodeJobs:
                        nodeJobs[nodeId] = []
                    if jobId not in nodeJobs[nodeId] and not task.jobsetup:
                        nodeJobs[nodeId].append(jobId)
                    if DEBUG > 3:
                        print str(date)+" Task "+taskId+"("+jobId+") to "+nodeId+": "+str(nodeTasks[nodeId])
                    change = True
                elif line.find("Removing task") >= 0 and len(lineSplit) > 6:
                    # Remove a task attempt, e.g.:
                    #   2011-07-13 17:18:51,532 INFO org.apache.hadoop.mapred.JobTracker: Removing task 'attempt_201107131634_0017_m_000126_0'
                    # Split fields: [0] 2011-07-13 [1] 17:18:51,532 [2] INFO
                    #   [3] org.apache.hadoop.mapred.JobTracker: [4] Removing [5] task
                    #   [6] 'attempt_201107131634_0017_m_000126_0'
                    attemptId = lineSplit[6]
                    if attemptId.startswith("\'"):
                        attemptId = attemptId[1:len(attemptId)]
                    if attemptId.endswith("\'"):
                        attemptId = attemptId[0:len(attemptId)-1]
                    if attemptId.endswith(","):
                        attemptId = attemptId[0:len(attemptId)-1]
                    attemptIdSplit = attemptId.split("_")
                    taskId = "task_"+attemptIdSplit[1]+"_"+attemptIdSplit[2]+"_"+attemptIdSplit[3]+"_"+attemptIdSplit[4]
                    jobId = attemptIdSplit[1]+"_"+attemptIdSplit[2]
                    # Remove the task from whichever node held it
                    for nodeId in nodeTasks:
                        if taskId in nodeTasks[nodeId]:
                            nodeTasks[nodeId].remove(taskId)
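                # "Retired job with id" marks a finished job, roughly
                # (hypothetical sample):
                #   ... JobTracker: Retired job with id: 'job_201107131634_0017' of user 'muhammed'
                # lineSplit[8] carries the quoted job id.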
                elif line.find("Retired job with id") >= 0 and len(lineSplit) > 8:
                    # Job finished
                    jobId = lineSplit[8]
                    jobId = jobId.replace("'", "")
                    jobId = jobId[jobId.find("_")+1:]
                    if jobId not in jobs:
                        job = Job(jobId)
                        jobs[jobId] = job
                    else:
                        job = jobs[jobId]
                    job.end = date
                    if job.submit == None:
                        job.submit = date
                    if job.start == None:
                        job.start = date
                    job.state = "SUCCEEDED"
                    # Remove node -> job mappings
                    for nodeId in nodeJobs:
                        if jobId in nodeJobs[nodeId]:
                            nodeJobs[nodeId].remove(jobId)
                    if DEBUG > 3:
                        print str(date)+" Job "+jobId+" finished"
                    change = True
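            # JobInProgress messages track the per-task lifecycle. A completion
            # line looks roughly like (hypothetical sample):
            #   ... JobInProgress: Task 'attempt_..._m_000126_0' has completed task_..._m_000126 successfully.
            # lineSplit[8] is the completed task id.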
            elif lineSplit[3].startswith("org.apache.hadoop.mapred.JobInProgress"):
                if line.find("Task") >= 0 and line.find("has completed") >= 0:
                    # Task finished
                    taskId = lineSplit[8]
                    taskIdSplit = taskId.split("_")
                    jobId = taskIdSplit[1]+"_"+taskIdSplit[2]
                    if taskId in tasks:
                        task = tasks[taskId]
                        task.end = date
                        if task.submit == None:
                            task.submit = date
                        if task.start == None:
                            task.start = date
                        task.state = "SUCCEEDED"
                        # Clean the finished task from its node
                        if task.node in nodeTasks:
                            if taskId in nodeTasks[task.node]:
                                nodeTasks[task.node].remove(taskId)
                    if DEBUG > 3:
                        print str(date)+" Task "+taskId+"("+jobId+") finished"
                    change = True
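                # "has split on" announces a task and its input locations, roughly
                # (hypothetical sample):
                #   ... JobInProgress: tip:task_..._m_000003 has split on node:/default-rack/crypt12
                # lineSplit[4] carries the "tip:<taskId>" token.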
                elif line.find("has split on") >= 0:
                    # Tasks generated
                    taskId = lineSplit[4]
                    taskId = taskId.replace("tip:", "")
                    taskIdSplit = taskId.split("_")
                    jobId = taskIdSplit[1]+"_"+taskIdSplit[2]
                    if taskId not in tasks:
                        task = Task(taskId, jobId, "PREP", date)
                        tasks[taskId] = task
                    else:
                        task = tasks[taskId]
                        task.state = "PREP"
                    if task.submit == None:
                        task.submit = date
                    # Check if the job existed and update its status
                    if jobId not in jobs:
                        job = Job(jobId, "PREP", date, date)
                        jobs[jobId] = job  # register the new job
                    else:
                        job = jobs[jobId]
                        if job.state == "UNKNOWN":
                            job.state = "PREP"
                    if job.submit == None:
                        job.submit = date
                    if taskId not in job.tasks:
                        job.tasks.append(taskId)
                    #task = addMonitorTask(id, taskId)
                    #if task.submit == None:
                    #    task.submit = date
                    #task.state = "PREP"
                    if DEBUG > 3:
                        print str(date)+" Task "+taskId+"("+jobId+") created"
                    change = True
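                # "initialized successfully with" reports the task counts, roughly
                # (hypothetical sample):
                #   ... JobInProgress: Job job_..._0017 initialized successfully with
                #   126 map tasks and 1 reduce tasks.
                # lineSplit[5] is the job id, lineSplit[9] and lineSplit[13] the counts.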
                elif line.find("initialized successfully with") >= 0:
                    # Job initialized: pre-generate its map and reduce tasks
                    jobId = lineSplit[5]
                    jobId = jobId.replace("job_", "")
                    nmap = int(lineSplit[9])
                    nred = int(lineSplit[13])
                    # Check if the job existed and update its status
                    if jobId not in jobs:
                        job = Job(jobId, "PREP", date, date)
                        jobs[jobId] = job  # register the new job
                    else:
                        job = jobs[jobId]
                        if job.state == "UNKNOWN":
                            job.state = "PREP"
                    if job.submit == None:
                        job.submit = date
                    for i in range(0, nred):
                        taskId = "task_"+jobId+"_r_"+str(i).zfill(6)
                        task = Task(taskId, jobId, "PREP", date)
                        tasks[taskId] = task
                        if taskId not in job.tasks:
                            job.tasks.append(taskId)
                    for i in range(0, nmap):
                        taskId = "task_"+jobId+"_m_"+str(i).zfill(6)
                        task = Task(taskId, jobId, "PREP", date)
                        tasks[taskId] = task
                        if taskId not in job.tasks:
                            job.tasks.append(taskId)
                    change = True
        except ValueError:
            # Lines without a leading timestamp (stack traces, etc.)
            if DEBUG > 3:
                print "Error line: "+line
        except Exception, e:
            print e
            print "Error line: "+line
        return change

    def printOutput(self):
        print "========================================================="
        print "Tasks ("+str(len(tasks))+"):"
        runn = 0
        prep = 0
        comp = 0
        unkn = 0
        for taskId in sorted(tasks):
            task = tasks[taskId]
            if task.state == "SUCCEEDED":
                comp += 1
            elif task.state == "RUNNING":
                runn += 1
            elif task.state == "PREP":
                prep += 1
            else:
                unkn += 1
            if len(tasks) < 30:
                print "  "+str(task)
        if len(tasks) >= 30:
            print "  Unknown:  "+str(unkn)
            print "  Queue:    "+str(prep)
            print "  Running:  "+str(runn)
            print "  Complete: "+str(comp)
        nodes = getNodes()
        print "Nodes->Tasks ("+str(len(nodeTasks))+"):"
        for nodeId in sorted(nodeTasks):
            out = "\t"+str(nodeId)
            if nodeId in nodes:
                for status in nodes[nodeId]:
                    out += " "+status
            if nodeId in nodeTasks and len(nodeTasks[nodeId]) > 0:
                out += ":\t"+str(nodeTasks[nodeId])
            print out
        print "Nodes->Jobs ("+str(len(nodeJobs))+"):"
        for nodeId in sorted(nodeJobs):
            out = "\t"+str(nodeId)
            if nodeId in nodes:
                for status in nodes[nodeId]:
                    out += " "+status
            if nodeId in nodeJobs and len(nodeJobs[nodeId]) > 0:
                out += ":\t"+str(nodeJobs[nodeId])
            print out
        print "Jobs ("+str(len(jobs))+"):"
        for jobId in sorted(jobs):
            out = ""
            job = jobs[jobId]
            # One colored cell per task: blue = running, green = succeeded
            for taskId in job.tasks:
                task = tasks[taskId]
                if task.state == "RUNNING":
                    out += bcolors.BLUEBG+" "+bcolors.ENDC
                elif task.state == "SUCCEEDED":
                    out += bcolors.GREENBG+" "+bcolors.ENDC
                else:
                    out += " "
            print "\t"+str(job)+"\t"+out+" "+str(len(job.tasks))
        #print "Required files ("+str(len(requiredFiles))+"):"
        #for fileId in sorted(requiredFiles):
        #    print "\t"+str(fileId)
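

# Helper thread: every 5 seconds checkStatus() reconciles the log-derived
# state with Hadoop's own view (via getJobsHadoop(), every third pass) and
# drops finished jobs and tasks from the per-node mappings.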
class MonitorMapredCheckStatus(threading.Thread):
    def __init__(self, monitor):
        threading.Thread.__init__(self)
        self.monitor = monitor
        self.running = True
        self.times = 0

    def kill(self):
        self.running = False

    def run(self):
        # Check the status periodically
        while self.running:
            self.checkStatus()
            time.sleep(5.0)

    def checkStatus(self):
        self.times += 1
        if self.times % 3 == 0:
            # Check against Hadoop's own job list
            for job in getJobsHadoop().values():
                # Update info in the local structure
                if job.id not in jobs:
                    jobs[job.id] = job
                else:
                    if job.state == "SUCCEEDED" and jobs[job.id].end == None:
                        jobs[job.id].end = job.end
                    jobs[job.id].state = job.state
                    jobs[job.id].priority = job.priority
        # Mark the tasks of succeeded jobs as succeeded
        for job in jobs.values():
            if job.state == "SUCCEEDED":
                for taskId in job.tasks:
                    task = tasks[taskId]
                    if task.end == None:
                        task.end = job.end
                    task.state = "SUCCEEDED"
        # Drop succeeded tasks from their nodes
        for task in tasks.values():
            if task.state == "SUCCEEDED":
                try:
                    if task.id in nodeTasks[task.node]:
                        nodeTasks[task.node].remove(task.id)
                except KeyError:
                    pass
        # Update node -> job
        for nodeId in nodeJobs:
            for jobId in list(nodeJobs[nodeId]):
                try:
                    job = jobs[jobId]
                    if job.state == "SUCCEEDED":
                        nodeJobs[nodeId].remove(jobId)
                except KeyError:
                    pass
                except ValueError:
                    pass
        # Update node -> task
        for nodeId in nodeTasks:
            for taskId in list(nodeTasks[nodeId]):
                try:
                    task = tasks[taskId]
                    if task.state == "SUCCEEDED" or task.node != nodeId:
                        nodeTasks[nodeId].remove(taskId)
                except KeyError:
                    pass
                except ValueError:
                    pass
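

# Helper thread: tracks which nodes HDFS considers alive. __init__ seeds an
# "ip:port" -> hostname map from "hdfs dfsadmin -printTopology"; run() then
# tails the namenode log for NetworkTopology add/remove events.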
class MonitorNodeCheckStatus(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.running = True
        # Namenode log to tail
        self.logfileHdfs = HADOOP_HOME+"/logs/hadoop-"+USER+"-namenode-"+MASTER_NODE+".log"
        self.fileHdfs = open(self.logfileHdfs, 'r')
        self.watcherHdfs = os.stat(self.logfileHdfs)
        self.this_modifiedHdfs = self.last_modifiedHdfs = self.watcherHdfs.st_mtime
        # Go to the end of the file
        self.fileHdfs.seek(0, 2)
        # Read the nodes: build an address -> hostname map from the HDFS topology
        pipe = Popen([HADOOP_HOME+"/bin/hdfs", "dfsadmin", "-printTopology"], stdout=PIPE, stderr=open('/dev/null', 'w'))
        text = pipe.communicate()[0]
        self.nodeName = {}
        for line in text.split('\n'):
            if line != "" and not line.startswith("Rack:"):
                line = line.strip()
                lineSplit = line.split(" ")
                if len(lineSplit) >= 2:
                    nodeId = lineSplit[1].replace("(", "").replace(")", "")
                    self.nodeName[lineSplit[0]] = nodeId
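
    # The "-printTopology" output is parsed above assuming its usual layout
    # (hypothetical sample):
    #   Rack: /default-rack
    #      192.168.1.11:50010 (crypt11)
    # i.e. lineSplit[0] is "ip:port" and lineSplit[1] the parenthesized hostname.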

    def kill(self):
        self.running = False

    def run(self):
        # Monitor loop: poll the namenode log for modifications
        while self.running:
            # Update from log: Namenode
            if self.this_modifiedHdfs > self.last_modifiedHdfs:
                self.last_modifiedHdfs = self.this_modifiedHdfs
                # File was modified, so parse the newly appended lines
                while True:
                    line = self.fileHdfs.readline()
                    if not line:
                        break
                    try:
                        if line.find("org.apache.hadoop.net.NetworkTopology") > 0:
                            lineSplit = line.split(" ")
                            if len(lineSplit) > 3 and lineSplit[3].startswith("org.apache.hadoop.net.NetworkTopology"):
                                date = datetime.strptime(line.split(",")[0], "%Y-%m-%d %H:%M:%S")
                                if line.find("Removing a node") >= 0:
                                    # e.g. "... NetworkTopology: Removing a node: /default-rack/<node>:50010"
                                    nodeId = lineSplit[7]
                                    nodeId = nodeId.replace("\n", "")
                                    nodeId = nodeId[nodeId.rindex("/")+1:]
                                    if nodeId in self.nodeName:
                                        nodeId = self.nodeName[nodeId]
                                    if nodeId in getNodesHdfsReady():
                                        getNodesHdfsReady().remove(nodeId)
                                    change = True
                                elif line.find("Adding a new node") >= 0:
                                    # e.g. "... NetworkTopology: Adding a new node: /default-rack/<node>:50010"
                                    nodeId = lineSplit[8]
                                    nodeId = nodeId.replace("\n", "")
                                    nodeId = nodeId[nodeId.rindex("/")+1:]
                                    if nodeId in self.nodeName:
                                        nodeId = self.nodeName[nodeId]
                                    if nodeId not in getNodesHdfsReady():
                                        getNodesHdfsReady().append(nodeId)
                                    change = True
                    except ValueError:
                        if DEBUG > 3:
                            print "Error line: "+line
                    except TypeError:
                        if DEBUG > 3:
                            print "Error line: "+line
            self.watcherHdfs = os.stat(self.logfileHdfs)
            self.this_modifiedHdfs = self.watcherHdfs.st_mtime
            time.sleep(2.0)
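

# Standalone usage: run this module directly to start the monitor with
# verbose output (DEBUG = 4); the main thread just sleeps while the monitor
# threads run, and SIGINT is routed to signal_handler from ghadoopcommons.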
if __name__ == '__main__':
    DEBUG = 4
    thread = MonitorMapred()
    thread.start()
    signal.signal(signal.SIGINT, signal_handler)
    while True:
        time.sleep(10.0)
    thread.join()