PageRenderTime 28ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/contrib/collect_sge_job_timings.sh

https://bitbucket.org/cistrome/cistrome-harvard/
Shell | 126 lines | 108 code | 5 blank | 13 comment | 2 complexity | 8db239898322d04da08fb4d6a6f28282 MD5 | raw file
  1. #!/bin/sh
  2. ##
  3. ## CHANGE ME to galaxy's database name
  4. ##
  5. DATABASE=galaxyprod
  6. ##
  7. ## AWK script to extract the relevant fields of SGE's qacct report
  8. ## and write them all in one line.
  9. AWKSCRIPT='
  10. $1=="jobnumber" { job_number = $2 }
  11. $1=="qsub_time" { qsub_time = $2 }
  12. $1=="start_time" { start_time = $2 }
  13. $1=="end_time" { end_time = $2
  14. print job_number, qsub_time, start_time, end_time
  15. }
  16. '
  17. FIFO=$(mktemp -u) || exit 1
  18. mkfifo "$FIFO" || exit 1
  19. ##
  20. ## Write the SGE/QACCT job report into a pipe
  21. ## (later will be loaded into a temporary table)
  22. qacct -j |
  23. egrep "jobnumber|qsub_time|start_time|end_time" |
  24. sed 's/ */\t/' |
  25. awk -v FS="\t" -v OFS="\t" "$AWKSCRIPT" |
  26. grep -v -- "-/-" > "$FIFO" &
  27. ##
  28. ## The SQL to generate the report
  29. ##
  30. SQL="
  31. --
  32. -- Temporary table which contains the qsub/start/end times, based on SGE's qacct report.
  33. --
  34. CREATE TEMPORARY TABLE sge_times (
  35. sge_job_id INTEGER PRIMARY KEY,
  36. qsub_time TIMESTAMP WITHOUT TIME ZONE,
  37. start_time TIMESTAMP WITHOUT TIME ZONE,
  38. end_time TIMESTAMP WITHOUT TIME ZONE
  39. );
  40. COPY sge_times FROM '$FIFO' ;
  41. --
  42. -- Temporary table which contains a unified view of all galaxy jobs.
  43. -- for each job:
  44. -- the user name, total input size (bytes), and input file types, DBKEY
  45. -- creation time, update time, SGE job runner parameters
  46. -- If a job had more than one input file, then some parameters might not be accurate (e.g. DBKEY)
  47. -- as one will be chosen arbitrarily
  48. CREATE TEMPORARY TABLE job_input_sizes AS
  49. SELECT
  50. job.job_runner_external_id as job_runner_external_id,
  51. min(job.id) as job_id,
  52. min(job.create_time) as job_create_time,
  53. min(job.update_time) as job_update_time,
  54. min(galaxy_user.email) as email,
  55. min(job.tool_id) as tool_name,
  56. -- This hack requires a user-custom aggregate function, comment it out for now
  57. -- textcat_all(hda.extension || ' ') as file_types,
  58. sum(dataset.file_size) as total_input_size,
  59. count(dataset.file_size) as input_dataset_count,
  60. min(job.job_runner_name) as job_runner_name,
  61. -- This hack tries to extract the DBKEY attribute from the metadata JSON string
  62. min(substring(encode(metadata,'escape') from '\"dbkey\": \\\\[\"(.*?)\"\\\\]')) as dbkey
  63. FROM
  64. job,
  65. galaxy_user,
  66. job_to_input_dataset,
  67. history_dataset_association hda,
  68. dataset
  69. WHERE
  70. job.user_id = galaxy_user.id
  71. AND
  72. job.id = job_to_input_dataset.job_id
  73. AND
  74. hda.id = job_to_input_dataset.dataset_id
  75. AND
  76. dataset.id = hda.dataset_id
  77. AND
  78. job.job_runner_external_id is not NULL
  79. GROUP BY
  80. job.job_runner_external_id;
  81. --
  82. -- Join the two temporary tables, create a nice report
  83. --
  84. SELECT
  85. job_input_sizes.job_runner_external_id as sge_job_id,
  86. job_input_sizes.job_id as galaxy_job_id,
  87. job_input_sizes.email,
  88. job_input_sizes.tool_name,
  89. -- ## SEE previous query for commented-out filetypes field
  90. -- job_input_sizes.file_types,
  91. job_input_sizes.job_runner_name as sge_params,
  92. job_input_sizes.dbkey,
  93. job_input_sizes.total_input_size,
  94. job_input_sizes.input_dataset_count,
  95. job_input_sizes.job_update_time - job_input_sizes.job_create_time as galaxy_total_time,
  96. sge_times.end_time - sge_times.qsub_time as sge_total_time,
  97. sge_times.start_time - sge_times.qsub_time as sge_waiting_time,
  98. sge_times.end_time - sge_times.start_time as sge_running_time,
  99. job_input_sizes.job_create_time as galaxy_job_create_time
  100. -- ## no need to show the exact times, the deltas (above) are informative enough
  101. -- job_input_sizes.job_update_time as galaxy_job_update_time,
  102. -- sge_times.qsub_time as sge_qsub_time,
  103. -- sge_times.start_time as sge_start_time,
  104. -- sge_times.end_time as sge_end_time
  105. FROM
  106. job_input_sizes
  107. LEFT OUTER JOIN
  108. SGE_TIMES
  109. ON (job_input_sizes.job_runner_external_id = sge_times.sge_job_id)
  110. ORDER BY
  111. galaxy_job_create_time
  112. "
  113. echo "$SQL" | psql --pset "footer=off" -F" " -A --quiet "$DATABASE"