PageRenderTime 18ms CodeModel.GetById 10ms app.highlight 5ms RepoModel.GetById 1ms app.codeStats 0ms

/contrib/collect_sge_job_timings.sh

https://bitbucket.org/cistrome/cistrome-harvard/
Shell | 126 lines | 108 code | 5 blank | 13 comment | 2 complexity | 8db239898322d04da08fb4d6a6f28282 MD5 | raw file
  1#!/bin/sh
  2
  3##
  4## CHANGE ME to galaxy's database name
  5## 
  6DATABASE=galaxyprod
  7
  8##
  9## AWK script to extract the relevant fields of SGE's qacct report
 10##   and write them all in one line.
 11AWKSCRIPT='
 12$1=="jobnumber" { job_number = $2 }
 13$1=="qsub_time" { qsub_time = $2 }
 14$1=="start_time" { start_time = $2 }
 15$1=="end_time" { end_time = $2 
 16        print job_number, qsub_time, start_time, end_time
 17}
 18'
 19
 20FIFO=$(mktemp -u) || exit 1
 21mkfifo "$FIFO" || exit 1
 22
 23##
 24## Write the SGE/QACCT job report into a pipe
 25## (later will be loaded into a temporary table)
 26qacct -j |
 27    egrep "jobnumber|qsub_time|start_time|end_time" |
 28    sed 's/  */\t/'  |
 29    awk -v FS="\t" -v OFS="\t" "$AWKSCRIPT" |
 30    grep -v -- "-/-" > "$FIFO" &
 31
 32##
 33##  The SQL to generate the report
 34##
 35SQL="
 36--
 37-- Temporary table which contains the qsub/start/end times, based on SGE's qacct report.
 38--
 39CREATE TEMPORARY TABLE sge_times (
 40  sge_job_id INTEGER PRIMARY KEY,
 41  qsub_time TIMESTAMP WITHOUT TIME ZONE,
 42  start_time TIMESTAMP WITHOUT TIME ZONE,
 43  end_time TIMESTAMP WITHOUT TIME ZONE
 44);
 45
 46COPY sge_times FROM '$FIFO' ;
 47
 48--
 49-- Temporary table which contains a unified view of all galaxy jobs.
 50-- for each job:
 51--   the user name, total input size (bytes), and input file types, DBKEY
 52--   creation time, update time, SGE job runner parameters
 53-- If a job had more than one input file, then some parameters might not be accurate (e.g. DBKEY)
 54-- as one will be chosen arbitrarily
 55CREATE TEMPORARY TABLE job_input_sizes AS
 56SELECT
 57 job.job_runner_external_id as job_runner_external_id,
 58 min(job.id) as job_id,
 59 min(job.create_time) as job_create_time,
 60 min(job.update_time) as job_update_time,
 61 min(galaxy_user.email) as email,
 62 min(job.tool_id) as tool_name,
 63-- This hack requires a user-custom aggregate function, comment it out for now
 64-- textcat_all(hda.extension || ' ') as file_types,
 65 sum(dataset.file_size) as total_input_size,
 66 count(dataset.file_size) as input_dataset_count,
 67 min(job.job_runner_name) as job_runner_name,
 68-- This hack tries to extract the DBKEY attribute from the metadata JSON string
 69 min(substring(encode(metadata,'escape') from '\"dbkey\": \\\\[\"(.*?)\"\\\\]')) as dbkey
 70FROM
 71 job,
 72 galaxy_user,
 73 job_to_input_dataset,
 74 history_dataset_association hda,
 75 dataset
 76WHERE
 77 job.user_id = galaxy_user.id
 78 AND
 79 job.id = job_to_input_dataset.job_id
 80 AND
 81 hda.id = job_to_input_dataset.dataset_id
 82 AND
 83 dataset.id = hda.dataset_id
 84 AND
 85 job.job_runner_external_id is not NULL
 86GROUP BY
 87 job.job_runner_external_id;
 88
 89
 90--
 91-- Join the two temporary tables, create a nice report
 92--
 93SELECT
 94 job_input_sizes.job_runner_external_id as sge_job_id,
 95 job_input_sizes.job_id as galaxy_job_id,
 96 job_input_sizes.email,
 97 job_input_sizes.tool_name,
 98-- ## SEE previous query for commented-out filetypes field
 99-- job_input_sizes.file_types,
100 job_input_sizes.job_runner_name as sge_params,
101 job_input_sizes.dbkey,
102 job_input_sizes.total_input_size,
103 job_input_sizes.input_dataset_count,
104 job_input_sizes.job_update_time - job_input_sizes.job_create_time as galaxy_total_time,
105 sge_times.end_time - sge_times.qsub_time as sge_total_time,
106 sge_times.start_time - sge_times.qsub_time as sge_waiting_time,
107 sge_times.end_time - sge_times.start_time as sge_running_time,
108 job_input_sizes.job_create_time as galaxy_job_create_time
109-- ## no need to show the exact times, the deltas (above) are informative enough
110-- job_input_sizes.job_update_time as galaxy_job_update_time,
111-- sge_times.qsub_time as sge_qsub_time,
112-- sge_times.start_time as sge_start_time,
113-- sge_times.end_time as sge_end_time
114FROM
115 job_input_sizes
116LEFT OUTER JOIN
117 SGE_TIMES
118ON (job_input_sizes.job_runner_external_id = sge_times.sge_job_id)
119ORDER BY
120 galaxy_job_create_time
121 
122"
123
124echo "$SQL" | psql --pset "footer=off" -F"  " -A --quiet "$DATABASE"
125
126