/cron/check_galaxy.sh

https://bitbucket.org/cistrome/cistrome-harvard/ · Shell · 222 lines · 188 code · 17 blank · 17 comment · 37 complexity · 0fbb17ffe6cbf77ad54cc325d109816e MD5 · raw file

  1. #!/bin/sh
  2. #set -xv
  3. #
  4. # Runs the scripts/check_galaxy.py script in a way that's easy to handle from cron
  5. #
  6. # defaults (note: default sleep is below since it depends on debug)
  7. DEBUG=0
  8. STAGGER=0
  9. INTERVAL=3
  10. MAIL=
  11. PAGE=
  12. NEWHIST=
  13. BARDARG=0
  14. # get commandline opts
  15. while getopts dsi:l:m:p:n optname
  16. do
  17. case $optname in
  18. d) DEBUG=1 ;;
  19. s) STAGGER=1 ;;
  20. i) INTERVAL=$OPTARG ;;
  21. l) SLEEP=$OPTARG ;;
  22. m) MAIL="$MAIL $OPTARG" ;;
  23. p) PAGE="$PAGE $OPTARG" ;;
  24. n) NEWHIST="-n" ;;
  25. *) BADARG=1 ;;
  26. esac
  27. done
  28. shift `expr $OPTIND - 1`
  29. if [ -z "$1" -o "$BADARG" ]; then
  30. cat <<EOF
  31. usage: `basename $0` [-ds] [-i interval] [-m email_address]+ [-p pager_address]+ <galaxy_host>"
  32. -d Print debugging information.
  33. -s Stagger mailing the pagers/emails, instead of all at once when
  34. there's a problem. Useful for running check_galaxy at night.
  35. -i <interval> The number of times this wrapper should execute before mailing
  36. the next address, when staggering is enabled. Mail is sent
  37. every <interval> runs of the program, so the actual time
  38. between emails is:
  39. time = (<interval>) * (how often wrapper runs from cron)
  40. -l <seconds> This wrapper runs check_galaxy a second time if the first check
  41. fails, in case the problem is intermittent. <seconds> is how
  42. many seconds to sleep between checks.
  43. -m <address> Email addresses to send the full check_galaxy output to, if
  44. Galaxy is down. Use multiple -m options to specify multiple
  45. addresses. When staggering, email will be sent in the order
  46. which you specify -m options on the command line.
  47. -p <address> Like -m, but sends just the last line of check_galaxy's output.
  48. Useful for pagers. When staggering is enabled and both -m and
  49. -p options are present, the first -m address and the first -p
  50. address are mailed simultaneously, followed by the second -m
  51. and second -p, and so on.
  52. -n Create a new history (passes the -n option to check_galaxy.py).
  53. <galaxy_host> The hostname of the Galaxy server to check. Use a : if running
  54. on a non-80 port (e.g. galaxy.example.com:8080).
  55. EOF
  56. exit 1
  57. fi
  58. if [ -z "$SLEEP" ]; then
  59. if [ $DEBUG ]; then
  60. SLEEP=2
  61. else
  62. SLEEP=60
  63. fi
  64. fi
  65. # globals
  66. CRON_DIR=`dirname $0`
  67. SCRIPTS_DIR="$CRON_DIR/../scripts"
  68. CHECK_GALAXY="$SCRIPTS_DIR/check_galaxy.py"
  69. VAR="$HOME/.check_galaxy"
  70. # sanity
  71. if [ ! -f $CHECK_GALAXY ]; then
  72. [ $DEBUG = 1 ] && echo "$CHECK_GALAXY is missing"
  73. exit 0
  74. fi
  75. # Do any other systems' default ps not take BSD ps args?
  76. case `uname -s` in
  77. SunOS) PS="/usr/ucb/ps" ;;
  78. *) PS="ps" ;;
  79. esac
  80. NOTIFIED_MAIL="$VAR/$1/mail"
  81. NOTIFIED_PAGE="$VAR/$1/page"
  82. MUTEX="$VAR/$1/wrap.mutex"
  83. COUNT="$VAR/$1/wrap.count"
  84. STAGGER_FILE="$VAR/$1/wrap.stagger"
  85. for dir in $VAR/$1 $NOTIFIED_MAIL $NOTIFIED_PAGE; do
  86. if [ ! -d $dir ]; then
  87. mkdir -p -m 0700 $dir
  88. if [ $? -ne 0 ]; then
  89. [ $DEBUG = 1 ] && echo "unable to create dir: $dir"
  90. exit 0
  91. fi
  92. fi
  93. done
  94. if [ ! -f "$VAR/$1/login" ]; then
  95. [ $DEBUG = 1 ] && cat <<EOF
  96. Please create the file:
  97. $VAR/$1/login
  98. This should contain a username and password to log in to
  99. Galaxy with, on one line, separated by whitespace, e.g.:
  100. check_galaxy@example.com password
  101. If the user does not exist, check_galaxy will create it
  102. for you.
  103. EOF
  104. exit 0
  105. fi
  106. if [ $STAGGER ]; then
  107. if [ -f "$STAGGER_FILE" ]; then
  108. STAGGER_COUNT=`cat $STAGGER_FILE`
  109. else
  110. STAGGER_COUNT=$INTERVAL
  111. fi
  112. fi
  113. # only run one at once
  114. if [ -f $MUTEX ]; then
  115. pid=`cat $MUTEX`
  116. $PS p $pid >/dev/null 2>&1
  117. if [ $? -eq 0 ]; then
  118. if [ -f $COUNT ]; then
  119. count=`cat $COUNT`
  120. else
  121. count=0
  122. fi
  123. if [ "$count" -eq 3 ]; then
  124. echo "A check_galaxy process for $1 has been running for an unusually long time. Something is broken." \
  125. | mail -s "$1 problems" $MAIL
  126. fi
  127. expr $count + 1 > $COUNT
  128. exit 0
  129. else
  130. # stale mutex
  131. rm -f $MUTEX
  132. fi
  133. fi
  134. rm -f $COUNT
  135. echo $$ > $MUTEX
  136. [ $DEBUG = 1 ] && echo "running first check"
  137. first_try=`$CHECK_GALAXY $NEWHIST $1 2>&1`
  138. if [ $? -ne 0 ]; then
  139. # if failure, wait and try again
  140. [ $DEBUG = 1 ] && echo "first check failed, sleeping $SLEEP seconds for second run"
  141. sleep $SLEEP
  142. else
  143. # if successful
  144. [ $DEBUG = 1 ] && echo "first check succeeded"
  145. for file in $NOTIFIED_MAIL/* $NOTIFIED_PAGE/*; do
  146. recip=`basename $file`
  147. # the literal string including the * will be passed if the dir is empty
  148. [ "$recip" = '*' ] && continue
  149. echo "$1 is now okay" | mail -s "$1 OK" $recip
  150. rm -f $file
  151. [ $DEBUG = 1 ] && echo "up: mailed $recip"
  152. done
  153. rm -f $MUTEX $STAGGER_FILE
  154. exit 0
  155. fi
  156. [ $DEBUG = 1 ] && echo "running second check"
  157. second_try=`$CHECK_GALAXY $NEWHIST $1 2>&1`
  158. if [ $? -ne 0 ]; then
  159. [ $DEBUG = 1 ] && echo "second check failed"
  160. if [ $STAGGER = 1 ]; then
  161. if [ "$STAGGER_COUNT" -eq "$INTERVAL" ]; then
  162. # send notification this run
  163. echo 1 > $STAGGER_FILE
  164. else
  165. # don't send notification this run
  166. [ $DEBUG = 1 ] && echo "$1 is down, but it's not time to send an email. STAGGER_COUNT was $STAGGER_COUNT"
  167. expr $STAGGER_COUNT + 1 > $STAGGER_FILE
  168. rm -f $MUTEX
  169. exit 0
  170. fi
  171. fi
  172. for recip in $MAIL; do
  173. if [ ! -f "$NOTIFIED_MAIL/$recip" ]; then
  174. cat <<HERE | mail -s "$1 problems" $recip
  175. $second_try
  176. HERE
  177. touch "$NOTIFIED_MAIL/$recip"
  178. [ $DEBUG = 1 ] && echo "dn: mailed $recip"
  179. [ $STAGGER = 1 ] && break
  180. fi
  181. done
  182. for recip in $PAGE; do
  183. if [ ! -f "$NOTIFIED_PAGE/$recip" ]; then
  184. cat <<HERE | tail -1 | mail -s "$1 problems" $recip
  185. $second_try
  186. HERE
  187. touch "$NOTIFIED_PAGE/$recip"
  188. [ $DEBUG = 1 ] && echo "dn: mailed $recip"
  189. [ $STAGGER = 1 ] && break
  190. fi
  191. done
  192. else
  193. [ $DEBUG = 1 ] && echo "second check succeeded"
  194. for file in $NOTIFIED_MAIL/* $NOTIFIED_PAGE/*; do
  195. recip=`basename $file`
  196. [ "$recip" = '*' ] && continue
  197. echo "$1 is now okay" | mail -s "$1 OK" $recip
  198. rm -f $file
  199. [ $DEBUG = 1 ] && echo "up: mailed $recip"
  200. done
  201. rm -f $STAGGER_FILE
  202. fi
  203. rm -f $MUTEX
  204. exit 0