/tests/t1003-slurm.sh

https://code.google.com/ · Shell · 133 lines · 96 code · 13 blank · 24 comment · 10 complexity · a9ffa2d9824d6e73446f2dd434605cae MD5 · raw file

  1. #!/bin/sh
  2. #
  3. # Run tests of the SLURM module if slurm is available and there are
  4. # any currently running jobs, or if we can start a job.
  5. #
  test_description='slurm module'

  # Load the test harness. The path is quoted so that a source directory
  # containing whitespace does not split into several words.
  . "${srcdir:-.}/test-lib.sh"

  # Skip the whole script when the MOD_MISC_SLURM prerequisite is not met.
  if ! test_have_prereq MOD_MISC_SLURM; then
    skip_all='skipping slurm tests, slurm module not available'
    test_done
  fi

  # Skip the whole script when squeue cannot be run successfully.
  if ! squeue >/dev/null 2>&1; then
    skip_all='skipping slurm tests, slurm install not available'
    test_done
  fi

  # Space-separated IDs of jobs started by this script; they are passed to
  # scancel during clean up at the end of the file.
  export KILLJOBIDS=""
  #
  # Create a batch job and return the jobid or FAILED on stdout
  #
  # Arguments: any sbatch options (e.g. -N2); passed through unchanged.
  # Outputs:   the job ID once squeue reports state "R" for it, or the
  #            word FAILED when submission produced no ID or the job was
  #            not running after 30 one-second polls.
  # Globals:   sets ID and count; appends the new job ID to KILLJOBIDS.
  #            NOTE: a caller that captures the output with $(...) runs
  #            this function in a subshell, so the KILLJOBIDS update is
  #            not visible to that caller.
  #
  create_batch_job() {
    ID=$(printf '#!/bin/sh\nsleep 100\n' | sbatch "$@" | sed 's/Submitted batch job //')

    # No ID on stdout means the submission failed; there is nothing to poll.
    if test -z "$ID"; then
      echo FAILED
      return
    fi

    # Poll once a second until the job is running, giving up after 30 tries.
    count=0
    while test "$(squeue -j "$ID" -ho %t)" != "R" && test "$count" -lt 30; do
      sleep 1
      count=$((count + 1))
    done

    if test "$count" -ge 30; then
      # The job never started: cancel it rather than leave it queued.
      scancel "$ID" >/dev/null 2>&1
      echo FAILED
    else
      KILLJOBIDS="$KILLJOBIDS $ID"
      echo "$ID"
    fi
  }
  #
  # Ensure slurm module is loaded (i.e. same as -M slurm)
  #
  export PDSH_MISC_MODULES=slurm

  # IDs of all jobs squeue currently reports as running.
  JOBIDS=$(squeue -ho %i -trunning)
  if [ -n "$JOBIDS" ]; then
    #
    # There are already running jobs we can use for testing.
    # $JOBIDS is deliberately unquoted: word splitting folds the
    # newline-separated list onto one line before the first ID is taken.
    #
    JOBID=$(echo $JOBIDS | tr ' ' '\n' | head -1)
  else
    #
    # Need to create our own job
    # (Only run if long tests were requested)
    #
    if ! test_have_prereq LONGTESTS; then
      skip_all='skipping slurm tests, run with --long or PDSH_TEST_LONG'
      test_done
    fi
    echo "Attempting to initiate slurm job" >&2
    JOBID=$(create_batch_job -N2)
    if test "$JOBID" = "FAILED"; then
      skip_all='skipping slurm tests, unable to run a job'
      test_done
    else
      # create_batch_job ran inside the $(...) subshell, so anything it
      # appended to KILLJOBIDS was lost. Record the job here so that it
      # is cancelled during clean up.
      KILLJOBIDS="$KILLJOBIDS $JOBID"
    fi
  fi
  #
  # Capture the nodes in job JOBID
  #
  NODES=$(squeue -ho %N -j "$JOBID")
  # pdsh -j<jobid> must report exactly the node list squeue shows for JOBID.
  # On a mismatch the squeue record of the job is printed to aid debugging.
  test_expect_success 'slurm -j option works' '
    O=$(pdsh -j$JOBID -q | tail -1)
    test "x$O" = "x$NODES" || {
      say_color error "Error: pdsh -j$JOBID selected nodes $O expected $NODES"
      squeue -hj $JOBID
      false
    }
  '
  # With no -j option given, the job is supplied through the SLURM_JOBID
  # environment variable; the selected nodes must still match NODES.
  # The failure message names the command that was actually run.
  test_expect_success 'slurm module reads SLURM_JOBID if no wcoll set' '
    O=$(SLURM_JOBID=$JOBID pdsh -q | tail -1)
    if test "x$O" != "x$NODES"; then
      say_color error "Error: SLURM_JOBID=$JOBID pdsh selected nodes $O expected $NODES"
      squeue -hj $JOBID
      false
    fi
  '
  # "pdsh -j all" must select the same nodes as passing every running job
  # ID explicitly. The explicit list is built by turning the spaces and
  # newlines in the squeue output into commas.
  test_expect_success 'slurm -j all option works' '
    O1=$(pdsh -j all -q | tail -1)
    O2=$(pdsh -j$(squeue -ho %i -trunning | tr " \n" ,,) -q | tail -1)
    test "$O1" = "$O2" || {
      say_color error "Error: pdsh -j all failed to select all allocated nodes"
      say_color error "a: $O1"
      say_color error "b: $O2"
      false
    }
  '
  # Start a one-node job, cancel it, wait until it has left the CG state,
  # then check that "pdsh -j all" no longer selects its node. The node may
  # still be selected legitimately if another running job occupies it.
  #
  # Every step is joined with && so that a failed job creation, squeue or
  # scancel fails the test instead of being discarded.
  test_expect_success LONGTESTS 'slurm -j all does not select completed jobs' '
    jobid=$(create_batch_job -N1) && test "$jobid" != "FAILED" &&
    node=$(squeue -ho%N -j "$jobid") &&
    scancel "$jobid" &&
    while test "$(squeue -j "$jobid" -ho %t)" = "CG"; do sleep 0.2; done &&
    if pdsh -j all -Q | tail -1 | tr , "\n" | grep "^$node$"; then
      # -w filters by node (-n is the job name in current Slurm). Empty
      # output means no running job holds the node, however many jobs
      # might otherwise share it.
      if test -z "$(squeue -trunning -w "$node" -ho%t)"; then
        say_color error "pdsh -j all selected node $node from completed job"
        false
      fi
    fi
  '
  # An unparsable job ID must be rejected with a message containing
  # "invalid setting"; stderr is merged into stdout so grep can see it.
  test_expect_success 'slurm -j option handles illegal jobid gracefully' '
    pdsh -j garbage 2>&1 |
      grep -q "invalid setting"
  '
  # pdsh -P <partition> must select the nodes sinfo lists for that partition.
  # sinfo %P marks the default partition with a trailing "*", which is not
  # part of the name, so it is stripped before the name is reused.
  test_expect_success 'slurm -P option works' '
    part=$(sinfo -ho %P | head -1 | tr -d "*")
    O1=$(sinfo -ho %N -p "$part")
    O2=$(pdsh -P "$part" -q | tail -1)
    if test "x$O1" != "x$O2"; then
      say_color error "Error: pdsh -P $part selected nodes $O2 expected $O1"
      false
    fi
  '
  #
  # Clean up: print the list of recorded job IDs, cancel them if there
  # are any, then finish the test run.
  #
  echo "$KILLJOBIDS"
  if test -n "$KILLJOBIDS"; then
    # Unquoted on purpose: each recorded ID becomes its own argument.
    scancel $KILLJOBIDS
  fi
  test_done