
/src/plugins/select/cons_res/job_test.c

https://github.com/cfenoy/slurm
C | 2550 lines | 1709 code | 199 blank | 642 comment | 519 complexity | 23a5f4d2bd51c6d284676ab4d17fcda5 MD5
Possible License(s): GPL-2.0, AGPL-1.0
  1. /*****************************************************************************\
  2. * select_cons_res.c - node selection plugin supporting consumable
  3. * resources policies.
  4. *****************************************************************************\
  5. *
  6. * The following example illustrates how four jobs are allocated
  7. * across a cluster using a processor consumable resource approach.
  8. *
  9. * The example cluster is composed of 4 nodes (10 cpus in total):
  10. * linux01 (with 2 processors),
  11. * linux02 (with 2 processors),
  12. * linux03 (with 2 processors), and
  13. * linux04 (with 4 processors).
  14. *
  15. * The four jobs are the following:
  16. * 1. srun -n 4 -N 4 sleep 120 &
  17. * 2. srun -n 3 -N 3 sleep 120 &
  18. * 3. srun -n 1 sleep 120 &
  19. * 4. srun -n 3 sleep 120 &
  20. * The user launches them in the same order as listed above.
  21. *
  22. * Using a processor consumable resource approach we get the following
  23. * job allocation and scheduling:
  24. *
  25. * The output of squeue shows that we have 3 out of the 4 jobs allocated
  26. * and running. This is an increase of two running jobs over the default
  27. * SLURM approach.
  28. *
  29. * Job 2, Job 3, and Job 4 are now running concurrently on the cluster.
  30. *
  31. * [<snip>]# squeue
  32. * JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
  33. * 5 lsf sleep root PD 0:00 1 (Resources)
  34. * 2 lsf sleep root R 0:13 4 linux[01-04]
  35. * 3 lsf sleep root R 0:09 3 linux[01-03]
  36. * 4 lsf sleep root R 0:05 1 linux04
  37. * [<snip>]#
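 * (The JOBIDs above correspond to the four srun commands listed earlier,
 * submitted as jobs 2 through 5: JOBID 2 is the 4-task/4-node job, JOBID 3
 * the 3-task/3-node job, JOBID 4 the 1-task job, and JOBID 5 the pending
 * 3-task job. With one CPU consumed per task, linux[01-03] are fully
 * allocated (2 CPUs each) and linux04 has 2 of its 4 CPUs in use, so only
 * 2 CPUs remain free and the 3-task job must pend until Job 2 completes
 * and frees one CPU on each of linux[01-03].)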
  38. *
  39. * Once Job 2 finishes, Job 5, which was pending, is allocated
  40. * available resources and is then running as illustrated below:
  41. *
  42. * [<snip>]# squeue
  43. * JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
  44. * 3 lsf sleep root R 1:58 3 linux[01-03]
  45. * 4 lsf sleep root R 1:54 1 linux04
  46. * 5 lsf sleep root R 0:02 3 linux[01-03]
  47. * [<snip>]#
  48. *
  49. * Job 3, Job 4, and Job 5 are now running concurrently on the cluster.
  50. *
  51. * [<snip>]# squeue
  52. * JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
  53. * 5 lsf sleep root R 1:52 3 xc14n[13-15]
  54. * [<snip>]#
  55. *
  56. * The advantage of the consumable resource scheduling policy is that
  57. * the job throughput can increase dramatically.
  58. *
  59. *****************************************************************************
  60. * Copyright (C) 2005-2008 Hewlett-Packard Development Company, L.P.
  61. * Written by Susanne M. Balle <susanne.balle@hp.com>, who borrowed heavily
  62. * from select/linear
  63. *
  64. * This file is part of SLURM, a resource management program.
  65. * For details, see <http://www.schedmd.com/slurmdocs/>.
  66. * Please also read the included file: DISCLAIMER.
  67. *
  68. * SLURM is free software; you can redistribute it and/or modify it under
  69. * the terms of the GNU General Public License as published by the Free
  70. * Software Foundation; either version 2 of the License, or (at your option)
  71. * any later version.
  72. *
  73. * In addition, as a special exception, the copyright holders give permission
  74. * to link the code of portions of this program with the OpenSSL library under
  75. * certain conditions as described in each individual source file, and
  76. * distribute linked combinations including the two. You must obey the GNU
  77. * General Public License in all respects for all of the code used other than
  78. * OpenSSL. If you modify file(s) with this exception, you may extend this
  79. * exception to your version of the file(s), but you are not obligated to do
  80. * so. If you do not wish to do so, delete this exception statement from your
  81. * version. If you delete this exception statement from all source files in
  82. * the program, then also delete it here.
  83. *
  84. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  85. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  86. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  87. * details.
  88. *
  89. * You should have received a copy of the GNU General Public License along
  90. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  91. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  92. \*****************************************************************************/
  93. #ifdef HAVE_CONFIG_H
  94. # include "config.h"
  95. # if HAVE_STDINT_H
  96. # include <stdint.h>
  97. # endif
  98. # if HAVE_INTTYPES_H
  99. # include <inttypes.h>
  100. # endif
  101. #endif
  102. #include <time.h>
  103. #include "dist_tasks.h"
  104. #include "job_test.h"
  105. #include "select_cons_res.h"
  106. static int _eval_nodes(struct job_record *job_ptr, bitstr_t *node_map,
  107. uint32_t min_nodes, uint32_t max_nodes,
  108. uint32_t req_nodes, uint32_t cr_node_cnt,
  109. uint16_t *cpu_cnt);
  110. static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *node_map,
  111. uint32_t min_nodes, uint32_t max_nodes,
  112. uint32_t req_nodes, uint32_t cr_node_cnt,
  113. uint16_t *cpu_cnt);
  114. /* _allocate_sockets - Given the job requirements, determine which sockets
  115. * from the given node can be allocated (if any) to this
  116. * job. Returns the number of cpus that can be used by
  117. * this node AND a core-level bitmap of the selected
  118. * sockets.
  119. *
  120. * IN job_ptr - pointer to job requirements
  121. * IN/OUT core_map - core_bitmap of available cores
  122. * IN node_i - index of node to be evaluated
  123. */
  124. uint16_t _allocate_sockets(struct job_record *job_ptr, bitstr_t *core_map,
  125. const uint32_t node_i)
  126. {
  127. uint16_t cpu_count = 0, cpu_cnt = 0;
  128. uint16_t si, cps, avail_cpus = 0, num_tasks = 0;
  129. uint32_t core_begin = cr_get_coremap_offset(node_i);
  130. uint32_t core_end = cr_get_coremap_offset(node_i+1);
  131. uint32_t c;
  132. uint16_t cpus_per_task = job_ptr->details->cpus_per_task;
  133. uint16_t *used_cores, *free_cores, free_core_count = 0;
  134. uint16_t i, j, sockets = select_node_record[node_i].sockets;
  135. uint16_t cores_per_socket = select_node_record[node_i].cores;
  136. uint16_t threads_per_core = select_node_record[node_i].vpus;
  137. uint16_t min_cores = 1, min_sockets = 1, ntasks_per_socket = 0;
  138. uint16_t ntasks_per_core = 0xffff;
  139. if (job_ptr->details && job_ptr->details->mc_ptr) {
  140. multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
  141. if (mc_ptr->cores_per_socket != (uint16_t) NO_VAL) {
  142. min_cores = mc_ptr->cores_per_socket;
  143. }
  144. if (mc_ptr->sockets_per_node != (uint16_t) NO_VAL){
  145. min_sockets = mc_ptr->sockets_per_node;
  146. }
  147. if (mc_ptr->ntasks_per_core) {
  148. ntasks_per_core = mc_ptr->ntasks_per_core;
  149. }
  150. if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
  151. (mc_ptr->threads_per_core < ntasks_per_core)) {
  152. ntasks_per_core = mc_ptr->threads_per_core;
  153. }
  154. ntasks_per_socket = mc_ptr->ntasks_per_socket;
  155. }
  156. /* These are the job parameters that we must respect:
  157. *
  158. * job_ptr->details->mc_ptr->cores_per_socket (cr_core|cr_socket)
  159. * - min # of cores per socket to allocate to this job
  160. * job_ptr->details->mc_ptr->sockets_per_node (cr_core|cr_socket)
  161. * - min # of sockets per node to allocate to this job
  162. * job_ptr->details->mc_ptr->ntasks_per_core (cr_core|cr_socket)
  163. * - number of tasks to launch per core
  164. * job_ptr->details->mc_ptr->ntasks_per_socket (cr_core|cr_socket)
  165. * - number of tasks to launch per socket
  166. *
  167. * job_ptr->details->ntasks_per_node (all cr_types)
  168. * - total number of tasks to launch on this node
  169. * job_ptr->details->cpus_per_task (all cr_types)
  170. * - number of cpus to allocate per task
  171. *
  172. * These are the hardware constraints:
  173. * cpus = sockets * cores_per_socket * threads_per_core
  174. *
  175. * These are the cores/sockets that are available: core_map
  176. *
  177. * NOTE: currently we only allocate at the socket level, the core
  178. * level, or the cpu level. When hyperthreading is enabled
  179. * in the BIOS, then there can be more than one thread/cpu
  180. * per physical core.
  181. *
  182. * PROCEDURE:
  183. *
  184. * Step 1: Determine the current usage data: used_cores[],
  185. * used_core_count, free_cores[], free_core_count
  186. *
  187. * Step 2: For core-level and socket-level: apply sockets_per_node
  188. * and cores_per_socket to the "free" cores.
  189. *
  190. * Step 3: Compute task-related data: ntasks_per_core,
  191. * ntasks_per_socket, ntasks_per_node and cpus_per_task
  192. * and determine the number of tasks to run on this node
  193. *
  194. * Step 4: Mark the allocated resources in the job_cores bitmap
  195. * and return "num_tasks" from Step 3.
  196. *
  197. *
  198. * For socket and core counts, start by assuming that all available
  199. * resources will be given to the job. Check min_* to ensure that
  200. * there's enough resources. Reduce the resource count to match max_*
  201. * (if necessary). Also reduce resource count (if necessary) to
  202. * match ntasks_per_resource.
  203. *
  204. * NOTE: Memory is not used as a constraint here - should it?
  205. * If not then it needs to be done somewhere else!
  206. */
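/* Illustrative walk-through (hypothetical numbers, not taken from the code
 * above): consider a 2-socket node with 4 cores per socket and 2 threads
 * per core, where one core on socket 0 is already allocated. Step 1 counts
 * 3 free cores on socket 0 and 4 on socket 1; because socket 0 is partially
 * in use, all of its remaining cores are excluded, leaving
 * free_core_count = 4. With the default min_cores/min_sockets of 1, Step 2
 * passes. Step 3 yields avail_cpus = 4 cores * 2 threads = 8, which may be
 * further limited by ntasks_per_core, ntasks_per_socket, ntasks_per_node
 * and cpus_per_task. Step 4 then clears the core_map bits outside the
 * selected cores and returns the resulting cpu_count. */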
  207. /* Step 1: create and compute core-count-per-socket
  208. * arrays and total core counts */
  209. free_cores = xmalloc(sockets * sizeof(uint16_t));
  210. used_cores = xmalloc(sockets * sizeof(uint16_t));
  211. for (c = core_begin; c < core_end; c++) {
  212. i = (uint16_t) (c - core_begin) / cores_per_socket;
  213. if (bit_test(core_map, c)) {
  214. free_cores[i]++;
  215. free_core_count++;
  216. } else {
  217. used_cores[i]++;
  218. }
  219. }
  220. /* if a socket is already in use, it cannot be used
  221. * by this job */
  222. for (i = 0; i < sockets; i++) {
  223. if (used_cores[i]) {
  224. free_core_count -= free_cores[i];
  225. used_cores[i] += free_cores[i];
  226. free_cores[i] = 0;
  227. }
  228. }
  229. xfree(used_cores);
  230. used_cores = NULL;
  231. /* Step 2: check min_cores per socket and min_sockets per node */
  232. j = 0;
  233. for (i = 0; i < sockets; i++) {
  234. if (free_cores[i] < min_cores) {
  235. /* cannot use this socket */
  236. free_core_count -= free_cores[i];
  237. free_cores[i] = 0;
  238. continue;
  239. }
  240. /* count this socket as usable */
  241. j++;
  242. }
  243. if (j < min_sockets) {
  244. /* cannot use this node */
  245. num_tasks = 0;
  246. goto fini;
  247. }
  248. if (free_core_count < 1) {
  249. /* no available resources on this node */
  250. num_tasks = 0;
  251. goto fini;
  252. }
  253. /* Step 3: Compute task-related data:
  254. * ntasks_per_socket, ntasks_per_node and cpus_per_task
  255. * to determine the number of tasks to run on this node
  256. *
  257. * Note: cpus_per_task and ntasks_per_core need to play nice
  258. * 2 tasks_per_core vs. 2 cpus_per_task
  259. */
  260. avail_cpus = 0;
  261. num_tasks = 0;
  262. threads_per_core = MIN(threads_per_core, ntasks_per_core);
  263. for (i = 0; i < sockets; i++) {
  264. uint16_t tmp = free_cores[i] * threads_per_core;
  265. avail_cpus += tmp;
  266. if (ntasks_per_socket)
  267. num_tasks += MIN(tmp, ntasks_per_socket);
  268. else
  269. num_tasks += tmp;
  270. }
  271. /* If the job requested exclusive rights to the node, skip the
  272. * MIN() here, since applying it would keep us from allocating
  273. * the entire node. */
  274. if (job_ptr->details->ntasks_per_node && job_ptr->details->shared)
  275. num_tasks = MIN(num_tasks, job_ptr->details->ntasks_per_node);
  276. if (cpus_per_task < 2) {
  277. avail_cpus = num_tasks;
  278. cps = num_tasks;
  279. } else {
  280. j = avail_cpus / cpus_per_task;
  281. num_tasks = MIN(num_tasks, j);
  282. if (job_ptr->details->ntasks_per_node)
  283. avail_cpus = num_tasks * cpus_per_task;
  284. }
  285. if ((job_ptr->details->ntasks_per_node &&
  286. (num_tasks < job_ptr->details->ntasks_per_node)) ||
  287. (job_ptr->details->pn_min_cpus &&
  288. (avail_cpus < job_ptr->details->pn_min_cpus))) {
  289. /* insufficient resources on this node */
  290. num_tasks = 0;
  291. goto fini;
  292. }
  293. /* Step 4 - make sure that ntasks_per_socket is enforced when
  294. * allocating cores
  295. */
  296. cps = num_tasks;
  297. if (ntasks_per_socket > 1) {
  298. cps = ntasks_per_socket;
  299. if (cpus_per_task > 1)
  300. cps = ntasks_per_socket * cpus_per_task;
  301. }
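/* At this point "cps" is the per-socket CPU budget: the number of CPUs
 * that may be taken from any one socket when ntasks_per_socket is set
 * (otherwise it is simply num_tasks). "si" tracks the socket of the
 * previously selected core; 9999 is a sentinel meaning no socket has been
 * visited yet. */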
  302. si = 9999;
  303. for (c = core_begin; c < core_end && avail_cpus > 0; c++) {
  304. if (bit_test(core_map, c) == 0)
  305. continue;
  306. i = (uint16_t) (c - core_begin) / cores_per_socket;
  307. if (free_cores[i] > 0) {
  308. /* this socket has free cores, but make sure
  309. * we don't use more than are needed for
  310. * ntasks_per_socket */
  311. if (si != i) {
  312. si = i;
  313. cpu_cnt = threads_per_core;
  314. } else {
  315. if (cpu_cnt >= cps) {
  316. /* do not allocate this core */
  317. bit_clear(core_map, c);
  318. continue;
  319. }
  320. cpu_cnt += threads_per_core;
  321. }
  322. free_cores[i]--;
  323. /* we have to ensure that cpu_count
  324. * is not bigger than avail_cpus due to
  325. * hyperthreading or this would break
  326. * the selection logic providing more
  327. * cpus than allowed after task-related data
  328. * processing of stage 3
  329. */
  330. if (avail_cpus >= threads_per_core) {
  331. avail_cpus -= threads_per_core;
  332. cpu_count += threads_per_core;
  333. }
  334. else {
  335. cpu_count += avail_cpus;
  336. avail_cpus = 0;
  337. }
  338. } else
  339. bit_clear(core_map, c);
  340. }
  341. /* clear leftovers */
  342. if (c < core_end)
  343. bit_nclear(core_map, c, core_end-1);
  344. fini:
  345. /* if num_tasks == 0 then clear all bits on this node */
  346. if (!num_tasks) {
  347. bit_nclear(core_map, core_begin, core_end-1);
  348. cpu_count = 0;
  349. }
  350. xfree(free_cores);
  351. return cpu_count;
  352. }
  353. /* _allocate_cores - Given the job requirements, determine which cores
  354. * from the given node can be allocated (if any) to this
  355. * job. Returns the number of cpus that can be used by
  356. * this node AND a bitmap of the selected cores.
  357. *
  358. * IN job_ptr - pointer to job requirements
  359. * IN/OUT core_map - bitmap of cores available for use/selected for use
  360. * IN node_i - index of node to be evaluated
  361. * IN cpu_type - if true, allocate CPUs rather than cores
  362. */
  363. uint16_t _allocate_cores(struct job_record *job_ptr, bitstr_t *core_map,
  364. const uint32_t node_i, bool cpu_type)
  365. {
  366. uint16_t avail_cpus = 0, num_tasks = 0;
  367. uint32_t core_begin = cr_get_coremap_offset(node_i);
  368. uint32_t core_end = cr_get_coremap_offset(node_i+1);
  369. uint32_t c;
  370. uint16_t cpus_per_task = job_ptr->details->cpus_per_task;
  371. uint16_t *free_cores, free_core_count = 0;
  372. uint16_t i, j, sockets = select_node_record[node_i].sockets;
  373. uint16_t cores_per_socket = select_node_record[node_i].cores;
  374. uint16_t threads_per_core = select_node_record[node_i].vpus;
  375. uint16_t min_cores = 1, min_sockets = 1;
  376. uint16_t ntasks_per_core = 0xffff;
  377. if (job_ptr->details && job_ptr->details->mc_ptr) {
  378. multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
  379. if (mc_ptr->cores_per_socket != (uint16_t) NO_VAL) {
  380. min_cores = mc_ptr->cores_per_socket;
  381. }
  382. if (mc_ptr->sockets_per_node != (uint16_t) NO_VAL){
  383. min_sockets = mc_ptr->sockets_per_node;
  384. }
  385. if (mc_ptr->ntasks_per_core) {
  386. ntasks_per_core = mc_ptr->ntasks_per_core;
  387. }
  388. if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
  389. (mc_ptr->threads_per_core < ntasks_per_core)) {
  390. ntasks_per_core = mc_ptr->threads_per_core;
  391. }
  392. }
  393. /* These are the job parameters that we must respect:
  394. *
  395. * job_ptr->details->mc_ptr->cores_per_socket (cr_core|cr_socket)
  396. * - number of cores per socket to allocate to this job
  397. * job_ptr->details->mc_ptr->sockets_per_node (cr_core|cr_socket)
  398. * - number of sockets per node to allocate to this job
  399. * job_ptr->details->mc_ptr->ntasks_per_core (cr_core|cr_socket)
  400. * - number of tasks to launch per core
  401. * job_ptr->details->mc_ptr->ntasks_per_socket (cr_core|cr_socket)
  402. * - number of tasks to launch per socket
  403. *
  404. * job_ptr->details->ntasks_per_node (all cr_types)
  405. * - total number of tasks to launch on this node
  406. * job_ptr->details->cpus_per_task (all cr_types)
  407. * - number of cpus to allocate per task
  408. *
  409. * These are the hardware constraints:
  410. * cpus = sockets * cores_per_socket * threads_per_core
  411. *
  412. * These are the cores that are available for use: core_map
  413. *
  414. * NOTE: currently we only allocate at the socket level, the core
  415. * level, or the cpu level. When hyperthreading is enabled
  416. * in the BIOS, then there can be more than one thread/cpu
  417. * per physical core.
  418. *
  419. * PROCEDURE:
  420. *
  421. * Step 1: Determine the current usage data: free_cores[] and
  422. * free_core_count
  423. *
  424. * Step 2: check min_cores per socket and min_sockets per node
  425. *
  426. * Step 3: Compute task-related data: use ntasks_per_core,
  427. * ntasks_per_node and cpus_per_task to determine
  428. * the number of tasks that can run on this node
  429. *
  430. * Step 4: Mark the allocated resources in the job_cores bitmap
  431. * and return "num_tasks" from Step 3.
  432. *
  433. *
  434. * Start by assuming that all "free" cores will be given to the
  435. * job. Check min_* to ensure that there's enough resources.
  436. * Reduce the core count to match max_* (if necessary). Also,
  437. * reduce the core count (if necessary) to match ntasks_per_core.
  438. * Note that we're not processing ntasks_per_socket, because the
  439. * srun manpage says that ntasks_per_socket is only valid for
  440. * CR_SOCKET.
  441. */
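/* Illustrative walk-through (hypothetical numbers): on a 2-socket node
 * with 4 cores per socket and 2 threads per core, with one core busy on
 * socket 0, Step 1 finds free_cores = {3, 4} and free_core_count = 7.
 * Unlike _allocate_sockets(), a partially used socket still contributes
 * its remaining free cores here. Step 3 then gives
 * avail_cpus = 7 * 2 = 14 before the ntasks_per_node and cpus_per_task
 * adjustments are applied. */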
  442. /* Step 1: create and compute core-count-per-socket
  443. * arrays and total core counts */
  444. free_cores = xmalloc(sockets * sizeof(uint16_t));
  445. for (c = core_begin; c < core_end; c++) {
  446. i = (uint16_t) (c - core_begin) / cores_per_socket;
  447. if (bit_test(core_map, c)) {
  448. free_cores[i]++;
  449. free_core_count++;
  450. }
  451. }
  452. /* Step 2: check min_cores per socket and min_sockets per node */
  453. j = 0;
  454. for (i = 0; i < sockets; i++) {
  455. if (free_cores[i] < min_cores) {
  456. /* cannot use this socket */
  457. free_core_count -= free_cores[i];
  458. free_cores[i] = 0;
  459. continue;
  460. }
  461. /* count this socket as usable */
  462. j++;
  463. }
  464. if (j < min_sockets) {
  465. /* cannot use this node */
  466. num_tasks = 0;
  467. goto fini;
  468. }
  469. if (free_core_count < 1) {
  470. /* no available resources on this node */
  471. num_tasks = 0;
  472. goto fini;
  473. }
  474. /* Step 3: Compute task-related data: use ntasks_per_core,
  475. * ntasks_per_node and cpus_per_task to determine
  476. * the number of tasks to run on this node
  477. *
  478. * Note: cpus_per_task and ntasks_per_core need to play nice
  479. * 2 tasks_per_core vs. 2 cpus_per_task
  480. */
  481. threads_per_core = MIN(threads_per_core, ntasks_per_core);
  482. num_tasks = avail_cpus = threads_per_core;
  483. /* convert from PER_CORE to TOTAL_FOR_NODE */
  484. avail_cpus *= free_core_count;
  485. num_tasks *= free_core_count;
  486. /* If the job requested exclusive rights to the node, skip the MIN() here,
  487. * since applying it would keep us from allocating the entire node */
  488. if (job_ptr->details->ntasks_per_node && job_ptr->details->shared)
  489. num_tasks = MIN(num_tasks, job_ptr->details->ntasks_per_node);
  490. if (cpus_per_task < 2) {
  491. avail_cpus = num_tasks;
  492. } else {
  493. j = avail_cpus / cpus_per_task;
  494. num_tasks = MIN(num_tasks, j);
  495. }
  496. if ((job_ptr->details->ntasks_per_node &&
  497. (num_tasks < job_ptr->details->ntasks_per_node) &&
  498. (job_ptr->details->overcommit == 0)) ||
  499. (job_ptr->details->pn_min_cpus &&
  500. (avail_cpus < job_ptr->details->pn_min_cpus))) {
  501. /* insufficient resources on this node */
  502. num_tasks = 0;
  503. goto fini;
  504. }
  505. /* Step 4 */
  506. for (c = core_begin; c < core_end && avail_cpus > 0; c++) {
  507. if (bit_test(core_map, c) == 0)
  508. continue;
  509. i = (uint16_t) (c - core_begin) / cores_per_socket;
  510. if (free_cores[i] == 0)
  511. bit_clear(core_map, c);
  512. else {
  513. free_cores[i]--;
  514. if (avail_cpus >= threads_per_core)
  515. avail_cpus -= threads_per_core;
  516. else
  517. avail_cpus = 0;
  518. }
  519. }
  520. /* clear leftovers */
  521. if (c < core_end)
  522. bit_nclear(core_map, c, core_end-1);
  523. fini:
  524. if (!num_tasks) {
  525. bit_nclear(core_map, core_begin, core_end-1);
  526. }
  527. xfree(free_cores);
  528. return num_tasks * cpus_per_task;
  529. }
  530. /*
  531. * _can_job_run_on_node - Given the job requirements, determine which
  532. * resources from the given node (if any) can be
  533. * allocated to this job. Returns the number of
  534. * cpus that can be used by this node and a bitmap
  535. * of available resources for allocation.
  536. * NOTE: This process does NOT support overcommitting resources
  537. *
  538. * IN job_ptr - pointer to job requirements
  539. * IN/OUT core_map - core_bitmap of available cores
  540. * IN n - index of node to be evaluated
  541. * IN cr_type - Consumable Resource setting
  542. * IN test_only - ignore allocated memory check
  543. *
  544. * NOTE: The returned cpu_count may be less than the number of set bits in
  545. * core_map for the given node. The cr_dist functions will determine
  546. * which bits to deselect from the core_map to match the cpu_count.
  547. */
  548. uint16_t _can_job_run_on_node(struct job_record *job_ptr, bitstr_t *core_map,
  549. const uint32_t node_i,
  550. struct node_use_record *node_usage,
  551. uint16_t cr_type,
  552. bool test_only)
  553. {
  554. uint16_t cpus;
  555. uint32_t avail_mem, req_mem, gres_cpus;
  556. int core_start_bit, core_end_bit, cpu_alloc_size;
  557. struct node_record *node_ptr = node_record_table_ptr + node_i;
  558. List gres_list;
  559. if (!test_only && IS_NODE_COMPLETING(node_ptr)) {
  560. /* Do not allocate more jobs to nodes with completing jobs */
  561. cpus = 0;
  562. return cpus;
  563. }
  564. if (cr_type & CR_CORE) {
  565. cpus = _allocate_cores(job_ptr, core_map, node_i, false);
  566. /* cpu_alloc_size = CPUs per core */
  567. cpu_alloc_size = select_node_record[node_i].vpus;
  568. } else if (cr_type & CR_SOCKET) {
  569. cpus = _allocate_sockets(job_ptr, core_map, node_i);
  570. /* cpu_alloc_size = CPUs per socket */
  571. cpu_alloc_size = select_node_record[node_i].cores *
  572. select_node_record[node_i].vpus;
  573. } else {
  574. cpus = _allocate_cores(job_ptr, core_map, node_i, true);
  575. cpu_alloc_size = 1;
  576. }
  577. core_start_bit = cr_get_coremap_offset(node_i);
  578. core_end_bit = cr_get_coremap_offset(node_i+1) - 1;
  579. node_ptr = select_node_record[node_i].node_ptr;
  580. if (cr_type & CR_MEMORY) {
  581. /* Memory Check: check pn_min_memory to see if:
  582. * - this node has enough memory (when MEM_PER_CPU is not set), or
  583. * - there are enough free CPUs for the per-CPU memory request
  584. * (when MEM_PER_CPU is set) */
  585. req_mem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU;
  586. avail_mem = select_node_record[node_i].real_memory;
  587. if (!test_only)
  588. avail_mem -= node_usage[node_i].alloc_memory;
  589. if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
  590. /* memory is per-cpu */
  591. while ((cpus > 0) && ((req_mem * cpus) > avail_mem))
  592. cpus -= cpu_alloc_size;
  593. if ((cpus < job_ptr->details->ntasks_per_node) ||
  594. ((job_ptr->details->cpus_per_task > 1) &&
  595. (cpus < job_ptr->details->cpus_per_task)))
  596. cpus = 0;
  597. /* FIXME: Need to recheck min_cores, etc. here */
  598. } else {
  599. /* memory is per node */
  600. if (req_mem > avail_mem)
  601. cpus = 0;
  602. }
  603. }
  604. if (node_usage[node_i].gres_list)
  605. gres_list = node_usage[node_i].gres_list;
  606. else
  607. gres_list = node_ptr->gres_list;
  608. gres_cpus = gres_plugin_job_test(job_ptr->gres_list,
  609. gres_list, test_only,
  610. core_map, core_start_bit,
  611. core_end_bit, job_ptr->job_id,
  612. node_ptr->name);
  613. if ((gres_cpus < job_ptr->details->ntasks_per_node) ||
  614. ((job_ptr->details->cpus_per_task > 1) &&
  615. (gres_cpus < job_ptr->details->cpus_per_task)))
  616. gres_cpus = 0;
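/* Reduce the CPU count, in cpu_alloc_size increments, until it no longer
 * exceeds what the gres plugins will support on this node. */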
  617. while (gres_cpus < cpus)
  618. cpus -= cpu_alloc_size;
  619. if (cpus == 0)
  620. bit_nclear(core_map, core_start_bit, core_end_bit);
  621. if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
  622. info("cons_res: _can_job_run_on_node: %u cpus on %s(%d), "
  623. "mem %u/%u",
  624. cpus, select_node_record[node_i].node_ptr->name,
  625. node_usage[node_i].node_state,
  626. node_usage[node_i].alloc_memory,
  627. select_node_record[node_i].real_memory);
  628. }
  629. return cpus;
  630. }
  631. /* Test to see if a node already has running jobs for _other_ partitions.
  632. * If (sharing_only) then only check sharing partitions. This is because
  633. * the job was submitted to a single-row partition which does not share
  634. * allocated CPUs with multi-row partitions.
  635. */
  636. static int _is_node_busy(struct part_res_record *p_ptr, uint32_t node_i,
  637. int sharing_only, struct part_record *my_part_ptr)
  638. {
  639. uint32_t r, cpu_begin = cr_get_coremap_offset(node_i);
  640. uint32_t i, cpu_end = cr_get_coremap_offset(node_i+1);
  641. for (; p_ptr; p_ptr = p_ptr->next) {
  642. if (sharing_only &&
  643. ((p_ptr->num_rows < 2) ||
  644. (p_ptr->part_ptr == my_part_ptr)))
  645. continue;
  646. if (!p_ptr->row)
  647. continue;
  648. for (r = 0; r < p_ptr->num_rows; r++) {
  649. if (!p_ptr->row[r].row_bitmap)
  650. continue;
  651. for (i = cpu_begin; i < cpu_end; i++) {
  652. if (bit_test(p_ptr->row[r].row_bitmap, i))
  653. return 1;
  654. }
  655. }
  656. }
  657. return 0;
  658. }
  659. /*
  660. * Determine which of these nodes are usable by this job
  661. *
  662. * Remove nodes from the bitmap that don't have enough memory or gres to
  663. * support the job.
  664. *
  665. * Return SLURM_ERROR if a required node can't be used.
  666. *
  667. * if node_state = NODE_CR_RESERVED, clear bitmap (if node is required
  668. * then should we return NODE_BUSY!?!)
  669. *
  670. * if node_state = NODE_CR_ONE_ROW, then this node can only be used by
  671. * another NODE_CR_ONE_ROW job
  672. *
  673. * if node_state = NODE_CR_AVAILABLE AND:
  674. * - job_node_req = NODE_CR_RESERVED, then we need idle nodes
  675. * - job_node_req = NODE_CR_ONE_ROW, then we need idle or non-sharing nodes
  676. */
  677. static int _verify_node_state(struct part_res_record *cr_part_ptr,
  678. struct job_record *job_ptr, bitstr_t * bitmap,
  679. uint16_t cr_type,
  680. struct node_use_record *node_usage,
  681. enum node_cr_state job_node_req)
  682. {
  683. struct node_record *node_ptr;
  684. uint32_t i, free_mem, gres_cpus, min_mem, size;
  685. List gres_list;
  686. if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
  687. uint16_t min_cpus;
  688. min_mem = job_ptr->details->pn_min_memory & (~MEM_PER_CPU);
  689. min_cpus = MAX(job_ptr->details->ntasks_per_node,
  690. job_ptr->details->pn_min_cpus);
  691. min_cpus = MAX(min_cpus, job_ptr->details->cpus_per_task);
  692. if (min_cpus > 0)
  693. min_mem *= min_cpus;
  694. } else {
  695. min_mem = job_ptr->details->pn_min_memory;
  696. }
  697. size = bit_size(bitmap);
  698. for (i = 0; i < size; i++) {
  699. if (!bit_test(bitmap, i))
  700. continue;
  701. node_ptr = select_node_record[i].node_ptr;
  702. /* node-level memory check */
  703. if ((job_ptr->details->pn_min_memory) &&
  704. (cr_type & CR_MEMORY)) {
  705. if (select_node_record[i].real_memory >
  706. node_usage[i].alloc_memory)
  707. free_mem = select_node_record[i].real_memory -
  708. node_usage[i].alloc_memory;
  709. else
  710. free_mem = 0;
  711. if (free_mem < min_mem) {
  712. debug3("cons_res: _vns: node %s no mem %u < %u",
  713. select_node_record[i].node_ptr->name,
  714. free_mem, min_mem);
  715. goto clear_bit;
  716. }
  717. }
  718. /* node-level gres check */
  719. if (node_usage[i].gres_list)
  720. gres_list = node_usage[i].gres_list;
  721. else
  722. gres_list = node_ptr->gres_list;
  723. gres_cpus = gres_plugin_job_test(job_ptr->gres_list,
  724. gres_list, true,
  725. NULL, 0, 0, job_ptr->job_id,
  726. node_ptr->name);
  727. if (gres_cpus == 0) {
  728. debug3("cons_res: _vns: node %s lacks gres",
  729. node_ptr->name);
  730. goto clear_bit;
  731. }
  732. /* exclusive node check */
  733. if (node_usage[i].node_state >= NODE_CR_RESERVED) {
  734. debug3("cons_res: _vns: node %s in exclusive use",
  735. node_ptr->name);
  736. goto clear_bit;
  737. /* non-resource-sharing node check */
  738. } else if (node_usage[i].node_state >= NODE_CR_ONE_ROW) {
  739. if ((job_node_req == NODE_CR_RESERVED) ||
  740. (job_node_req == NODE_CR_AVAILABLE)) {
  741. debug3("cons_res: _vns: node %s non-sharing",
  742. node_ptr->name);
  743. goto clear_bit;
  744. }
  745. /* cannot use this node if it is running jobs
  746. * in sharing partitions */
  747. if (_is_node_busy(cr_part_ptr, i, 1,
  748. job_ptr->part_ptr)) {
  749. debug3("cons_res: _vns: node %s sharing?",
  750. node_ptr->name);
  751. goto clear_bit;
  752. }
  753. /* node is NODE_CR_AVAILABLE - check job request */
  754. } else {
  755. if (job_node_req == NODE_CR_RESERVED) {
  756. if (_is_node_busy(cr_part_ptr, i, 0,
  757. job_ptr->part_ptr)) {
  758. debug3("cons_res: _vns: node %s busy",
  759. node_ptr->name);
  760. goto clear_bit;
  761. }
  762. } else if (job_node_req == NODE_CR_ONE_ROW) {
  763. /* cannot use this node if it is running jobs
  764. * in sharing partitions */
  765. if (_is_node_busy(cr_part_ptr, i, 1,
  766. job_ptr->part_ptr)) {
  767. debug3("cons_res: _vns: node %s vbusy",
  768. node_ptr->name);
  769. goto clear_bit;
  770. }
  771. }
  772. }
  773. continue; /* node is usable, test next node */
  774. clear_bit: /* This node is not usable by this job */
  775. bit_clear(bitmap, i);
  776. if (job_ptr->details->req_node_bitmap &&
  777. bit_test(job_ptr->details->req_node_bitmap, i))
  778. return SLURM_ERROR;
  779. }
  780. return SLURM_SUCCESS;
  781. }
  782. /* given an "avail" node_bitmap, return a corresponding "avail" core_bitmap */
  783. bitstr_t *_make_core_bitmap(bitstr_t *node_map)
  784. {
  785. uint32_t n, c, nodes, size;
  786. uint32_t coff;
  787. nodes = bit_size(node_map);
  788. size = cr_get_coremap_offset(nodes);
  789. bitstr_t *core_map = bit_alloc(size);
  790. if (!core_map)
  791. return NULL;
  792. nodes = bit_size(node_map);
  793. for (n = 0, c = 0; n < nodes; n++) {
  794. if (bit_test(node_map, n)) {
  795. coff = cr_get_coremap_offset(n+1);
  796. while (c < coff) {
  797. bit_set(core_map, c++);
  798. }
  799. }
  800. }
  801. return core_map;
  802. }
  803. /*
  804. * Determine the number of CPUs that a given job can use on a specific node
  805. * IN: job_ptr - pointer to job we are attempting to start
  806. * IN: node_index - zero origin node being considered for use
  807. * IN: cpu_cnt - array with count of CPU's available to job on each node
  808. * RET: number of usable CPUs on the identified node
  809. */
  810. static int _get_cpu_cnt(struct job_record *job_ptr, const int node_index,
  811. uint16_t *cpu_cnt)
  812. {
  813. int offset, cpus;
  814. uint16_t *layout_ptr = job_ptr->details->req_node_layout;
  815. cpus = cpu_cnt[node_index];
  816. if (layout_ptr &&
  817. bit_test(job_ptr->details->req_node_bitmap, node_index)) {
  818. offset = bit_get_pos_num(job_ptr->details->req_node_bitmap,
  819. node_index);
  820. cpus = MIN(cpus, layout_ptr[offset]);
  821. } else if (layout_ptr) {
  822. cpus = 0; /* should not happen? */
  823. }
  824. return cpus;
  825. }
  826. /* Compute resource usage for the given job on all available resources
  827. *
  828. * IN: job_ptr - pointer to the job requesting resources
  829. * IN: node_map - bitmap of available nodes
  830. * IN/OUT: core_map - bitmap of available cores
  831. * IN: cr_node_cnt - total number of nodes in the cluster
  832. * IN: cr_type - resource type
  833. * OUT: cpu_cnt - number of cpus that can be used by this job
  834. * IN: test_only - ignore allocated memory check
  835. */
  836. static void _get_res_usage(struct job_record *job_ptr, bitstr_t *node_map,
  837. bitstr_t *core_map, uint32_t cr_node_cnt,
  838. struct node_use_record *node_usage,
  839. uint16_t cr_type, uint16_t **cpu_cnt_ptr,
  840. bool test_only)
  841. {
  842. uint16_t *cpu_cnt;
  843. uint32_t n;
  844. cpu_cnt = xmalloc(cr_node_cnt * sizeof(uint16_t));
  845. for (n = 0; n < cr_node_cnt; n++) {
  846. if (!bit_test(node_map, n))
  847. continue;
  848. cpu_cnt[n] = _can_job_run_on_node(job_ptr, core_map, n,
  849. node_usage, cr_type,
  850. test_only);
  851. }
  852. *cpu_cnt_ptr = cpu_cnt;
  853. }
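/* Return true if "avail_nodes" can satisfy what is still needed: when the
 * job asked for more nodes (req_nodes) than its minimum (min_nodes), part
 * of the remaining count is optional, so only
 * rem_nodes - (req_nodes - min_nodes) nodes are strictly required. */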
  854. static bool _enough_nodes(int avail_nodes, int rem_nodes,
  855. uint32_t min_nodes, uint32_t req_nodes)
  856. {
  857. int needed_nodes;
  858. if (req_nodes > min_nodes)
  859. needed_nodes = rem_nodes + min_nodes - req_nodes;
  860. else
  861. needed_nodes = rem_nodes;
  862. return (avail_nodes >= needed_nodes);
  863. }
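/* Reduce *avail_cpus (and the node's entry in cpu_cnt) when the job does
 * not require exclusive nodes: keep only what is still needed after
 * reserving pn_min_cpus on each of the other remaining nodes, but never go
 * below pn_min_cpus on this node. */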
  864. static void _cpus_to_use(int *avail_cpus, int rem_cpus, int rem_nodes,
  865. struct job_details *details_ptr, uint16_t *cpu_cnt)
  866. {
  867. int resv_cpus; /* CPUs to be allocated on other nodes */
  868. if (details_ptr->shared == 0) /* Use all CPUs on this node */
  869. return;
  870. resv_cpus = MAX((rem_nodes - 1), 0);
  871. resv_cpus *= details_ptr->pn_min_cpus; /* At least 1 */
  872. rem_cpus -= resv_cpus;
  873. if (*avail_cpus > rem_cpus) {
  874. *avail_cpus = MAX(rem_cpus, (int)details_ptr->pn_min_cpus);
  875. *cpu_cnt = *avail_cpus;
  876. }
  877. }
  878. /* this is the heart of the selection process */
  879. static int _eval_nodes(struct job_record *job_ptr, bitstr_t *node_map,
  880. uint32_t min_nodes, uint32_t max_nodes,
  881. uint32_t req_nodes, uint32_t cr_node_cnt,
  882. uint16_t *cpu_cnt)
  883. {
  884. int i, j, error_code = SLURM_ERROR;
  885. int *consec_nodes; /* how many nodes we can add from this
  886. * consecutive set of nodes */
  887. int *consec_cpus; /* how many CPUs we can add from this
  888. * consecutive set of nodes */
  889. int *consec_start; /* where this consecutive set starts (index) */
  890. int *consec_end; /* where this consecutive set ends (index) */
  891. int *consec_req; /* are nodes from this set required
  892. * (in req_bitmap) */
  893. int consec_index, consec_size, sufficient;
  894. int rem_cpus, rem_nodes; /* remaining resources desired */
  895. int min_rem_nodes; /* minimum number of nodes still needed */
  896. int total_cpus = 0; /* #CPUs allocated to job */
  897. int best_fit_nodes, best_fit_cpus, best_fit_req;
  898. int best_fit_sufficient, best_fit_index = 0;
  899. int avail_cpus, ll; /* ll = layout array index */
  900. bool required_node;
  901. struct job_details *details_ptr = job_ptr->details;
  902. bitstr_t *req_map = details_ptr->req_node_bitmap;
  903. uint16_t *layout_ptr = details_ptr->req_node_layout;
  904. xassert(node_map);
  905. if (cr_node_cnt != node_record_count) {
  906. error("cons_res: node count inconsistent with slurmctld");
  907. return error_code;
  908. }
  909. if (bit_set_count(node_map) < min_nodes)
  910. return error_code;
  911. if ((details_ptr->req_node_bitmap) &&
  912. (!bit_super_set(details_ptr->req_node_bitmap, node_map)))
  913. return error_code;
  914. if (switch_record_cnt && switch_record_table) {
  915. /* Perform optimized resource selection based upon topology */
  916. return _eval_nodes_topo(job_ptr, node_map,
  917. min_nodes, max_nodes, req_nodes,
  918. cr_node_cnt, cpu_cnt);
  919. }
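/* The loop below partitions the candidate nodes into maximal runs of
 * consecutive node indices. For each run it records the first and last
 * index (consec_start/consec_end), the usable CPU and node totals
 * (consec_cpus/consec_nodes) and the first required node, if any
 * (consec_req). The while loop that follows then repeatedly picks the
 * best-fitting run until the job's CPU and node requirements are met. */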
  920. consec_size = 50; /* start allocation for 50 sets of
  921. * consecutive nodes */
  922. consec_cpus = xmalloc(sizeof(int) * consec_size);
  923. consec_nodes = xmalloc(sizeof(int) * consec_size);
  924. consec_start = xmalloc(sizeof(int) * consec_size);
  925. consec_end = xmalloc(sizeof(int) * consec_size);
  926. consec_req = xmalloc(sizeof(int) * consec_size);
  927. /* Build table with information about sets of consecutive nodes */
  928. consec_index = 0;
  929. consec_cpus[consec_index] = consec_nodes[consec_index] = 0;
  930. consec_req[consec_index] = -1; /* no required nodes here by default */
  931. rem_cpus = details_ptr->min_cpus;
  932. rem_nodes = MAX(min_nodes, req_nodes);
  933. min_rem_nodes = min_nodes;
  934. for (i = 0, ll = -1; i < cr_node_cnt; i++) {
  935. if (req_map)
  936. required_node = bit_test(req_map, i);
  937. else
  938. required_node = false;
  939. if (layout_ptr && required_node)
  940. ll++;
  941. if (bit_test(node_map, i)) {
  942. if (consec_nodes[consec_index] == 0)
  943. consec_start[consec_index] = i;
  944. avail_cpus = cpu_cnt[i];
  945. if (layout_ptr && required_node) {
  946. avail_cpus = MIN(avail_cpus, layout_ptr[ll]);
  947. } else if (layout_ptr) {
  948. avail_cpus = 0; /* should not happen? */
  949. }
  950. if ((max_nodes > 0) && required_node) {
  951. if (consec_req[consec_index] == -1) {
  952. /* first required node in set */
  953. consec_req[consec_index] = i;
  954. }
  955. total_cpus += avail_cpus;
  956. rem_cpus -= avail_cpus;
  957. rem_nodes--;
  958. min_rem_nodes--;
  959. /* leaving bitmap set, decrement max limit */
  960. max_nodes--;
  961. } else { /* node not selected (yet) */
  962. bit_clear(node_map, i);
  963. consec_cpus[consec_index] += avail_cpus;
  964. consec_nodes[consec_index]++;
  965. }
  966. } else if (consec_nodes[consec_index] == 0) {
  967. consec_req[consec_index] = -1;
  968. /* already picked up any required nodes */
  969. /* re-use this record */
  970. } else {
  971. consec_end[consec_index] = i - 1;
  972. if (++consec_index >= consec_size) {
  973. consec_size *= 2;
  974. xrealloc(consec_cpus, sizeof(int)*consec_size);
  975. xrealloc(consec_nodes,sizeof(int)*consec_size);
  976. xrealloc(consec_start,sizeof(int)*consec_size);
  977. xrealloc(consec_end, sizeof(int)*consec_size);
  978. xrealloc(consec_req, sizeof(int)*consec_size);
  979. }
  980. consec_cpus[consec_index] = 0;
  981. consec_nodes[consec_index] = 0;
  982. consec_req[consec_index] = -1;
  983. }
  984. }
  985. if (consec_nodes[consec_index] != 0)
  986. consec_end[consec_index++] = i - 1;
  987. if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
  988. for (i = 0; i < consec_index; i++) {
  989. info("cons_res: eval_nodes:%d consec "
  990. "c=%d n=%d b=%d e=%d r=%d",
  991. i, consec_cpus[i], consec_nodes[i],
  992. consec_start[i], consec_end[i], consec_req[i]);
  993. }
  994. }
  995. /* Compute CPUs already allocated to required nodes */
  996. if ((details_ptr->max_cpus != NO_VAL) &&
  997. (total_cpus > details_ptr->max_cpus)) {
  998. info("Job %u can't use required nodes due to max CPU limit",
  999. job_ptr->job_id);
  1000. goto fini;
  1001. }
  1002. /* accumulate nodes from these sets of consecutive nodes until */
  1003. /* sufficient resources have been accumulated */
  1004. while (consec_index && (max_nodes > 0)) {
  1005. best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
  1006. best_fit_req = -1; /* first required node, -1 if none */
  1007. for (i = 0; i < consec_index; i++) {
  1008. if (consec_nodes[i] == 0)
  1009. continue; /* no usable nodes here */
  1010. if (details_ptr->contiguous &&
  1011. details_ptr->req_node_bitmap &&
  1012. (consec_req[i] == -1))
  1013. break; /* not required nodes */
  1014. sufficient = (consec_cpus[i] >= rem_cpus) &&
  1015. _enough_nodes(consec_nodes[i], rem_nodes,
  1016. min_nodes, req_nodes);
  1017. /* if first possibility OR */
  1018. /* contains required nodes OR */
  1019. /* first set large enough for request OR */
  1020. /* tightest fit (less resource waste) OR */
  1021. /* nothing yet large enough, but this is biggest */
  1022. if ((best_fit_nodes == 0) ||
  1023. ((best_fit_req == -1) && (consec_req[i] != -1)) ||
  1024. (sufficient && (best_fit_sufficient == 0)) ||
  1025. (sufficient && (consec_cpus[i] < best_fit_cpus)) ||
  1026. (!sufficient && (consec_cpus[i] > best_fit_cpus))) {
  1027. best_fit_cpus = consec_cpus[i];
  1028. best_fit_nodes = consec_nodes[i];
  1029. best_fit_index = i;
  1030. best_fit_req = consec_req[i];
  1031. best_fit_sufficient = sufficient;
  1032. }
  1033. if (details_ptr->contiguous &&
  1034. details_ptr->req_node_bitmap) {
  1035. /* Must wait for all required nodes to be
  1036. * in a single consecutive block */
  1037. int j, other_blocks = 0;
  1038. for (j = (i+1); j < consec_index; j++) {
  1039. if (consec_req[j] != -1) {
  1040. other_blocks = 1;
  1041. break;
  1042. }
  1043. }
  1044. if (other_blocks) {
  1045. best_fit_nodes = 0;
  1046. break;
  1047. }
  1048. }
  1049. }
  1050. if (best_fit_nodes == 0)
  1051. break;
  1052. if (details_ptr->contiguous &&
  1053. ((best_fit_cpus < rem_cpus) ||
  1054. (!_enough_nodes(best_fit_nodes, rem_nodes,
  1055. min_nodes, req_nodes))))
  1056. break; /* no hole large enough */
  1057. if (best_fit_req != -1) {
  1058. /* This collection of nodes includes required ones
  1059. * select nodes from this set, first working up
  1060. * then down from the required nodes */
  1061. for (i = best_fit_req;
  1062. i <= consec_end[best_fit_index]; i++) {
  1063. if ((max_nodes <= 0) ||
  1064. ((rem_nodes <= 0) && (rem_cpus <= 0)))
  1065. break;
  1066. if (bit_test(node_map, i)) {
  1067. /* required node already in set */
  1068. continue;
  1069. }
  1070. avail_cpus = _get_cpu_cnt(job_ptr, i, cpu_cnt);
  1071. if (avail_cpus <= 0)
  1072. continue;
  1073. /* This could result in 0, but if the user
  1074. * requested nodes here we will still give
  1075. * them and then the step layout will sort
  1076. * things out. */
  1077. _cpus_to_use(&avail_cpus, rem_cpus,
  1078. min_rem_nodes,
  1079. details_ptr, &cpu_cnt[i]);
  1080. total_cpus += avail_cpus;
  1081. /* enforce the max_cpus limit */
  1082. if ((details_ptr->max_cpus != NO_VAL) &&
  1083. (total_cpus > details_ptr->max_cpus)) {
  1084. debug2("1 can't use this node "
  1085. "since it would put us "
  1086. "over the limit");
  1087. total_cpus -= avail_cpus;
  1088. continue;
  1089. }
  1090. bit_set(node_map, i);
  1091. rem_nodes--;
  1092. min_rem_nodes--;
  1093. max_nodes--;
  1094. rem_cpus -= avail_cpus;
  1095. }
  1096. for (i = (best_fit_req - 1);
  1097. i >= consec_start[best_fit_index]; i--) {
  1098. if ((max_nodes <= 0) ||
  1099. ((rem_nodes <= 0) && (rem_cpus <= 0)))
  1100. break;
  1101. if (bit_test(node_map, i))
  1102. continue;
  1103. avail_cpus = _get_cpu_cnt(job_ptr, i, cpu_cnt);
  1104. if (avail_cpus <= 0)
  1105. continue;
  1106. /* This could result in 0, but if the user
  1107. * requested nodes here we will still give
  1108. * them and then the step layout will sort
  1109. * things out. */
  1110. _cpus_to_use(&avail_cpus, rem_cpus,
  1111. min_rem_nodes,
  1112. details_ptr, &cpu_cnt[i]);
  1113. total_cpus += avail_cpus;
  1114. /* enforce the max_cpus limit */
  1115. if ((details_ptr->max_cpus != NO_VAL) &&
  1116. (total_cpus > details_ptr->max_cpus)) {
  1117. debug2("2 can't use this node "
  1118. "since it would put us "
  1119. "over the limit");
  1120. total_cpus -= avail_cpus;
  1121. continue;
  1122. }
  1123. rem_cpus -= avail_cpus;
  1124. bit_set(node_map, i);
  1125. rem_nodes--;
  1126. min_rem_nodes--;
  1127. max_nodes--;
  1128. }
  1129. } else {
  1130. /* No required nodes, try best fit single node */
  1131. int *cpus_array = NULL, array_len;
  1132. int best_fit = -1, best_size = 0;
  1133. int first = consec_start[best_fit_index];
  1134. int last = consec_end[best_fit_index];
  1135. if (rem_nodes <= 1) {
  1136. array_len = last - first + 1;
  1137. cpus_array = xmalloc(sizeof(int) * array_len);
  1138. for (i = first, j = 0; i <= last; i++, j++) {
  1139. if (bit_test(node_map, i))
  1140. continue;
  1141. cpus_array[j] = _get_cpu_cnt(job_ptr,
  1142. i, cpu_cnt);
  1143. if (cpus_array[j] < rem_cpus)
  1144. continue;
  1145. if ((best_fit == -1) ||
  1146. (cpus_array[j] < best_size)) {
  1147. best_fit = j;
  1148. best_size = cpus_array[j];
  1149. if (best_size == rem_cpus)
  1150. break;
  1151. }
  1152. }
  1153. /* If we found a single node to use,
  1154. * clear cpu counts for all other nodes */
  1155. for (i = first, j = 0;
  1156. ((i <= last) && (best_fit != -1));
  1157. i++, j++) {
  1158. if (j != best_fit)
  1159. cpus_array[j] = 0;
  1160. }
  1161. }
  1162. for (i = first, j = 0; i <= last; i++, j++) {
  1163. if ((max_nodes <= 0) ||
  1164. ((rem_nodes <= 0) && (rem_cpus <= 0)))
  1165. break;
  1166. if (bit_test(node_map, i))
  1167. continue;
  1168. if (cpus_array)
  1169. avail_cpus = cpus_array[j];
  1170. else {
  1171. avail_cpus = _get_cpu_cnt(job_ptr, i,
  1172. cpu_cnt);
  1173. }
  1174. if (avail_cpus <= 0)
  1175. continue;
  1176. if ((max_nodes == 1) &&
  1177. (avail_cpus < rem_cpus)) {
  1178. /* Job can only take one more node and
  1179. * this one has insufficient CPU */
  1180. continue;
  1181. }
  1182. /* This could result in 0, but if the user
  1183. * requested nodes here we will still give
  1184. * them and then the step layout will sort
  1185. * things out. */
  1186. _cpus_to_use(&avail_cpus, rem_cpus,
  1187. min_rem_nodes,
  1188. details_ptr, &cpu_cnt[i]);
  1189. total_cpus += avail_cpus;
  1190. /* enforce the max_cpus limit */
  1191. if ((details_ptr->max_cpus != NO_VAL) &&
  1192. (total_cpus > details_ptr->max_cpus)) {
  1193. debug2("3 can't use this node "
  1194. "since it would put us "
  1195. "over the limit");
  1196. total_cpus -= avail_cpus;
  1197. continue;
  1198. }
  1199. rem_cpus -= avail_cpus;
  1200. bit_set(node_map, i);
  1201. rem_nodes--;
  1202. min_rem_nodes--;
  1203. max_nodes--;
  1204. }
  1205. xfree(cpus_array);
  1206. }
  1207. if (details_ptr->contiguous ||
  1208. ((rem_nodes <= 0) && (rem_cpus <= 0))) {
  1209. error_code = SLURM_SUCCESS;
  1210. break;
  1211. }
  1212. consec_cpus[best_fit_index] = 0;
  1213. consec_nodes[best_fit_index] = 0;
  1214. }
  1215. if (error_code && (rem_cpus <= 0) &&
  1216. _enough_nodes(0, rem_nodes, min_nodes, req_nodes))
  1217. error_code = SLURM_SUCCESS;
  1218. fini: xfree(consec_cpus);
  1219. xfree(consec_nodes);
  1220. xfree(consec_start);
  1221. xfree(consec_end);
  1222. xfree(consec_req);
  1223. return error_code;
  1224. }
  1225. /*
  1226. * A network topology aware version of _eval_nodes().
  1227. * NOTE: The logic here is almost identical to that of _job_test_topo()
  1228. * in select_linear.c. Any bug found here is probably also there.
  1229. */
  1230. static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *bitmap,
  1231. uint32_t min_nodes, uint32_t max_nodes,
  1232. uint32_t req_nodes, uint32_t cr_node_cnt,
  1233. uint16_t *cpu_cnt)
  1234. {
  1235. bitstr_t **switches_bitmap; /* nodes on this switch */
  1236. int *switches_cpu_cnt; /* total CPUs on switch */
  1237. int *switches_node_cnt; /* total nodes on switch */
  1238. int *switches_required; /* set if has required node */
  1239. int leaf_switch_count = 0; /* Count of leaf node switches used */
  1240. bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */
  1241. bitstr_t *req_nodes_bitmap = NULL;
  1242. int rem_cpus, rem_nodes; /* remaining resources desired */
  1243. int min_rem_nodes; /* minimum number of nodes still needed */
  1244. int avail_cpus;
  1245. int total_cpus = 0; /* #CPUs allocated to job */
  1246. int i, j, rc = SLURM_SUCCESS;
  1247. int best_fit_inx, first, last;
  1248. int best_fit_nodes, best_fit_cpus;
  1249. int best_fit_location = 0, best_fit_sufficient;
  1250. bool sufficient;
  1251. long time_waiting = 0;
  1252. if (job_ptr->req_switch) {
  1253. time_t time_now;
  1254. time_now = time(NULL);
  1255. if (job_ptr->wait4switch_start == 0)
  1256. job_ptr->wait4switch_start = time_now;
  1257. time_waiting = time_now - job_ptr->wait4switch_start;
  1258. }
  1259. rem_cpus = job_ptr->details->min_cpus;
  1260. rem_nodes = MAX(min_nodes, req_nodes);
  1261. min_rem_nodes = min_nodes;
  1262. if (job_ptr->details->req_node_bitmap) {
  1263. req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap);
  1264. i = bit_set_count(req_nodes_bitmap);
  1265. if (i > max_nodes) {
  1266. info("job %u requires more nodes than currently "
  1267. "available (%u>%u)",
  1268. job_ptr->job_id, i, max_nodes);
  1269. rc = SLURM_ERROR;
  1270. goto fini;
  1271. }
  1272. }
  1273. /* Construct a set of switch array entries,
  1274. * use the same indexes as switch_record_table in slurmctld */
  1275. switches_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt);
  1276. switches_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt);
  1277. switches_node_cnt = xmalloc(sizeof(int) * switch_record_cnt);
  1278. switches_required = xmalloc(sizeof(int) * switch_record_cnt);
  1279. avail_nodes_bitmap = bit_alloc(cr_node_cnt);
  1280. for (i=0; i<switch_record_cnt; i++) {
  1281. switches_bitmap[i] = bit_copy(switch_record_table[i].
  1282. node_bitmap);
  1283. bit_and(switches_bitmap[i], bitmap);
  1284. bit_or(avail_nodes_bitmap, switches_bitmap[i]);
  1285. switches_node_cnt[i] = bit_set_count(switches_bitmap[i]);
  1286. if (req_nodes_bitmap &&
  1287. bit_overlap(req_nodes_bitmap, switches_bitmap[i])) {
  1288. switches_required[i] = 1;
  1289. }
  1290. }
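/* At this point switches_bitmap[i] holds the still-available nodes under
 * switch i, switches_node_cnt[i] their count, and switches_required[i] is
 * non-zero if a specifically required node sits under that switch. Note
 * that switches_required[] is later also used to accumulate the CPUs taken
 * from required nodes under each switch. */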
  1291. bit_nclear(bitmap, 0, cr_node_cnt - 1);
  1292. if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
  1293. for (i=0; i<switch_record_cnt; i++) {
  1294. char *node_names = NULL;
  1295. if (switches_node_cnt[i]) {
  1296. node_names = bitmap2node_name(
  1297. switches_bitmap[i]);
  1298. }
  1299. debug("switch=%s nodes=%u:%s required:%u speed:%u",
  1300. switch_record_table[i].name,
  1301. switches_node_cnt[i], node_names,
  1302. switches_required[i],
  1303. switch_record_table[i].link_speed);
  1304. xfree(node_names);
  1305. }
  1306. }
  1307. if (req_nodes_bitmap &&
  1308. (!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) {
  1309. info("job %u requires nodes not available on any switch",
  1310. job_ptr->job_id);
  1311. rc = SLURM_ERROR;
  1312. goto fini;
  1313. }
  1314. /* Check that specific required nodes are linked together */
  1315. if (req_nodes_bitmap) {
  1316. rc = SLURM_ERROR;
  1317. for (i=0; i<switch_record_cnt; i++) {
  1318. if (bit_super_set(req_nodes_bitmap,
  1319. switches_bitmap[i])) {
  1320. rc = SLURM_SUCCESS;
  1321. break;
  1322. }
  1323. }
  1324. if ( rc == SLURM_ERROR ) {
  1325. info("job %u requires nodes that are not linked "
  1326. "together", job_ptr->job_id);
  1327. goto fini;
  1328. }
  1329. }
  1330. if (req_nodes_bitmap) {
  1331. /* Accumulate specific required resources, if any */
  1332. first = bit_ffs(req_nodes_bitmap);
  1333. last = bit_fls(req_nodes_bitmap);
  1334. for (i=first; ((i<=last) && (first>=0)); i++) {
  1335. if (!bit_test(req_nodes_bitmap, i))
  1336. continue;
  1337. if (max_nodes <= 0) {
  1338. info("job %u requires nodes than allowed",
  1339. job_ptr->job_id);
  1340. rc = SLURM_ERROR;
  1341. goto fini;
  1342. }
  1343. bit_set(bitmap, i);
  1344. bit_clear(avail_nodes_bitmap, i);
  1345. avail_cpus = _get_cpu_cnt(job_ptr, i, cpu_cnt);
  1346. /* This could result in 0, but if the user
  1347. * requested nodes here we will still give
  1348. * them and then the step layout will sort
  1349. * things out. */
  1350. _cpus_to_use(&avail_cpus, rem_cpus, min_rem_nodes,
  1351. job_ptr->details, &cpu_cnt[i]);
  1352. rem_nodes--;
  1353. min_rem_nodes--;
  1354. max_nodes--;
  1355. total_cpus += avail_cpus;
  1356. rem_cpus -= avail_cpus;
  1357. for (j=0; j<switch_record_cnt; j++) {
  1358. if (!bit_test(switches_bitmap[j], i))
  1359. continue;
  1360. bit_clear(switches_bitmap[j], i);
  1361. switches_node_cnt[j]--;
  1362. /* keep track of the accumulated resources */
  1363. switches_required[j] += avail_cpus;
  1364. }
  1365. }
  1366. /* Compute CPUs already allocated to required nodes */
  1367. if ((job_ptr->details->max_cpus != NO_VAL) &&
  1368. (total_cpus > job_ptr->details->max_cpus)) {
  1369. info("Job %u can't use required node due to max CPU "
  1370. "limit", job_ptr->job_id);
  1371. rc = SLURM_ERROR;
  1372. goto fini;
  1373. }
  1374. if ((rem_nodes <= 0) && (rem_cpus <= 0))
  1375. goto fini;
  1376. /* Update bitmaps and node counts for higher-level switches */
  1377. for (j=0; j<switch_record_cnt; j++) {
  1378. if (switches_node_cnt[j] == 0)
  1379. continue;
  1380. first = bit_ffs(switches_bitmap[j]);
  1381. if (first < 0)
  1382. continue;
  1383. last = bit_fls(switches_bitmap[j]);
  1384. for (i=first; i<=last; i++) {
  1385. if (!bit_test(switches_bitmap[j], i))
  1386. continue;
  1387. if (!bit_test(avail_nodes_bitmap, i)) {
  1388. /* cleared from lower level */
  1389. bit_clear(switches_bitmap[j], i);
  1390. switches_node_cnt[j]--;
  1391. } else {
  1392. switches_cpu_cnt[j] +=
  1393. _get_cpu_cnt(job_ptr, i,
  1394. cpu_cnt);
  1395. }
  1396. }
  1397. }
  1398. } else {
  1399. /* No specific required nodes, calculate CPU counts */
  1400. for (j=0; j<switch_record_cnt; j++) {
  1401. first = bit_ffs(switches_bitmap[j]);
  1402. if (first < 0)
  1403. continue;
  1404. last = bit_fls(switches_bitmap[j]);
  1405. for (i=first; i<=last; i++) {
  1406. if (!bit_test(switches_bitmap[j], i))
  1407. continue;
  1408. switches_cpu_cnt[j] +=
  1409. _get_cpu_cnt(job_ptr, i, cpu_cnt);
  1410. }
  1411. }
  1412. }
  1413. /* Determine the lowest-level switch satisfying the request, choosing
  1414. * the best fit with respect to the specific required nodes, if any
  1415. */
  1416. best_fit_inx = -1;
  1417. for (j=0; j<switch_record_cnt; j++) {
  1418. if ((switches_cpu_cnt[j] < rem_cpus) ||
  1419. (!_enough_nodes(switches_node_cnt[j], rem_nodes,
  1420. min_nodes, req_nodes)))
  1421. continue;
  1422. if ((best_fit_inx != -1) && (req_nodes > min_nodes) &&
  1423. (switches_node_cnt[best_fit_inx] < req_nodes) &&
  1424. (switches_node_cnt[best_fit_inx] < switches_node_cnt[j])) {
  1425. /* Try to get up to the requested node count */
  1426. best_fit_inx = -1;
  1427. }
  1428. /*
  1429. * If first possibility OR
  1430. * first required switch OR
  1431. * lower level switch OR
  1432. * same level but tighter switch (less resource waste) OR
  1433. * two required switches at the same level with the same node
  1434. * count, where the latter has accumulated more CPUs than
  1435. * the former
  1436. */
  1437. if ((best_fit_inx == -1) ||
  1438. (!switches_required[best_fit_inx] && switches_required[j]) ||
  1439. (switch_record_table[j].level <
  1440. switch_record_table[best_fit_inx].level) ||
  1441. ((switch_record_table[j].level ==
  1442. switch_record_table[best_fit_inx].level) &&
  1443. (switches_node_cnt[j] < switches_node_cnt[best_fit_inx])) ||
  1444. ((switches_required[best_fit_inx] && switches_required[j]) &&
  1445. (switch_record_table[j].level ==
  1446. switch_record_table[best_fit_inx].level) &&
  1447. (switches_node_cnt[j] == switches_node_cnt[best_fit_inx]) &&
  1448. switches_required[best_fit_inx] < switches_required[j]) ) {
  1449. /* If first possibility OR */
  1450. /* current best switch not required OR */
  1451. /* current best switch required but this */
  1452. /* better one too */
  1453. if ( best_fit_inx == -1 ||
  1454. !switches_required[best_fit_inx] ||
  1455. (switches_required[best_fit_inx] &&
  1456. switches_required[j]) )
  1457. best_fit_inx = j;
  1458. }
  1459. }
	if (best_fit_inx == -1) {
		debug("job %u: best_fit topology failure : no switch "
		      "satisfying the request found", job_ptr->job_id);
		rc = SLURM_ERROR;
		goto fini;
	}
	if (!switches_required[best_fit_inx] && req_nodes_bitmap) {
		debug("job %u: best_fit topology failure : no switch "
		      "including requested nodes and satisfying the "
		      "request found", job_ptr->job_id);
		rc = SLURM_ERROR;
		goto fini;
	}
	bit_and(avail_nodes_bitmap, switches_bitmap[best_fit_inx]);

	/* Identify usable leafs (within higher switch having best fit) */
	for (j=0; j<switch_record_cnt; j++) {
		if ((switch_record_table[j].level != 0) ||
		    (!bit_super_set(switches_bitmap[j],
				    switches_bitmap[best_fit_inx]))) {
			switches_node_cnt[j] = 0;
		}
	}

	/* Select resources from these leafs on a best-fit basis */
	/* Use required switches first to minimize the total amount */
	/* of switches */
	/* compute best-switch nodes available array */
	while ((max_nodes > 0) && ((rem_nodes > 0) || (rem_cpus > 0))) {
		int *cpus_array = NULL, array_len;
		best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
		for (j=0; j<switch_record_cnt; j++) {
			if (switches_node_cnt[j] == 0)
				continue;
			sufficient = (switches_cpu_cnt[j] >= rem_cpus) &&
				     _enough_nodes(switches_node_cnt[j],
						   rem_nodes, min_nodes,
						   req_nodes);
			/* Switch j becomes the new best fit if it is: */
			/* the first possibility OR */
			/* the first required switch OR */
			/* the first one large enough for the request OR */
			/* the tightest fit (less resource waste) OR */
			/* the biggest one when none is yet large enough OR */
			/* one of two required switches with the same CPU */
			/* count, where switch j has accumulated more */
			/* required CPUs than the current best fit */
			if ((best_fit_nodes == 0) ||
			    (!switches_required[best_fit_location] &&
			     switches_required[j]) ||
			    (sufficient && (best_fit_sufficient == 0)) ||
			    (sufficient &&
			     (switches_cpu_cnt[j] < best_fit_cpus)) ||
			    ((sufficient == 0) &&
			     (switches_cpu_cnt[j] > best_fit_cpus)) ||
			    (switches_required[best_fit_location] &&
			     switches_required[j] &&
			     switches_cpu_cnt[best_fit_location] ==
			     switches_cpu_cnt[j] &&
			     switches_required[best_fit_location] <
			     switches_required[j])) {
				/* Accept switch j if this is the first */
				/* possibility, if the current best switch */
				/* is not required, or if both the current */
				/* best switch and this better one are */
				/* required */
				if ((best_fit_nodes == 0) ||
				    !switches_required[best_fit_location] ||
				    (switches_required[best_fit_location] &&
				     switches_required[j])) {
					best_fit_cpus = switches_cpu_cnt[j];
					best_fit_nodes = switches_node_cnt[j];
					best_fit_location = j;
					best_fit_sufficient = sufficient;
				}
			}
		}
		if (best_fit_nodes == 0)
			break;
		leaf_switch_count++;

		/* Select nodes from this leaf */
		first = bit_ffs(switches_bitmap[best_fit_location]);
		last  = bit_fls(switches_bitmap[best_fit_location]);

		/* compute the available cpus array for this best-fit switch */
		array_len = last - first + 1;
		cpus_array = xmalloc(sizeof(int) * array_len);
		for (i=first, j=0; ((i<=last) && (first>=0)); i++, j++) {
			if (!bit_test(switches_bitmap
				      [best_fit_location], i))
				cpus_array[j] = 0;
			else
				cpus_array[j] = _get_cpu_cnt(job_ptr, i,
							     cpu_cnt);
		}
		if (job_ptr->req_switch > 0) {
			if (time_waiting >= job_ptr->wait4switch) {
				job_ptr->best_switch = true;
				debug3("Job=%u Waited %ld sec for switches use=%d",
				       job_ptr->job_id, time_waiting,
				       leaf_switch_count);
			} else if (leaf_switch_count > job_ptr->req_switch) {
				/* Allocation is for more than requested number
				 * of switches */
				job_ptr->best_switch = false;
				debug3("Job=%u waited %ld sec for switches=%u "
				       "found=%d wait %u",
				       job_ptr->job_id, time_waiting,
				       job_ptr->req_switch,
				       leaf_switch_count,
				       job_ptr->wait4switch);
			} else {
				job_ptr->best_switch = true;
			}
		}
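
		/* Note: job_ptr->best_switch is consulted again in
		 * cr_job_test(); when it is false (more leaf switches than
		 * requested and the job has not yet waited wait4switch
		 * seconds) the candidate node selection is rejected there. */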
		/* accumulate resources from this leaf on a best-fit basis */
		while ((max_nodes > 0) && ((rem_nodes > 0) || (rem_cpus > 0))) {
			/* pick a node using a best-fit approach */
			/* if rem_cpus < 0, then search for nodes with a
			 * lower free cpu count first
			 */
			int suff = 0, bfsuff = 0, bfloc = 0, bfsize = 0;
			int ca_bfloc = 0;
			for (i=first, j=0; ((i<=last) && (first>=0));
			     i++, j++) {
				if (cpus_array[j] == 0)
					continue;
				suff = cpus_array[j] >= rem_cpus;
				if ((bfsize == 0) ||
				    (suff && !bfsuff) ||
				    (suff && (cpus_array[j] < bfsize)) ||
				    (!suff && (cpus_array[j] > bfsize))) {
					bfsuff = suff;
					bfloc = i;
					bfsize = cpus_array[j];
					ca_bfloc = j;
				}
			}

			/* no node found, break */
			if (bfsize == 0)
				break;

			/* clear resources of this node from the switch */
			bit_clear(switches_bitmap[best_fit_location], bfloc);
			switches_node_cnt[best_fit_location]--;
			switches_cpu_cnt[best_fit_location] -= bfsize;
			cpus_array[ca_bfloc] = 0;

			/* if this node was already selected through */
			/* another switch, skip it */
			if (bit_test(bitmap, bfloc)) {
				continue;
			}

			/* This could result in 0, but if the user
			 * requested nodes here we will still give
			 * them and then the step layout will sort
			 * things out. */
			_cpus_to_use(&bfsize, rem_cpus, min_rem_nodes,
				     job_ptr->details, &cpu_cnt[bfloc]);

			/* enforce the max_cpus limit */
			if ((job_ptr->details->max_cpus != NO_VAL) &&
			    (total_cpus+bfsize > job_ptr->details->max_cpus)) {
				debug2("5 can't use this node since it "
				       "would put us over the limit");
				continue;
			}

			/* take the node into account */
			bit_set(bitmap, bfloc);
			total_cpus += bfsize;
			rem_nodes--;
			min_rem_nodes--;
			max_nodes--;
			rem_cpus -= bfsize;
		}

		/* free best-switch nodes available cpus array */
		xfree(cpus_array);

		/* mark this switch as processed */
		switches_node_cnt[best_fit_location] = 0;
	}
	if ((rem_cpus <= 0) &&
	    _enough_nodes(0, rem_nodes, min_nodes, req_nodes)) {
		rc = SLURM_SUCCESS;
	} else
		rc = SLURM_ERROR;

 fini:	FREE_NULL_BITMAP(avail_nodes_bitmap);
	FREE_NULL_BITMAP(req_nodes_bitmap);
	for (i=0; i<switch_record_cnt; i++)
		FREE_NULL_BITMAP(switches_bitmap[i]);
	xfree(switches_bitmap);
	xfree(switches_cpu_cnt);
	xfree(switches_node_cnt);
	xfree(switches_required);
	return rc;
}

/* This is an intermediary step between _select_nodes() and _eval_nodes()
 * to tackle the knapsack problem. This code incrementally removes nodes
 * with low cpu counts for the job and re-evaluates each result. */
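/* Illustrative sketch (hypothetical numbers, not taken from the code):
 * if the usable cpu counts are {1, 2, 8, 8} and the first evaluation
 * fails, the retry loop below prunes non-required nodes contributing
 * only 1 cpu, re-evaluates, then prunes those contributing 2 or fewer,
 * and so on up to most_cpus - 1, skipping passes that change nothing. */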
static int _choose_nodes(struct job_record *job_ptr, bitstr_t *node_map,
			 uint32_t min_nodes, uint32_t max_nodes,
			 uint32_t req_nodes, uint32_t cr_node_cnt,
			 uint16_t *cpu_cnt)
{
	int i, count, ec, most_cpus = 0;
	bitstr_t *origmap, *reqmap = NULL;

	if (job_ptr->details->req_node_bitmap)
		reqmap = job_ptr->details->req_node_bitmap;

	/* clear nodes from the bitmap that don't have available resources */
	for (i = 0; i < cr_node_cnt; i++) {
		if (!bit_test(node_map, i))
			continue;
		/* Make sure we don't say we can use a node exclusively
		 * that is bigger than our max cpu count. */
		if (((!job_ptr->details->shared) &&
		     (job_ptr->details->max_cpus != NO_VAL) &&
		     (job_ptr->details->max_cpus < cpu_cnt[i])) ||
		     /* OR node has no CPUs */
		     (cpu_cnt[i] < 1)) {
			if (reqmap && bit_test(reqmap, i)) {
				/* can't clear a required node! */
				return SLURM_ERROR;
			}
			bit_clear(node_map, i);
		}
	}

	/* NOTE: details->min_cpus is 1 by default.
	 * Only reset max_nodes if the user explicitly set a proc count. */
	if ((job_ptr->details->min_cpus > 1) &&
	    (max_nodes > job_ptr->details->min_cpus))
		max_nodes = job_ptr->details->min_cpus;

	origmap = bit_copy(node_map);
	if (origmap == NULL)
		fatal("bit_copy malloc failure");

	ec = _eval_nodes(job_ptr, node_map, min_nodes, max_nodes,
			 req_nodes, cr_node_cnt, cpu_cnt);

	if (ec == SLURM_SUCCESS) {
		FREE_NULL_BITMAP(origmap);
		return ec;
	}

	/* This nodeset didn't work. To avoid a possible knapsack problem,
	 * incrementally remove nodes with low cpu counts and retry */
	for (i = 0; i < cr_node_cnt; i++) {
		most_cpus = MAX(most_cpus, cpu_cnt[i]);
	}

	for (count = 1; count < most_cpus; count++) {
		int nochange = 1;
		bit_or(node_map, origmap);
		for (i = 0; i < cr_node_cnt; i++) {
			if ((cpu_cnt[i] > 0) && (cpu_cnt[i] <= count)) {
				if (!bit_test(node_map, i))
					continue;
				if (reqmap && bit_test(reqmap, i))
					continue;
				nochange = 0;
				bit_clear(node_map, i);
				bit_clear(origmap, i);
			}
		}
		if (nochange)
			continue;
		ec = _eval_nodes(job_ptr, node_map, min_nodes, max_nodes,
				 req_nodes, cr_node_cnt, cpu_cnt);
		if (ec == SLURM_SUCCESS) {
			FREE_NULL_BITMAP(origmap);
			return ec;
		}
	}
	FREE_NULL_BITMAP(origmap);
	return ec;
}

/* Select the best set of resources for the given job
 * IN: job_ptr      - pointer to the job requesting resources
 * IN: min_nodes    - minimum number of nodes required
 * IN: max_nodes    - maximum number of nodes requested
 * IN: req_nodes    - number of requested nodes
 * IN/OUT: node_map - bitmap of available nodes / bitmap of selected nodes
 * IN: cr_node_cnt  - total number of nodes in the cluster
 * IN/OUT: core_map - bitmap of available cores / bitmap of selected cores
 * IN: cr_type      - resource type
 * IN: test_only    - ignore allocated memory check
 * RET - array with number of CPUs available per node or NULL if not runnable
 */
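/* Note: on success the returned array is indexed by the order of the bits
 * that remain set in node_map (one entry per selected node), not by the
 * absolute node index, and core_map is reduced to the cores of the
 * selected nodes only. */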
static uint16_t *_select_nodes(struct job_record *job_ptr, uint32_t min_nodes,
			       uint32_t max_nodes, uint32_t req_nodes,
			       bitstr_t *node_map, uint32_t cr_node_cnt,
			       bitstr_t *core_map,
			       struct node_use_record *node_usage,
			       uint16_t cr_type, bool test_only)
{
	int rc;
	uint16_t *cpu_cnt, *cpus = NULL;
	uint32_t start, n, a;
	//char str[100];
	bitstr_t *req_map = job_ptr->details->req_node_bitmap;

	if (bit_set_count(node_map) < min_nodes)
		return NULL;

	//bit_fmt(str, (sizeof(str) - 1), node_map);
	//info("_select_nodes nodemap: %s", str);
	//bit_fmt(str, (sizeof(str) - 1), core_map);
	//info("_select_nodes coremap: %s", str);

	/* get resource usage for this job from each available node */
	_get_res_usage(job_ptr, node_map, core_map, cr_node_cnt,
		       node_usage, cr_type, &cpu_cnt, test_only);

	/* clear all nodes that do not have any
	 * usable resources for this job */
	for (n = 0; n < cr_node_cnt; n++) {
		if (bit_test(node_map, n) && (cpu_cnt[n] == 0)) {
			/* no resources are available for this node */
			if (req_map && bit_test(req_map, n)) {
				/* cannot clear a required node! */
				xfree(cpu_cnt);
				return NULL;
			}
			bit_clear(node_map, n);
		}
	}

	if (bit_set_count(node_map) < min_nodes) {
		xfree(cpu_cnt);
		return NULL;
	}

	//bit_fmt(str, (sizeof(str) - 1), node_map);
	//info("_select_nodes nodemap: %s", str);

	/* choose the best nodes for the job */
	rc = _choose_nodes(job_ptr, node_map, min_nodes, max_nodes, req_nodes,
			   cr_node_cnt, cpu_cnt);

	/* if successful, sync up the core_map with the node_map, and
	 * create a cpus array */
	if (rc == SLURM_SUCCESS) {
		cpus = xmalloc(bit_set_count(node_map) * sizeof(uint16_t));
		start = 0;
		a = 0;
		for (n = 0; n < cr_node_cnt; n++) {
			if (bit_test(node_map, n)) {
				cpus[a++] = cpu_cnt[n];
				if (cr_get_coremap_offset(n) != start) {
					bit_nclear(core_map, start,
						   (cr_get_coremap_offset(n))-1);
				}
				start = cr_get_coremap_offset(n + 1);
			}
		}
		if (cr_get_coremap_offset(n) != start) {
			bit_nclear(core_map, start, cr_get_coremap_offset(n)-1);
		}
	}
	xfree(cpu_cnt);
	return cpus;
}

/* cr_job_test - does most of the real work for select_p_job_test(), which
 *	includes contiguous selection, load-leveling and max_share logic
 *
 * PROCEDURE:
 *
 * Step 1: compare nodes in "avail" bitmap with current node state data
 *         to find available nodes that match the job request
 *
 * Step 2: check resources in "avail" bitmap with allocated resources from
 *         higher priority partitions (busy resources are UNavailable)
 *
 * Step 3: select resource usage on remaining resources in "avail" bitmap
 *         for this job, with the placement influenced by existing
 *         allocations
 */
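/* Note: in SELECT_MODE_RUN_NOW a successful call also builds
 * job_ptr->job_resrcs (node/core bitmaps plus per-node cpu and memory
 * arrays); test-only and will-run calls return a status without building
 * that structure. */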
extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
			uint32_t min_nodes, uint32_t max_nodes,
			uint32_t req_nodes, int mode,
			uint16_t cr_type, enum node_cr_state job_node_req,
			uint32_t cr_node_cnt,
			struct part_res_record *cr_part_ptr,
			struct node_use_record *node_usage,
			bitstr_t *exc_core_bitmap)
{
	static int gang_mode = -1;
	int error_code = SLURM_SUCCESS, ll; /* ll = layout array index */
	uint16_t *layout_ptr = NULL;
	bitstr_t *orig_map, *avail_cores, *free_cores;
	bitstr_t *tmpcore = NULL, *reqmap = NULL;
	bool test_only;
	uint32_t c, i, k, n, csize, total_cpus, save_mem = 0;
	int32_t build_cnt;
	job_resources_t *job_res;
	struct job_details *details_ptr;
	struct part_res_record *p_ptr, *jp_ptr;
	uint16_t *cpu_count;

	if (gang_mode == -1) {
		if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
			gang_mode = 1;
		else
			gang_mode = 0;
	}

	details_ptr = job_ptr->details;
	layout_ptr = details_ptr->req_node_layout;
	reqmap = details_ptr->req_node_bitmap;

	free_job_resources(&job_ptr->job_resrcs);

	if (mode == SELECT_MODE_TEST_ONLY)
		test_only = true;
	else	/* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN */
		test_only = false;

	/* check node_state and update the node bitmap as necessary */
	if (!test_only) {
		error_code = _verify_node_state(cr_part_ptr, job_ptr,
						bitmap, cr_type, node_usage,
						job_node_req);
		if (error_code != SLURM_SUCCESS) {
			return error_code;
		}
	}

	/* This is the case if -O/--overcommit is true */
	if (details_ptr->min_cpus == details_ptr->min_nodes) {
		struct multi_core_data *mc_ptr = details_ptr->mc_ptr;
		if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
		    (mc_ptr->threads_per_core > 1))
			details_ptr->min_cpus *= mc_ptr->threads_per_core;
		if ((mc_ptr->cores_per_socket != (uint16_t) NO_VAL) &&
		    (mc_ptr->cores_per_socket > 1))
			details_ptr->min_cpus *= mc_ptr->cores_per_socket;
		if ((mc_ptr->sockets_per_node != (uint16_t) NO_VAL) &&
		    (mc_ptr->sockets_per_node > 1))
			details_ptr->min_cpus *= mc_ptr->sockets_per_node;
	}

	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("cons_res: cr_job_test: evaluating job %u on %u nodes",
		     job_ptr->job_id, bit_set_count(bitmap));
	}

	orig_map = bit_copy(bitmap);
	avail_cores = _make_core_bitmap(bitmap);

	/* test to make sure that this job can succeed with all avail_cores
	 * if 'no' then return FAIL
	 * if 'yes' then we will seek the optimal placement for this job
	 *          within avail_cores
	 */
	free_cores = bit_copy(avail_cores);
	cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes,
				  bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count == NULL) {
		/* job cannot fit */
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: cr_job_test: test 0 fail: "
			     "insufficient resources");
		}
		return SLURM_ERROR;
	} else if (test_only) {
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		xfree(cpu_count);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND)
			info("cons_res: cr_job_test: test 0 pass: test_only");
		return SLURM_SUCCESS;
	} else if (!job_ptr->best_switch) {
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		xfree(cpu_count);
		return SLURM_ERROR;
	}
	if (cr_type == CR_MEMORY) {
		/* CR_MEMORY does not care about existing CPU allocations,
		 * so we can jump right to job allocation from here */
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("cons_res: cr_job_test: test 0 pass - "
		     "job fits on given resources");
	}

	/* now that we know that this job can run with the given resources,
	 * let's factor in the existing allocations and seek the optimal set
	 * of resources for this job. Here is the procedure:
	 *
	 * Step 1: Seek idle CPUs across all partitions. If successful then
	 *         place job and exit. If not successful, then continue. Two
	 *         related items to note:
	 *          1. Jobs that don't share CPUs finish with step 1.
	 *          2. The remaining steps assume sharing or preemption.
	 *
	 * Step 2: Remove resources that are in use by higher-priority
	 *         partitions, and test that job can still succeed. If not
	 *         then exit.
	 *
	 * Step 3: Seek idle nodes among the partitions with the same
	 *         priority as the job's partition. If successful then
	 *         goto Step 6. If not then continue:
	 *
	 * Step 4: Seek placement within the job's partition. Search
	 *         row-by-row. If no placement is found, then exit. If a row
	 *         is found, then continue:
	 *
	 * Step 5: Place job and exit. FIXME! Here is where we need a
	 *         placement algorithm that recognizes existing job
	 *         boundaries and tries to "overlap jobs" as efficiently
	 *         as possible.
	 *
	 * Step 6: Place job and exit. FIXME! Here is where we use a
	 *         placement algorithm similar to Step 5 on jobs from
	 *         lower-priority partitions.
	 */
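
	/* The "test N" debug messages below correspond to the steps above:
	 * "test 0" is the feasibility check already performed on all
	 * available cores, and tests 1-4 map to Steps 1-4. */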

	/*** Step 1 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	if (exc_core_bitmap) {
		int exc_core_size  = bit_size(exc_core_bitmap);
		int free_core_size = bit_size(free_cores);
		if (exc_core_size != free_core_size) {
			/* This would indicate that cores were added to or
			 * removed from nodes in this reservation when the
			 * slurmctld daemon restarted with a new slurm.conf
			 * file. This can result in cores being lost from a
			 * reservation. */
			error("Bad core_bitmap size for reservation %s "
			      "(%d != %d), ignoring core reservation",
			      job_ptr->resv_name,
			      exc_core_size, free_core_size);
			exc_core_bitmap = NULL;	/* Clear local value */
		}
	}
	if (exc_core_bitmap) {
		char str[100];
		bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap);
		debug2("excluding cores reserved: %s", str);

		bit_not(exc_core_bitmap);
		bit_and(free_cores, exc_core_bitmap);
		bit_not(exc_core_bitmap);
	}

	/* remove all existing allocations from free_cores */
	tmpcore = bit_copy(free_cores);
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes,
				  bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if ((cpu_count) && (job_ptr->best_switch)) {
		/* job fits! We're done. */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: cr_job_test: test 1 pass - "
			     "idle resources found");
		}
		goto alloc_job;
	}

	if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) {
		/* This job CANNOT share CPUs regardless of priority,
		 * so we fail here. Note that Shared=EXCLUSIVE was already
		 * addressed in _verify_node_state() and job preemption
		 * removes jobs from simulated resource allocation map
		 * before this point. */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: cr_job_test: test 1 fail - "
			     "no idle resources available");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("cons_res: cr_job_test: test 1 fail - "
		     "not enough idle resources");
	}

	/*** Step 2 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);
	if (exc_core_bitmap) {
		bit_not(exc_core_bitmap);
		bit_and(free_cores, exc_core_bitmap);
		bit_not(exc_core_bitmap);
	}

	for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) {
		if (jp_ptr->part_ptr == job_ptr->part_ptr)
			break;
	}
	if (!jp_ptr) {
		fatal("cons_res error: could not find partition for job %u",
		      job_ptr->job_id);
	}

	/* remove existing allocations (jobs) from higher-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority <= jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	/* make these changes permanent */
	bit_copybits(avail_cores, free_cores);
	cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes,
				  bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (!cpu_count) {
		/* job needs resources that are currently in use by
		 * higher-priority jobs, so fail for now */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: cr_job_test: test 2 fail - "
			     "resources busy with higher priority jobs");
		}
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("cons_res: cr_job_test: test 2 pass - "
		     "available resources for this priority");
	}

	/*** Step 3 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove existing allocations (jobs) from same-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes,
				  bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* jobs from low-priority partitions are the only thing left
		 * in our way. for now we'll ignore them, but FIXME: we need
		 * a good placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and the low-priority
		 * jobs */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: cr_job_test: test 3 pass - "
			     "found resources");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("cons_res: cr_job_test: test 3 fail - "
		     "not enough idle resources in same priority");
	}

	/*** Step 4 ***/
	/* try to fit the job into an existing row
	 *
	 * tmpcore = worker core_bitmap
	 * free_cores = core_bitmap to be built
	 * avail_cores = static core_bitmap of all available cores
	 */
	if (jp_ptr->row == NULL) {
		/* there are no existing jobs in this partition, so place
		 * the job in avail_cores. FIXME: we still need a good
		 * placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and existing
		 * jobs in the other partitions with <= priority to
		 * this partition */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes,
					  req_nodes, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: cr_job_test: test 4 pass - "
			     "first row found");
		}
		goto alloc_job;
	}

	cr_sort_part_rows(jp_ptr);
	c = jp_ptr->num_rows;
	if (job_node_req != NODE_CR_AVAILABLE)
		c = 1;
	for (i = 0; i < c; i++) {
		if (!jp_ptr->row[i].row_bitmap)
			break;
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap);
		bit_not(tmpcore);
		bit_and(free_cores, tmpcore);
		cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes,
					  req_nodes, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (cpu_count) {
			if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
				info("cons_res: cr_job_test: test 4 pass - "
				     "row %i", i);
			}
			break;
		}
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND)
			info("cons_res: cr_job_test: test 4 fail - row %i", i);
	}

	if ((i < c) && !jp_ptr->row[i].row_bitmap) {
		/* we've found an empty row, so use it */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: cr_job_test: "
			     "test 4 trying empty row %i", i);
		}
		cpu_count = _select_nodes(job_ptr, min_nodes, max_nodes,
					  req_nodes, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
	}

	if (!cpu_count) {
		/* job can't fit into any row, so exit */
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: cr_job_test: test 4 fail - "
			     "busy partition");
		}
		goto alloc_job;
	}

	/*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 ***
	 * Note that while the job may have fit into a row, it should
	 * still be run through a good placement algorithm here that
	 * optimizes "job overlap" between this job (in these idle nodes)
	 * and existing jobs in the other partitions with <= priority to
	 * this partition */

alloc_job:
	/* at this point we've found a good set of
	 * bits to allocate to this job:
	 * - bitmap is the set of nodes to allocate
	 * - free_cores is the set of allocated cores
	 * - cpu_count is the number of cpus per allocated node
	 *
	 * Next steps are to cleanup the worker variables,
	 * create the job_resources struct,
	 * distribute the job on the bits, and exit
	 */
	FREE_NULL_BITMAP(orig_map);
	FREE_NULL_BITMAP(avail_cores);
	FREE_NULL_BITMAP(tmpcore);
	if ((!cpu_count) || (!job_ptr->best_switch)) {
		/* we were sent here to cleanup and exit */
		FREE_NULL_BITMAP(free_cores);
		if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
			info("cons_res: exiting cr_job_test with no "
			     "allocation");
		}
		return SLURM_ERROR;
	}

	/* At this point we have:
	 * - a bitmap of selected nodes
	 * - a free_cores bitmap of usable cores on each selected node
	 * - a per-alloc-node cpu_count array
	 */
	if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL))
		error_code = EINVAL;
	if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN)) {
		/* Set a reasonable value for the number of allocated CPUs.
		 * Without computing task distribution this is only a guess */
		job_ptr->total_cpus = MAX(job_ptr->details->min_cpus,
					  job_ptr->details->min_nodes);
	}
	if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("cons_res: cr_job_test: distributing job %u",
		     job_ptr->job_id);
	}

	/** create the struct_job_res **/
	job_res                   = create_job_resources();
	job_res->node_bitmap      = bit_copy(bitmap);
	job_res->nodes            = bitmap2node_name(bitmap);
	if (job_res->node_bitmap == NULL)
		fatal("bit_copy malloc failure");
	job_res->nhosts           = bit_set_count(bitmap);
	job_res->ncpus            = job_res->nhosts;
	if (job_ptr->details->ntasks_per_node)
		job_res->ncpus   *= details_ptr->ntasks_per_node;
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->min_cpus);
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->pn_min_cpus);
	job_res->node_req         = job_node_req;
	job_res->cpus             = cpu_count;
	job_res->cpus_used        = xmalloc(job_res->nhosts *
					    sizeof(uint16_t));
	job_res->memory_allocated = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));
	job_res->memory_used      = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));

	/* store the hardware data for the selected nodes */
	error_code = build_job_resources(job_res, node_record_table_ptr,
					 select_fast_schedule);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_res);
		FREE_NULL_BITMAP(free_cores);
		return error_code;
	}

	/* sync up cpus with layout_ptr, total up
	 * all cpus, and load the core_bitmap */
	ll = -1;
	total_cpus = 0;
	c = 0;
	csize = bit_size(job_res->core_bitmap);
	for (i = 0, n = 0; n < cr_node_cnt; n++) {
		uint32_t j;
		if (layout_ptr && reqmap && bit_test(reqmap, n))
			ll++;
		if (bit_test(bitmap, n) == 0)
			continue;
		j = cr_get_coremap_offset(n);
		k = cr_get_coremap_offset(n + 1);
		for (; j < k; j++, c++) {
			if (bit_test(free_cores, j)) {
				if (c >= csize) {
					error("cons_res: cr_job_test "
					      "core_bitmap index error on "
					      "node %s",
					      select_node_record[n].node_ptr->
					      name);
					drain_nodes(select_node_record[n].
						    node_ptr->name,
						    "Bad core count",
						    getuid());
					free_job_resources(&job_res);
					FREE_NULL_BITMAP(free_cores);
					return SLURM_ERROR;
				}
				bit_set(job_res->core_bitmap, c);
			}
		}
		if (layout_ptr && reqmap && bit_test(reqmap, n)) {
			job_res->cpus[i] = MIN(job_res->cpus[i],
					       layout_ptr[ll]);
		} else if (layout_ptr) {
			job_res->cpus[i] = 0;
		}
		total_cpus += job_res->cpus[i];
		i++;
	}

	/* When 'srun --overcommit' is used, ncpus is set to a minimum value
	 * in order to allocate the appropriate number of nodes based on the
	 * job request.
	 * For cons_res, all available logical processors will be allocated on
	 * each allocated node in order to accommodate the overcommit request.
	 */
	if (details_ptr->overcommit && details_ptr->num_tasks)
		job_res->ncpus = MIN(total_cpus, details_ptr->num_tasks);

	if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
		info("cons_res: cr_job_test: job %u ncpus %u cbits "
		     "%u/%u nbits %u", job_ptr->job_id,
		     job_res->ncpus, bit_set_count(free_cores),
		     bit_set_count(job_res->core_bitmap), job_res->nhosts);
	}
	FREE_NULL_BITMAP(free_cores);

	/* distribute the tasks and clear any unused cores */
	job_ptr->job_resrcs = job_res;
	error_code = cr_dist(job_ptr, cr_type);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_ptr->job_resrcs);
		return error_code;
	}

	/* translate job_res->cpus array into format with rep count */
	build_cnt = build_job_resources_cpu_array(job_res);
	if (build_cnt >= 0)
		job_ptr->total_cpus = build_cnt;
	else
		job_ptr->total_cpus = total_cpus;	/* best guess */

	if (!(cr_type & CR_MEMORY))
		return error_code;

	/* load memory allocated array */
	save_mem = details_ptr->pn_min_memory;
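	/* Note: pn_min_memory carries the MEM_PER_CPU flag in its high bit;
	 * when the flag is set, the remaining bits are a per-cpu value that
	 * is scaled below by the number of cpus allocated on each node. */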
	if (save_mem & MEM_PER_CPU) {
		/* memory is per-cpu */
		save_mem &= (~MEM_PER_CPU);
		for (i = 0; i < job_res->nhosts; i++) {
			job_res->memory_allocated[i] = job_res->cpus[i] *
						       save_mem;
		}
	} else {
		/* memory is per-node */
		for (i = 0; i < job_res->nhosts; i++) {
			job_res->memory_allocated[i] = save_mem;
		}
	}

	return error_code;
}