
/src/plugins/select/cons_res/dist_tasks.c

https://github.com/cfenoy/slurm
/*****************************************************************************\
 *  dist_tasks - Assign task count to {socket,core,thread} or CPU
 *               resources
 *****************************************************************************
 *  Copyright (C) 2006-2008 Hewlett-Packard Development Company, L.P.
 *  Written by Susanne M. Balle, <susanne.balle@hp.com>
 *  CODE-OCEC-09-009. All rights reserved.
 *  Portions copyright (C) 2012 Bull
 *  Written by Martin Perry <martin.perry@bull.com>
 *
 *  This file is part of SLURM, a resource management program.
 *  For details, see <http://www.schedmd.com/slurmdocs/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "select_cons_res.h"
#include "dist_tasks.h"

#if(0)
/* Using CR_SOCKET or CR_SOCKET_MEMORY will not allocate a socket to more
 * than one job at a time, but it also will not grant a job access to more
 * CPUs on the socket than requested. If ALLOCATE_FULL_SOCKET is defined,
 * then a job will be given access to every core on each allocated socket.
 */
#define ALLOCATE_FULL_SOCKET 1
#endif
/* Max boards supported for best-fit across boards */
/* Larger board configurations may require new algorithm */
/* for acceptable performance */
#define MAX_BOARDS 8

/* Combination counts
 * comb_counts[n-1][k-1] = number of combinations of
 * k items from a set of n items
 *
 * Formula is n!/(k!(n-k)!)
 */
uint32_t comb_counts[MAX_BOARDS][MAX_BOARDS] =
        {{1,  0,  0,  0,  0,  0, 0, 0},
         {2,  1,  0,  0,  0,  0, 0, 0},
         {3,  3,  1,  0,  0,  0, 0, 0},
         {4,  6,  4,  1,  0,  0, 0, 0},
         {5, 10, 10,  5,  1,  0, 0, 0},
         {6, 15, 20, 15,  6,  1, 0, 0},
         {7, 21, 35, 35, 21,  7, 1, 0},
         {8, 28, 56, 70, 56, 28, 8, 1}};
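
/* Example: comb_counts[3][1] == 6 is C(4,2), the six pairs shown in
 * the _gen_combs() example below. */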
/* Generate all combinations of k integers from the
 * set of integers 0 to n-1.
 * Return combinations in comb_list.
 *
 * Example: For k = 2 and n = 4, there are six
 * combinations:
 * {0,1},{0,2},{0,3},{1,2},{1,3},{2,3}
 */
void _gen_combs(int *comb_list, int n, int k)
{
        int *comb = xmalloc(k * sizeof(int));
        int i, b;

        /* Setup comb for the initial combination */
        for (i = 0; i < k; ++i)
                comb[i] = i;
        b = 0;

        /* Generate all the other combinations */
        while (1) {
                for (i = 0; i < k; i++) {
                        comb_list[b + i] = comb[i];
                }
                b += k;
                i = k - 1;
                ++comb[i];
                /* Carry into lower positions. Stop at i == 0 rather than
                 * i >= 0 so that the final carry never writes to comb[-1];
                 * the termination test below handles the i == 0 case. */
                while ((i > 0) && (comb[i] >= n - k + 1 + i)) {
                        --i;
                        ++comb[i];
                }
                if (comb[0] > n - k)
                        break;  /* No more combinations */
                for (i = i + 1; i < k; ++i)
                        comb[i] = comb[i - 1] + 1;
        }
        xfree(comb);
}
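
/* Usage note: the caller must size comb_list to hold
 * comb_counts[n-1][k-1] * k ints.  For n = 4 and k = 2 that is
 * 6 * 2 = 12 ints, filled as {0,1, 0,2, 0,3, 1,2, 1,3, 2,3}. */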
/* _compute_c_b_task_dist - compute the number of tasks on each
 * of the nodes for the cyclic and block distributions. We need to do
 * this in the case of consumable resources so that we have an exact
 * count of the needed hardware resources, which will be used later to
 * update the per-node used-resource structures.
 *
 * The most common case is when we have more resources than needed. In
 * that case we just "take" what we need and "release" the remaining
 * resources for other jobs. In the case where we oversubscribe the
 * CPUs/logical processors we keep the initial set of
 * resources.
 *
 * IN/OUT job_ptr - pointer to job being scheduled. The per-node
 *                  job_res->cpus array is recomputed here.
 */
static int _compute_c_b_task_dist(struct job_record *job_ptr)
{
        bool over_subscribe = false;
        uint32_t n, i, tid, maxtasks, l;
        uint16_t *avail_cpus;
        job_resources_t *job_res = job_ptr->job_resrcs;

        if (!job_res || !job_res->cpus) {
                error("cons_res: _compute_c_b_task_dist given NULL job_res");
                return SLURM_ERROR;
        }

        maxtasks = job_res->ncpus;
        avail_cpus = job_res->cpus;
        job_res->cpus = xmalloc(job_res->nhosts * sizeof(uint16_t));

        /* ncpus is already set to the number of tasks if overcommit is
         * used */
        if (!job_ptr->details->overcommit &&
            (job_ptr->details->cpus_per_task > 1)) {
                if (job_ptr->details->ntasks_per_node == 0)
                        maxtasks = maxtasks / job_ptr->details->cpus_per_task;
                else
                        maxtasks = job_ptr->details->ntasks_per_node *
                                   job_res->nhosts;
        }

        /* Safeguard against a request that works out to zero tasks,
         * e.g. when the user specified fewer CPUs than cpus_per_task,
         * or didn't specify a count at all. */
        if (!maxtasks) {
                error("_compute_c_b_task_dist: request was for 0 tasks, "
                      "setting to 1");
                maxtasks = 1;
        }
        if (job_ptr->details->cpus_per_task == 0)
                job_ptr->details->cpus_per_task = 1;

        for (tid = 0, i = job_ptr->details->cpus_per_task; (tid < maxtasks);
             i += job_ptr->details->cpus_per_task) {    /* cycle counter */
                bool space_remaining = false;

                if (over_subscribe) {
                        /* 'over_subscribe' is a relief valve that guards
                         * against an infinite loop, and it *should* never
                         * come into play because maxtasks should never be
                         * greater than the total number of available cpus
                         */
                        error("cons_res: _compute_c_b_task_dist "
                              "oversubscribe");
                }

                for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
                        if ((i <= avail_cpus[n]) || over_subscribe) {
                                tid++;
                                for (l = 0;
                                     l < job_ptr->details->cpus_per_task;
                                     l++) {
                                        if (job_res->cpus[n] < avail_cpus[n])
                                                job_res->cpus[n]++;
                                }
                                if ((i + 1) <= avail_cpus[n])
                                        space_remaining = true;
                        }
                }
                if (!space_remaining)
                        over_subscribe = true;
        }
        xfree(avail_cpus);
        return SLURM_SUCCESS;
}
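
/* Worked example for _compute_c_b_task_dist(): with two nodes,
 * avail_cpus = {2,3}, cpus_per_task = 1 and five tasks to place
 * (maxtasks = 5), the cyclic loop fills job_res->cpus pass by pass:
 * {1,1}, then {2,2}, and the last task lands on node 1 only, giving
 * a final {2,3}. */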
/* distribute blocks (planes) of tasks cyclically */
static int _compute_plane_dist(struct job_record *job_ptr)
{
        bool over_subscribe = false;
        uint32_t n, i, p, tid, maxtasks, l;
        uint16_t *avail_cpus, plane_size = 1;
        job_resources_t *job_res = job_ptr->job_resrcs;

        if (!job_res || !job_res->cpus) {
                error("cons_res: _compute_plane_dist given NULL job_res");
                return SLURM_ERROR;
        }

        maxtasks = job_res->ncpus;
        avail_cpus = job_res->cpus;
        if (job_ptr->details->cpus_per_task > 1)
                maxtasks = maxtasks / job_ptr->details->cpus_per_task;
        if (job_ptr->details && job_ptr->details->mc_ptr)
                plane_size = job_ptr->details->mc_ptr->plane_size;
        if (plane_size <= 0) {
                error("cons_res: _compute_plane_dist received invalid "
                      "plane_size");
                return SLURM_ERROR;
        }

        job_res->cpus = xmalloc(job_res->nhosts * sizeof(uint16_t));
        for (tid = 0, i = 0; (tid < maxtasks); i++) {   /* cycle counter */
                bool space_remaining = false;

                if (over_subscribe) {
                        /* 'over_subscribe' is a relief valve that guards
                         * against an infinite loop, and it *should* never
                         * come into play because maxtasks should never be
                         * greater than the total number of available cpus
                         */
                        error("cons_res: _compute_plane_dist oversubscribe");
                }

                for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
                        for (p = 0; p < plane_size && (tid < maxtasks); p++) {
                                if ((job_res->cpus[n] < avail_cpus[n]) ||
                                    over_subscribe) {
                                        tid++;
                                        for (l = 0;
                                             l < job_ptr->details->cpus_per_task;
                                             l++) {
                                                if (job_res->cpus[n] <
                                                    avail_cpus[n])
                                                        job_res->cpus[n]++;
                                        }
                                }
                        }
                        if (job_res->cpus[n] < avail_cpus[n])
                                space_remaining = true;
                }
                if (!space_remaining)
                        over_subscribe = true;
        }
        xfree(avail_cpus);
        return SLURM_SUCCESS;
}
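
/* Worked example for _compute_plane_dist(): with plane_size = 2,
 * avail_cpus = {4,4}, cpus_per_task = 1 and six tasks to place, each
 * pass hands out blocks of two tasks per node: pass 1 yields
 * cpus = {2,2} (tasks 0-1 on node 0, tasks 2-3 on node 1) and pass 2
 * places the final block on node 0, giving {4,2}. */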
/* sync up core bitmap with new CPU count using a best-fit approach
 * on the available resources on each node
 *
 * "Best-fit" means:
 * 1st priority: Use smallest number of boards with sufficient
 *               available CPUs
 * 2nd priority: Use smallest number of sockets with sufficient
 *               available CPUs
 * 3rd priority: Use board combination with the smallest number
 *               of available CPUs
 * 4th priority: Use higher-numbered boards/sockets/cores first
 *
 * The CPU array contains the distribution of CPUs, which can include
 * virtual CPUs (hyperthreads)
 */
static void _block_sync_core_bitmap(struct job_record *job_ptr,
                                    const uint16_t cr_type)
{
        uint32_t c, s, i, j, n, b, z, size, csize, core_cnt;
        uint16_t cpus, num_bits, vpus = 1;
        job_resources_t *job_res = job_ptr->job_resrcs;
        bool alloc_cores = false, alloc_sockets = false;
        uint16_t ntasks_per_core = 0xffff;
        int count, cpu_min, b_min, elig, s_min, comb_idx, sock_idx;
        int elig_idx, comb_brd_idx, sock_list_idx, comb_min, board_num;
        int *boards_cpu_cnt;
        int *sort_brds_cpu_cnt;
        int *sockets_cpu_cnt;
        int *board_combs;
        int *socket_list;
        int *elig_brd_combs;
        int *elig_cpu_cnt;
        bool *sockets_used;
        uint16_t boards_nb;
        uint16_t nboards_nb;
        uint16_t sockets_nb;
        uint16_t ncores_nb;
        uint16_t nsockets_nb;
        uint16_t sock_per_brd;
        uint16_t sock_per_comb;
        uint16_t req_cpus, best_fit_cpus = 0;
        uint32_t best_fit_location = 0;
        uint64_t ncomb_brd;
        bool sufficient, best_fit_sufficient;

        /* qsort compare function for ascending int list */
        int _cmp_int_ascend(const void *a, const void *b)
        {
                return (*(int *)a - *(int *)b);
        }
        /* qsort compare function for descending int list */
        int _cmp_int_descend(const void *a, const void *b)
        {
                return (*(int *)b - *(int *)a);
        }
        /* qsort compare function for board combination socket list */
        int _cmp_sock(const void *a, const void *b)
        {
                return (sockets_cpu_cnt[*(int *)b] -
                        sockets_cpu_cnt[*(int *)a]);
        }
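
        /* Note: the three comparators above are nested functions, a GCC
         * extension; this lets _cmp_sock() read sockets_cpu_cnt from the
         * enclosing scope, at the cost of requiring GCC or a compatible
         * compiler. */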
        if (!job_res)
                return;

        if (cr_type & CR_CORE)
                alloc_cores = true;
#ifdef ALLOCATE_FULL_SOCKET
        if (cr_type & CR_SOCKET)
                alloc_sockets = true;
#else
        if (cr_type & CR_SOCKET)
                alloc_cores = true;
#endif

        if (job_ptr->details && job_ptr->details->mc_ptr) {
                multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
                if (mc_ptr->ntasks_per_core) {
                        ntasks_per_core = mc_ptr->ntasks_per_core;
                }
                if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
                    (mc_ptr->threads_per_core < ntasks_per_core)) {
                        ntasks_per_core = mc_ptr->threads_per_core;
                }
        }
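
        /* ntasks_per_core now holds the tighter of the two per-core task
         * limits.  It caps vpus below, so that, for example, a node with
         * 2 hardware threads per core but ntasks_per_core = 1 gets
         * vpus = 1 and each counted CPU maps onto a distinct core. */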
        size = bit_size(job_res->node_bitmap);
        csize = bit_size(job_res->core_bitmap);

        sockets_nb = select_node_record[0].sockets;
        sockets_cpu_cnt = xmalloc(sockets_nb * sizeof(int));
        sockets_used = xmalloc(sockets_nb * sizeof(bool));
        boards_nb = select_node_record[0].boards;
        boards_cpu_cnt = xmalloc(boards_nb * sizeof(int));
        sort_brds_cpu_cnt = xmalloc(boards_nb * sizeof(int));

        for (c = 0, i = 0, n = 0; n < size; n++) {
                if (bit_test(job_res->node_bitmap, n) == 0)
                        continue;

                core_cnt = 0;
                ncores_nb = select_node_record[n].cores;
                nsockets_nb = select_node_record[n].sockets;
                nboards_nb = select_node_record[n].boards;
                num_bits = nsockets_nb * ncores_nb;

                if ((c + num_bits) > csize)
                        fatal("cons_res: _block_sync_core_bitmap index error");

                cpus = job_res->cpus[i];
                vpus = MIN(select_node_record[n].vpus, ntasks_per_core);

                /* compute the number of cores still required on the node */
                req_cpus = cpus / vpus;
                if (cpus % vpus)
                        req_cpus++;

                if (nboards_nb > MAX_BOARDS) {
                        debug3("cons_res: node[%u]: exceeds max boards; "
                               "doing best-fit across sockets only", n);
                        nboards_nb = 1;
                }

                if (nsockets_nb > sockets_nb) {
                        sockets_nb = nsockets_nb;
                        xrealloc(sockets_cpu_cnt, sockets_nb * sizeof(int));
                        xrealloc(sockets_used, sockets_nb * sizeof(bool));
                }

                if (nboards_nb > boards_nb) {
                        boards_nb = nboards_nb;
                        xrealloc(boards_cpu_cnt, boards_nb * sizeof(int));
                        xrealloc(sort_brds_cpu_cnt, boards_nb * sizeof(int));
                }

                /* Count available cores on each socket and board */
                sock_per_brd = nsockets_nb / nboards_nb;
                for (b = 0; b < nboards_nb; b++) {
                        boards_cpu_cnt[b] = 0;
                        sort_brds_cpu_cnt[b] = 0;
                }
                for (s = 0; s < nsockets_nb; s++) {
                        sockets_cpu_cnt[s] = 0;
                        sockets_used[s] = false;
                        b = s / sock_per_brd;
                        for (j = c + (s * ncores_nb);
                             j < c + ((s + 1) * ncores_nb); j++) {
                                if (bit_test(job_res->core_bitmap, j)) {
                                        sockets_cpu_cnt[s]++;
                                        boards_cpu_cnt[b]++;
                                        sort_brds_cpu_cnt[b]++;
                                }
                        }
                }

                /* Sort boards in descending order of available core count */
                qsort(sort_brds_cpu_cnt, nboards_nb, sizeof(int),
                      _cmp_int_descend);
                /* Determine minimum number of boards required for the
                 * allocation (b_min) */
                count = 0;
                for (b = 0; b < nboards_nb; b++) {
                        count += sort_brds_cpu_cnt[b];
                        if (count >= req_cpus)
                                break;
                }
                b_min = b + 1;
                sock_per_comb = b_min * sock_per_brd;

                /* Allocate space for list of board combinations */
                ncomb_brd = comb_counts[nboards_nb - 1][b_min - 1];
                board_combs = xmalloc(ncomb_brd * b_min * sizeof(int));
                /* Generate all combinations of b_min boards on the node */
                _gen_combs(board_combs, nboards_nb, b_min);

                /* Determine which combinations have enough available cores
                 * for the allocation (eligible board combinations)
                 */
                elig_brd_combs = xmalloc(ncomb_brd * sizeof(int));
                elig_cpu_cnt = xmalloc(ncomb_brd * sizeof(int));
                elig = 0;
                for (comb_idx = 0; comb_idx < ncomb_brd; comb_idx++) {
                        count = 0;
                        for (comb_brd_idx = 0; comb_brd_idx < b_min;
                             comb_brd_idx++) {
                                board_num = board_combs[(comb_idx * b_min)
                                                        + comb_brd_idx];
                                count += boards_cpu_cnt[board_num];
                        }
                        if (count >= req_cpus) {
                                elig_brd_combs[elig] = comb_idx;
                                elig_cpu_cnt[elig] = count;
                                elig++;
                        }
                }

                /* Allocate space for list of sockets for each eligible board
                 * combination */
                socket_list = xmalloc(elig * sock_per_comb * sizeof(int));

                /* Generate a sorted list of sockets for each eligible board
                 * combination, and find the combination with the minimum
                 * number of sockets and minimum number of CPUs required for
                 * the allocation
                 */
                s_min = sock_per_comb;
                comb_min = 0;
                cpu_min = sock_per_comb * ncores_nb;
                for (elig_idx = 0; elig_idx < elig; elig_idx++) {
                        comb_idx = elig_brd_combs[elig_idx];
                        for (comb_brd_idx = 0; comb_brd_idx < b_min;
                             comb_brd_idx++) {
                                board_num = board_combs[(comb_idx * b_min)
                                                        + comb_brd_idx];
                                sock_list_idx = (elig_idx * sock_per_comb) +
                                                (comb_brd_idx * sock_per_brd);
                                for (sock_idx = 0; sock_idx < sock_per_brd;
                                     sock_idx++) {
                                        socket_list[sock_list_idx + sock_idx]
                                                = (board_num * sock_per_brd)
                                                  + sock_idx;
                                }
                        }
                        /* Sort this socket list in descending order of
                         * available core count */
                        qsort(&socket_list[elig_idx * sock_per_comb],
                              sock_per_comb, sizeof(int), _cmp_sock);
                        /* Determine minimum number of sockets required for
                         * the allocation from this socket list */
                        count = 0;
                        for (b = 0; b < sock_per_comb; b++) {
                                sock_idx = socket_list[
                                        (int)((elig_idx * sock_per_comb) + b)];
                                count += sockets_cpu_cnt[sock_idx];
                                if (count >= req_cpus)
                                        break;
                        }
                        b++;
                        /* Use the board combination with the minimum number
                         * of required sockets and minimum number of CPUs
                         */
                        if ((b < s_min) ||
                            (b == s_min &&
                             elig_cpu_cnt[elig_idx] <= cpu_min)) {
                                s_min = b;
                                comb_min = elig_idx;
                                cpu_min = elig_cpu_cnt[elig_idx];
                        }
                }

                debug3("cons_res: best_fit: node[%u]: required cpus: %u, "
                       "min req boards: %u,", n, cpus, b_min);
                debug3("cons_res: best_fit: node[%u]: min req sockets: %u, "
                       "min avail cores: %u", n, s_min, cpu_min);
                /* Re-sort the socket list for the best-fit board combination
                 * in ascending order of socket number */
                qsort(&socket_list[comb_min * sock_per_comb], sock_per_comb,
                      sizeof(int), _cmp_int_ascend);

                xfree(board_combs);
                xfree(elig_brd_combs);
                xfree(elig_cpu_cnt);

                /* select cores from the sockets of the best-fit board
                 * combination using a best-fit approach */
                while (cpus > 0) {
                        best_fit_cpus = 0;
                        best_fit_sufficient = false;

                        /* search for the best socket, starting from the last
                         * one to leave more room in the first one for system
                         * usage */
                        for (z = sock_per_comb - 1; (int) z >= (int) 0; z--) {
                                s = socket_list[(comb_min * sock_per_comb) + z];
                                sufficient = sockets_cpu_cnt[s] >= req_cpus;
                                if ((best_fit_cpus == 0) ||
                                    (sufficient && !best_fit_sufficient) ||
                                    (sufficient && (sockets_cpu_cnt[s] <
                                                    best_fit_cpus)) ||
                                    (!sufficient && (sockets_cpu_cnt[s] >
                                                     best_fit_cpus))) {
                                        best_fit_cpus = sockets_cpu_cnt[s];
                                        best_fit_location = s;
                                        best_fit_sufficient = sufficient;
                                }
                        }

                        /* check that we have found a usable socket */
                        if (best_fit_cpus == 0)
                                break;

                        debug3("cons_res: best_fit: using node[%u]: "
                               "board[%u]: socket[%u]: %u cores available",
                               n, best_fit_location / sock_per_brd,
                               best_fit_location,
                               sockets_cpu_cnt[best_fit_location]);

                        /* select socket cores from last to first */
                        /* socket[0]:Core[0] would be the last one */
                        sockets_used[best_fit_location] = true;
                        for (j = c + ((best_fit_location + 1) * ncores_nb) - 1;
                             (int) j >= (int) (c + (best_fit_location *
                                                    ncores_nb));
                             j--) {
                                /*
                                 * if there are no more cpus to select,
                                 * release the remaining cores unless
                                 * we are allocating whole sockets
                                 */
                                if (cpus == 0 && alloc_sockets) {
                                        if (bit_test(job_res->core_bitmap, j))
                                                core_cnt++;
                                        continue;
                                } else if (cpus == 0) {
                                        bit_clear(job_res->core_bitmap, j);
                                        continue;
                                }
                                /*
                                 * remove the core from the socket count and
                                 * from the cpus count using the
                                 * hyperthreading requirement
                                 */
                                if (bit_test(job_res->core_bitmap, j)) {
                                        sockets_cpu_cnt[best_fit_location]--;
                                        core_cnt++;
                                        if (cpus < vpus)
                                                cpus = 0;
                                        else
                                                cpus -= vpus;
                                }
                        }

                        /* loop again if more cpus are required */
                        if (cpus > 0)
                                continue;

                        /* release remaining cores of the unused sockets */
                        for (s = 0; s < nsockets_nb; s++) {
                                if (sockets_used[s])
                                        continue;
                                bit_nclear(job_res->core_bitmap,
                                           c + (s * ncores_nb),
                                           c + ((s + 1) * ncores_nb) - 1);
                        }
                }
                xfree(socket_list);
                if (cpus > 0) {
                        /* cpu count should NEVER be greater than the number
                         * of set bits in the core bitmap for a given node */
                        fatal("cons_res: cpus computation error");
                }

                /* adjust the cpus count of the current node */
                if ((alloc_cores || alloc_sockets) &&
                    (select_node_record[n].vpus > 1)) {
                        job_res->cpus[i] = core_cnt *
                                           select_node_record[n].vpus;
                }
                i++;

                /* move c to the next node in core_bitmap */
                c += num_bits;
        }

        xfree(boards_cpu_cnt);
        xfree(sort_brds_cpu_cnt);
        xfree(sockets_cpu_cnt);
        xfree(sockets_used);
}
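
/* Worked example of the best-fit priorities above: on a node with two
 * boards of two sockets each and per-socket free core counts
 * {4,2,3,3}, a request for 5 cores fits on one board (b_min = 1, both
 * boards offer 6 free cores).  Either board needs both of its sockets
 * (4+2 or 3+3), so s_min = 2 for both eligible combinations; the "<="
 * in the cpu_min comparison then makes the later, equally good
 * combination (board 1) win the tie, consistent with the 4th
 * priority of using higher-numbered boards first. */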
/* Sync up the core_bitmap with the CPU array using cyclic distribution
 *
 * The CPU array contains the distribution of CPUs, which can include
 * virtual CPUs (hyperthreads)
 */
static int _cyclic_sync_core_bitmap(struct job_record *job_ptr,
                                    const uint16_t cr_type)
{
        uint32_t c, i, j, s, n, *sock_start, *sock_end, size, csize, core_cnt;
        uint16_t cps = 0, cpus, vpus, sockets, sock_size;
        job_resources_t *job_res = job_ptr->job_resrcs;
        bitstr_t *core_map;
        bool *sock_used, alloc_cores = false, alloc_sockets = false;
        uint16_t ntasks_per_core = 0xffff;
        int error_code = SLURM_SUCCESS;

        if ((job_res == NULL) || (job_res->core_bitmap == NULL))
                return error_code;

        if (cr_type & CR_CORE)
                alloc_cores = true;
#ifdef ALLOCATE_FULL_SOCKET
        if (cr_type & CR_SOCKET)
                alloc_sockets = true;
#else
        if (cr_type & CR_SOCKET)
                alloc_cores = true;
#endif
        core_map = job_res->core_bitmap;
        if (job_ptr->details && job_ptr->details->mc_ptr) {
                multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
                if (mc_ptr->ntasks_per_core) {
                        ntasks_per_core = mc_ptr->ntasks_per_core;
                }
                if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
                    (mc_ptr->threads_per_core < ntasks_per_core)) {
                        ntasks_per_core = mc_ptr->threads_per_core;
                }
        }

        sock_size = select_node_record[0].sockets;
        sock_start = xmalloc(sock_size * sizeof(uint32_t));
        sock_end = xmalloc(sock_size * sizeof(uint32_t));
        sock_used = xmalloc(sock_size * sizeof(bool));

        size = bit_size(job_res->node_bitmap);
        csize = bit_size(core_map);
        for (c = 0, i = 0, n = 0; n < size; n++) {
                if (bit_test(job_res->node_bitmap, n) == 0)
                        continue;
                sockets = select_node_record[n].sockets;
                cps = select_node_record[n].cores;
                vpus = MIN(select_node_record[n].vpus, ntasks_per_core);

                if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
                        info("DEBUG: job %u node %s vpus %u cpus %u",
                             job_ptr->job_id,
                             select_node_record[n].node_ptr->name,
                             vpus, job_res->cpus[i]);
                }

                if ((c + (sockets * cps)) > csize)
                        fatal("cons_res: _cyclic_sync_core_bitmap index error");

                if (sockets > sock_size) {
                        sock_size = sockets;
                        xrealloc(sock_start, sock_size * sizeof(uint32_t));
                        xrealloc(sock_end, sock_size * sizeof(uint32_t));
                        xrealloc(sock_used, sock_size * sizeof(bool));
                }

                for (s = 0; s < sockets; s++) {
                        sock_start[s] = c + (s * cps);
                        sock_end[s] = sock_start[s] + cps;
                }
                core_cnt = 0;
                cpus = job_res->cpus[i];
                while (cpus > 0) {
                        uint16_t prev_cpus = cpus;
                        for (s = 0; s < sockets && cpus > 0; s++) {
                                while (sock_start[s] < sock_end[s]) {
                                        if (bit_test(core_map,
                                                     sock_start[s])) {
                                                sock_used[s] = true;
                                                core_cnt++;
                                                break;
                                        } else
                                                sock_start[s]++;
                                }
                                if (sock_start[s] == sock_end[s])
                                        /* this socket is unusable */
                                        continue;
                                if (cpus < vpus)
                                        cpus = 0;
                                else
                                        cpus -= vpus;
                                sock_start[s]++;
                        }
                        if (prev_cpus == cpus) {
                                /* we're stuck! */
                                job_ptr->priority = 0;
                                job_ptr->state_reason = WAIT_HELD;
                                error("cons_res: sync loop not progressing, "
                                      "holding job %u", job_ptr->job_id);
                                error_code = SLURM_ERROR;
                                goto fini;
                        }
                }
                /* clear the rest of the cores in each socket
                 * FIXME: do we need min_core/min_socket checks here? */
                for (s = 0; s < sockets; s++) {
                        if (sock_start[s] == sock_end[s])
                                continue;
                        if (!alloc_sockets || !sock_used[s]) {
                                bit_nclear(core_map, sock_start[s],
                                           sock_end[s] - 1);
                        }
                        if ((select_node_record[n].vpus > 1) &&
                            (alloc_sockets || alloc_cores) && sock_used[s]) {
                                for (j = sock_start[s]; j < sock_end[s]; j++) {
                                        if (bit_test(core_map, j))
                                                core_cnt++;
                                }
                        }
                }
                if ((alloc_cores || alloc_sockets) &&
                    (select_node_record[n].vpus > 1)) {
                        job_res->cpus[i] = core_cnt *
                                           select_node_record[n].vpus;
                }
                i++;
                /* advance 'c' to the beginning of the next node */
                c += sockets * cps;
        }
fini:   xfree(sock_start);
        xfree(sock_end);
        xfree(sock_used);
        return error_code;
}
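
/* Worked example for _cyclic_sync_core_bitmap(): on a node with
 * 2 sockets of 4 free cores, job_res->cpus[i] = 6 and vpus = 2 (so 3
 * cores are needed), cores are claimed round-robin across sockets:
 * core 0 (socket 0), core 4 (socket 1), then core 1 (socket 0), at
 * which point cpus reaches 0; the unclaimed cores are then cleared
 * from core_map unless whole sockets are being allocated. */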
/* To effectively deal with heterogeneous nodes, we fake a cyclic
 * distribution to figure out how many cpus are needed on each node.
 *
 * This routine is a slightly modified "version" of the routine
 * _task_layout_block in src/common/dist_tasks.c. We do not need to
 * assign tasks to job->hostid[] and job->tids[][] at this point, so
 * the cpu allocation is the same for cyclic and block.
 *
 * For consumable resources support we need to determine which
 * "node/CPU/core/thread" tuples will be allocated for a given job.
 * In the past we assumed that we only allocated one task per CPU (at
 * that point the lowest level of logical processor) and didn't allow
 * the use of overcommit. We have changed this philosophy and are now
 * allowing people to overcommit their resources and expect the system
 * administrator to enable the task/affinity plug-in, which will then
 * bind all of a job's tasks to its allocated resources, thereby
 * avoiding interference between co-allocated running jobs.
 *
 * In the consumable resources environment we need to determine the
 * layout schema within slurmctld.
 *
 * We have a core_bitmap of all available cores. All we're doing here
 * is removing cores that are not needed based on the task count, and
 * the choice of cores to remove is based on the distribution:
 * - "cyclic" removes cores "evenly", starting from the last socket,
 * - "block" removes cores from the "last" socket(s)
 * - "plane" removes cores "in chunks"
 */
extern int cr_dist(struct job_record *job_ptr, const uint16_t cr_type)
{
        int error_code, cr_cpu = 1;

        if (job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) {
                /* the job has been allocated an EXCLUSIVE set of nodes,
                 * so it gets all of the bits in the core_bitmap and
                 * all of the available CPUs in the cpus array */
                int size = bit_size(job_ptr->job_resrcs->core_bitmap);
                bit_nset(job_ptr->job_resrcs->core_bitmap, 0, size - 1);
                return SLURM_SUCCESS;
        }

        if (job_ptr->details->task_dist == SLURM_DIST_PLANE) {
                /* perform a plane distribution on the 'cpus' array */
                error_code = _compute_plane_dist(job_ptr);
                if (error_code != SLURM_SUCCESS) {
                        error("cons_res: cr_dist: Error in "
                              "_compute_plane_dist");
                        return error_code;
                }
        } else {
                /* perform a cyclic distribution on the 'cpus' array */
                error_code = _compute_c_b_task_dist(job_ptr);
                if (error_code != SLURM_SUCCESS) {
                        error("cons_res: cr_dist: Error in "
                              "_compute_c_b_task_dist");
                        return error_code;
                }
        }

        /* now sync up the core_bitmap with the allocated 'cpus' array
         * based on the given distribution AND resource setting */
        if ((cr_type & CR_CORE) || (cr_type & CR_SOCKET))
                cr_cpu = 0;

        if (cr_cpu) {
                _block_sync_core_bitmap(job_ptr, cr_type);
                return SLURM_SUCCESS;
        }

        /*
         * If SelectTypeParameters requests a block distribution for
         * cores by default, use that distribution when no particular
         * core distribution is specified.
         * Note: the cyclic core distribution, which is the default, is
         * handled by the next code block.
         */
        if (slurmctld_conf.select_type_param & CR_CORE_DEFAULT_DIST_BLOCK) {
                switch (job_ptr->details->task_dist) {
                case SLURM_DIST_ARBITRARY:
                case SLURM_DIST_BLOCK:
                case SLURM_DIST_CYCLIC:
                case SLURM_DIST_UNKNOWN:
                        _block_sync_core_bitmap(job_ptr, cr_type);
                        return SLURM_SUCCESS;
                }
        }

        /* Determine the number of logical processors per node needed
         * for this job. Make sure below matches the layouts in
         * lllp_distribution in plugins/task/affinity/dist_task.c (FIXME) */
        switch (job_ptr->details->task_dist) {
        case SLURM_DIST_BLOCK_BLOCK:
        case SLURM_DIST_CYCLIC_BLOCK:
        case SLURM_DIST_PLANE:
                _block_sync_core_bitmap(job_ptr, cr_type);
                break;
        case SLURM_DIST_ARBITRARY:
        case SLURM_DIST_BLOCK:
        case SLURM_DIST_CYCLIC:
        case SLURM_DIST_BLOCK_CYCLIC:
        case SLURM_DIST_CYCLIC_CYCLIC:
        case SLURM_DIST_UNKNOWN:
                error_code = _cyclic_sync_core_bitmap(job_ptr, cr_type);
                break;
        default:
                error("select/cons_res: invalid task_dist entry");
                return SLURM_ERROR;
        }
        return error_code;
}