
/src/plugins/task/affinity/dist_tasks.c

https://github.com/cfenoy/slurm
  1. /*****************************************************************************\
  2. * Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P.
  3. * Copyright (C) 2008-2009 Lawrence Livermore National Security.
  4. * Written by Susanne M. Balle, <susanne.balle@hp.com>
  5. * CODE-OCEC-09-009. All rights reserved.
  6. *
  7. * This file is part of SLURM, a resource management program.
  8. * For details, see <http://www.schedmd.com/slurmdocs/>.
  9. * Please also read the included file: DISCLAIMER.
  10. *
  11. * SLURM is free software; you can redistribute it and/or modify it under
  12. * the terms of the GNU General Public License as published by the Free
  13. * Software Foundation; either version 2 of the License, or (at your option)
  14. * any later version.
  15. *
  16. * In addition, as a special exception, the copyright holders give permission
  17. * to link the code of portions of this program with the OpenSSL library under
  18. * certain conditions as described in each individual source file, and
  19. * distribute linked combinations including the two. You must obey the GNU
  20. * General Public License in all respects for all of the code used other than
  21. * OpenSSL. If you modify file(s) with this exception, you may extend this
  22. * exception to your version of the file(s), but you are not obligated to do
  23. * so. If you do not wish to do so, delete this exception statement from your
  24. * version. If you delete this exception statement from all source files in
  25. * the program, then also delete it here.
  26. *
  27. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  28. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  29. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  30. * details.
  31. *
  32. * You should have received a copy of the GNU General Public License along
  33. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  34. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  35. \*****************************************************************************/
  36. #include "affinity.h"
  37. #include "dist_tasks.h"
  38. #include "src/common/bitstring.h"
  39. #include "src/common/log.h"
  40. #include "src/common/slurm_cred.h"
  41. #include "src/common/slurm_protocol_api.h"
  42. #include "src/common/slurm_resource_info.h"
  43. #include "src/common/xmalloc.h"
  44. #include "src/slurmd/slurmd/slurmd.h"
  45. #ifdef HAVE_NUMA
  46. #include <numa.h>
  47. #endif
  48. static char *_alloc_mask(launch_tasks_request_msg_t *req,
  49. int *whole_node_cnt, int *whole_socket_cnt,
  50. int *whole_core_cnt, int *whole_thread_cnt,
  51. int *part_socket_cnt, int *part_core_cnt);
  52. static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
  53. uint16_t *hw_sockets, uint16_t *hw_cores,
  54. uint16_t *hw_threads);
  55. static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
  56. uint16_t *sockets, uint16_t *cores);
  57. static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
  58. uint32_t node_id, bitstr_t ***masks_p);
  59. static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
  60. uint32_t node_id, bitstr_t ***masks_p);
  61. static int _task_layout_lllp_multi(launch_tasks_request_msg_t *req,
  62. uint32_t node_id, bitstr_t ***masks_p);
  63. static void _lllp_map_abstract_masks(const uint32_t maxtasks,
  64. bitstr_t **masks);
  65. static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
  66. const uint32_t maxtasks,
  67. bitstr_t **masks);
  68. /* BLOCK_MAP maps an abstract block LLLP index to a physical machine LLLP index
  69. * BLOCK_MAP_INV maps a physical machine LLLP index back to an abstract block LLLP index
  70. */
  71. #define BLOCK_MAP(index) _block_map(index, conf->block_map)
  72. #define BLOCK_MAP_INV(index) _block_map(index, conf->block_map_inv)
  73. /* _block_map
  74. *
  75. * safely returns a mapped index using a provided block map
  76. *
  77. * IN - index to map
  78. * IN - map to use
  79. */
  80. static uint16_t _block_map(uint16_t index, uint16_t *map)
  81. {
  82. if (map == NULL) {
  83. return index;
  84. }
  85. /* make sure bit falls in map */
  86. if (index >= conf->block_map_size) {
  87. debug3("wrapping index %u into block_map_size of %u",
  88. index, conf->block_map_size);
  89. index = index % conf->block_map_size;
  90. }
  91. index = map[index];
  92. return(index);
  93. }
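/* Worked example (hypothetical block map): on a node where the kernel
 * interleaves hardware threads, conf->block_map might contain {0,4,1,5,...},
 * so abstract (block-ordered) CPU 1 -- the second thread of core 0 -- lives
 * at machine logical CPU 4 and BLOCK_MAP(1) returns 4. An index at or beyond
 * conf->block_map_size simply wraps: BLOCK_MAP(block_map_size + 1) resolves
 * to the same entry as BLOCK_MAP(1). */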
  94. static void _task_layout_display_masks(launch_tasks_request_msg_t *req,
  95. const uint32_t *gtid,
  96. const uint32_t maxtasks,
  97. bitstr_t **masks)
  98. {
  99. int i;
  100. char *str = NULL;
  101. for(i = 0; i < maxtasks; i++) {
  102. str = (char *)bit_fmt_hexmask(masks[i]);
  103. debug3("_task_layout_display_masks jobid [%u:%d] %s",
  104. req->job_id, gtid[i], str);
  105. xfree(str);
  106. }
  107. }
  108. static void _lllp_free_masks(const uint32_t maxtasks, bitstr_t **masks)
  109. {
  110. int i;
  111. bitstr_t *bitmask;
  112. for (i = 0; i < maxtasks; i++) {
  113. bitmask = masks[i];
  114. FREE_NULL_BITMAP(bitmask);
  115. }
  116. xfree(masks);
  117. }
  118. #ifdef HAVE_NUMA
  119. /* _match_mask_to_ldom
  120. *
  121. * expand each mask to encompass the whole locality domain
  122. * within which it currently exists
  123. * NOTE: this assumes that the masks are already in logical
  124. * (and not abstract) CPU order.
  125. */
  126. static void _match_masks_to_ldom(const uint32_t maxtasks, bitstr_t **masks)
  127. {
  128. uint32_t i, b, size;
  129. if (!masks || !masks[0])
  130. return;
  131. size = bit_size(masks[0]);
  132. for(i = 0; i < maxtasks; i++) {
  133. for (b = 0; b < size; b++) {
  134. if (bit_test(masks[i], b)) {
  135. /* get the NUMA node for this CPU, and then
  136. * set all CPUs in the mask that are in
  137. * the same NUMA node */
  138. int c;
  139. uint16_t nnid = slurm_get_numa_node(b);
  140. for (c = 0; c < size; c++) {
  141. if (slurm_get_numa_node(c) == nnid)
  142. bit_set(masks[i], c);
  143. }
  144. }
  145. }
  146. }
  147. }
  148. #endif
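/* Worked example (hypothetical NUMA layout): with two locality domains of
 * 8 CPUs each, a task mask holding only CPU 3 is expanded by
 * _match_masks_to_ldom() to CPUs 0-7, i.e. every CPU for which
 * slurm_get_numa_node() reports the same domain as CPU 3. */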
  149. /*
  150. * batch_bind - Set the batch request message so as to bind the shell to the
  151. * proper resources
  152. */
  153. void batch_bind(batch_job_launch_msg_t *req)
  154. {
  155. bitstr_t *req_map, *hw_map;
  156. slurm_cred_arg_t arg;
  157. uint16_t sockets=0, cores=0, num_cpus;
  158. int start, task_cnt=0;
  159. if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
  160. error("task/affinity: job lacks a credential");
  161. return;
  162. }
  163. start = _get_local_node_info(&arg, 0, &sockets, &cores);
  164. if (start != 0) {
  165. error("task/affinity: missing node 0 in job credential");
  166. slurm_cred_free_args(&arg);
  167. return;
  168. }
  169. if ((sockets * cores) == 0) {
  170. error("task/affinity: socket and core count both zero");
  171. slurm_cred_free_args(&arg);
  172. return;
  173. }
  174. num_cpus = MIN((sockets * cores),
  175. (conf->sockets * conf->cores));
  176. req_map = (bitstr_t *) bit_alloc(num_cpus);
  177. hw_map = (bitstr_t *) bit_alloc(conf->block_map_size);
  178. if (!req_map || !hw_map) {
  179. error("task/affinity: malloc error");
  180. FREE_NULL_BITMAP(req_map);
  181. FREE_NULL_BITMAP(hw_map);
  182. slurm_cred_free_args(&arg);
  183. return;
  184. }
  185. #ifdef HAVE_FRONT_END
  186. {
  187. /* Since the front-end nodes are a shared resource, we limit each job
  188. * to one CPU based upon a monotonically increasing sequence number */
  189. static int last_id = 0;
  190. bit_set(hw_map, ((last_id++) % conf->block_map_size));
  191. task_cnt = 1;
  192. }
  193. #else
  194. {
  195. char *str;
  196. int t, p;
  197. /* Transfer core_bitmap data to local req_map.
  198. * The MOD function handles the case where fewer processors
  199. * physically exist than are configured (slurmd is out of
  200. * sync with the slurmctld daemon). */
  201. for (p = 0; p < (sockets * cores); p++) {
  202. if (bit_test(arg.job_core_bitmap, p))
  203. bit_set(req_map, (p % num_cpus));
  204. }
  205. str = (char *)bit_fmt_hexmask(req_map);
  206. debug3("task/affinity: job %u CPU mask from slurmctld: %s",
  207. req->job_id, str);
  208. xfree(str);
  209. for (p = 0; p < num_cpus; p++) {
  210. if (bit_test(req_map, p) == 0)
  211. continue;
  212. /* core_bitmap does not include threads, so we
  213. * add them here but limit them to what the job
  214. * requested */
  215. for (t = 0; t < conf->threads; t++) {
  216. uint16_t pos = p * conf->threads + t;
  217. if (pos >= conf->block_map_size) {
  218. info("more resources configured than exist");
  219. p = num_cpus;
  220. break;
  221. }
  222. bit_set(hw_map, pos);
  223. task_cnt++;
  224. }
  225. }
  226. }
  227. #endif
  228. if (task_cnt) {
  229. req->cpu_bind_type = CPU_BIND_MASK;
  230. if (conf->task_plugin_param & CPU_BIND_VERBOSE)
  231. req->cpu_bind_type |= CPU_BIND_VERBOSE;
  232. req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
  233. info("task/affinity: job %u CPU input mask for node: %s",
  234. req->job_id, req->cpu_bind);
  235. /* translate abstract masks to actual hardware layout */
  236. _lllp_map_abstract_masks(1, &hw_map);
  237. #ifdef HAVE_NUMA
  238. if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
  239. _match_masks_to_ldom(1, &hw_map);
  240. }
  241. #endif
  242. xfree(req->cpu_bind);
  243. req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
  244. info("task/affinity: job %u CPU final HW mask for node: %s",
  245. req->job_id, req->cpu_bind);
  246. } else {
  247. error("task/affinity: job %u allocated no CPUs",
  248. req->job_id);
  249. }
  250. FREE_NULL_BITMAP(hw_map);
  251. FREE_NULL_BITMAP(req_map);
  252. slurm_cred_free_args(&arg);
  253. }
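/* Worked example (hypothetical allocation): if node 0 of the batch job was
 * allocated cores 0-1 and the node has 2 threads per core, the loop above
 * sets hw_map bits 0-3, so req->cpu_bind_type becomes CPU_BIND_MASK and
 * req->cpu_bind a hex mask covering those four CPUs -- after the abstract
 * bits have been remapped through conf->block_map to the kernel's own CPU
 * numbering. */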
  254. /*
  255. * lllp_distribution
  256. *
  257. * Note: lllp stands for Lowest Level of Logical Processors.
  258. *
  259. * When automatic binding is enabled:
  260. * - no binding flags are set (>= CPU_BIND_NONE), and
  261. * - an auto binding level is selected (CPU_BIND_TO_{SOCKETS,CORES,THREADS})
  262. * Otherwise limit the job step to the allocated CPUs.
  263. *
  264. * generate the appropriate cpu_bind type and string which results in
  265. * the specified lllp distribution.
  266. *
  267. * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
  268. * IN- global task id array
  269. */
  270. void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
  271. {
  272. int rc = SLURM_SUCCESS;
  273. bitstr_t **masks = NULL;
  274. char buf_type[100];
  275. int maxtasks = req->tasks_to_launch[(int)node_id];
  276. int whole_nodes, whole_sockets, whole_cores, whole_threads;
  277. int part_sockets, part_cores;
  278. const uint32_t *gtid = req->global_task_ids[(int)node_id];
  279. static uint16_t bind_entity = CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES |
  280. CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS;
  281. static uint16_t bind_mode = CPU_BIND_NONE | CPU_BIND_MASK |
  282. CPU_BIND_RANK | CPU_BIND_MAP |
  283. CPU_BIND_LDMASK | CPU_BIND_LDRANK |
  284. CPU_BIND_LDMAP;
  285. if (req->cpu_bind_type & bind_mode) {
  286. /* Explicit step binding specified by user */
  287. char *avail_mask = _alloc_mask(req,
  288. &whole_nodes, &whole_sockets,
  289. &whole_cores, &whole_threads,
  290. &part_sockets, &part_cores);
  291. if ((whole_nodes == 0) && avail_mask) {
  292. /* Step does NOT have access to whole node,
  293. * bind to full mask of available processors */
  294. xfree(req->cpu_bind);
  295. req->cpu_bind = avail_mask;
  296. req->cpu_bind_type &= (~bind_mode);
  297. req->cpu_bind_type |= CPU_BIND_MASK;
  298. } else {
  299. /* Step does have access to whole node,
  300. * bind to whatever step wants */
  301. xfree(avail_mask);
  302. }
  303. slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
  304. info("lllp_distribution jobid [%u] manual binding: %s",
  305. req->job_id, buf_type);
  306. return;
  307. }
  308. if (!(req->cpu_bind_type & bind_entity)) {
  309. /* No bind unit (sockets, cores) specified by user,
  310. * pick something reasonable */
  311. int max_tasks = req->tasks_to_launch[(int)node_id];
  312. char *avail_mask = _alloc_mask(req,
  313. &whole_nodes, &whole_sockets,
  314. &whole_cores, &whole_threads,
  315. &part_sockets, &part_cores);
  316. debug("binding tasks:%d to "
  317. "nodes:%d sockets:%d:%d cores:%d:%d threads:%d",
  318. max_tasks, whole_nodes, whole_sockets ,part_sockets,
  319. whole_cores, part_cores, whole_threads);
  320. if ((max_tasks == whole_sockets) && (part_sockets == 0)) {
  321. req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
  322. goto make_auto;
  323. }
  324. if ((max_tasks == whole_cores) && (part_cores == 0)) {
  325. req->cpu_bind_type |= CPU_BIND_TO_CORES;
  326. goto make_auto;
  327. }
  328. if (max_tasks == whole_threads) {
  329. req->cpu_bind_type |= CPU_BIND_TO_THREADS;
  330. goto make_auto;
  331. }
  332. if (avail_mask) {
  333. xfree(req->cpu_bind);
  334. req->cpu_bind = avail_mask;
  335. req->cpu_bind_type |= CPU_BIND_MASK;
  336. }
  337. slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
  338. info("lllp_distribution jobid [%u] auto binding off: %s",
  339. req->job_id, buf_type);
  340. return;
  341. make_auto: xfree(avail_mask);
  342. slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
  343. info("lllp_distribution jobid [%u] implicit auto binding: "
  344. "%s, dist %d", req->job_id, buf_type, req->task_dist);
  345. } else {
  346. /* Explicit bind unit (sockets, cores) specified by user */
  347. slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
  348. info("lllp_distribution jobid [%u] binding: %s, dist %d",
  349. req->job_id, buf_type, req->task_dist);
  350. }
  351. switch (req->task_dist) {
  352. case SLURM_DIST_BLOCK_BLOCK:
  353. case SLURM_DIST_CYCLIC_BLOCK:
  354. case SLURM_DIST_PLANE:
  355. /* tasks are distributed in blocks within a plane */
  356. rc = _task_layout_lllp_block(req, node_id, &masks);
  357. break;
  358. case SLURM_DIST_CYCLIC:
  359. case SLURM_DIST_BLOCK:
  360. case SLURM_DIST_CYCLIC_CYCLIC:
  361. case SLURM_DIST_BLOCK_CYCLIC:
  362. rc = _task_layout_lllp_cyclic(req, node_id, &masks);
  363. break;
  364. default:
  365. if (req->cpus_per_task > 1)
  366. rc = _task_layout_lllp_multi(req, node_id, &masks);
  367. else
  368. rc = _task_layout_lllp_cyclic(req, node_id, &masks);
  369. req->task_dist = SLURM_DIST_BLOCK_CYCLIC;
  370. break;
  371. }
  372. /* FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS &
  373. * max_cores - does select/cons_res plugin allocate whole
  374. * socket??? Maybe not. Check srun man page.
  375. */
  376. if (rc == SLURM_SUCCESS) {
  377. _task_layout_display_masks(req, gtid, maxtasks, masks);
  378. /* translate abstract masks to actual hardware layout */
  379. _lllp_map_abstract_masks(maxtasks, masks);
  380. _task_layout_display_masks(req, gtid, maxtasks, masks);
  381. #ifdef HAVE_NUMA
  382. if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
  383. _match_masks_to_ldom(maxtasks, masks);
  384. _task_layout_display_masks(req, gtid, maxtasks, masks);
  385. }
  386. #endif
  387. /* convert masks into cpu_bind mask string */
  388. _lllp_generate_cpu_bind(req, maxtasks, masks);
  389. } else {
  390. char *avail_mask = _alloc_mask(req,
  391. &whole_nodes, &whole_sockets,
  392. &whole_cores, &whole_threads,
  393. &part_sockets, &part_cores);
  394. if (avail_mask) {
  395. xfree(req->cpu_bind);
  396. req->cpu_bind = avail_mask;
  397. req->cpu_bind_type &= (~bind_mode);
  398. req->cpu_bind_type |= CPU_BIND_MASK;
  399. }
  400. slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
  401. error("lllp_distribution jobid [%u] overriding binding: %s",
  402. req->job_id, buf_type);
  403. error("Verify socket/core/thread counts in configuration");
  404. }
  405. if (masks)
  406. _lllp_free_masks(maxtasks, masks);
  407. }
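/* Worked example (hypothetical step): 4 tasks launched on a node that
 * contributes 4 whole cores (2 sockets x 2 cores, no partial cores) satisfy
 * the (max_tasks == whole_cores) test above, so CPU_BIND_TO_CORES is chosen
 * automatically and the block/cyclic lllp layout then emits one core-sized
 * mask per task. */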
  408. /*
  409. * _get_local_node_info - get job allocation details for this node
  410. * IN: arg - credential arguments describing the job allocation
  411. * IN: job_node_id - index of the local node in the job allocation
  412. * OUT: sockets - socket count for this node
  413. * OUT: cores - cores_per_socket count for this node
  414. * OUT: returns the core_bitmap index of the first core for this node
  415. */
  416. static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
  417. uint16_t *sockets, uint16_t *cores)
  418. {
  419. int bit_start = 0, bit_finish = 0;
  420. int i, index = -1, cur_node_id = -1;
  421. do {
  422. index++;
  423. for (i = 0; i < arg->sock_core_rep_count[index] &&
  424. cur_node_id < job_node_id; i++) {
  425. bit_start = bit_finish;
  426. bit_finish += arg->sockets_per_node[index] *
  427. arg->cores_per_socket[index];
  428. cur_node_id++;
  429. }
  430. } while (cur_node_id < job_node_id);
  431. *sockets = arg->sockets_per_node[index];
  432. *cores = arg->cores_per_socket[index];
  433. return bit_start;
  434. }
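/* Worked example (hypothetical credential): for 3 nodes where the first two
 * have 2 sockets x 4 cores and the third has 1 socket x 4 cores,
 * sockets_per_node = {2,1}, cores_per_socket = {4,4} and
 * sock_core_rep_count = {2,1}. For job_node_id == 2 the loop returns
 * bit_start == 16 (8 cores for each of the first two nodes) with
 * *sockets == 1 and *cores == 4. */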
  435. /* Determine which CPUs a job step can use.
  436. * OUT whole_<entity>_count - returns count of whole <entities> in this
  437. * allocation for this node
  438. * OUT part_<entity>_count - returns count of partial <entities> in this
  439. * allocation for this node
  440. * RET - a string representation of the available mask or NULL on error
  441. * NOTE: Caller must xfree() the return value. */
  442. static char *_alloc_mask(launch_tasks_request_msg_t *req,
  443. int *whole_node_cnt, int *whole_socket_cnt,
  444. int *whole_core_cnt, int *whole_thread_cnt,
  445. int *part_socket_cnt, int *part_core_cnt)
  446. {
  447. uint16_t sockets, cores, threads;
  448. int c, s, t, i;
  449. int c_miss, s_miss, t_miss, c_hit, t_hit;
  450. bitstr_t *alloc_bitmap;
  451. char *str_mask;
  452. bitstr_t *alloc_mask;
  453. *whole_node_cnt = 0;
  454. *whole_socket_cnt = 0;
  455. *whole_core_cnt = 0;
  456. *whole_thread_cnt = 0;
  457. *part_socket_cnt = 0;
  458. *part_core_cnt = 0;
  459. alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads);
  460. if (!alloc_bitmap)
  461. return NULL;
  462. alloc_mask = bit_alloc(bit_size(alloc_bitmap));
  463. if (!alloc_mask) {
  464. error("malloc error");
  465. FREE_NULL_BITMAP(alloc_bitmap);
  466. return NULL;
  467. }
  468. i = 0;
  469. for (s=0, s_miss=false; s<sockets; s++) {
  470. for (c=0, c_hit=c_miss=false; c<cores; c++) {
  471. for (t=0, t_hit=t_miss=false; t<threads; t++) {
  472. /* If we are pretending we have a
  473. larger system than we really have
  474. this is needed to make sure we
  475. don't bust the bank.
  476. */
  477. if (i >= bit_size(alloc_bitmap))
  478. i = 0;
  479. if (bit_test(alloc_bitmap, i)) {
  480. bit_set(alloc_mask, i);
  481. (*whole_thread_cnt)++;
  482. t_hit = true;
  483. c_hit = true;
  484. } else
  485. t_miss = true;
  486. i++;
  487. }
  488. if (!t_miss)
  489. (*whole_core_cnt)++;
  490. else {
  491. if (t_hit)
  492. (*part_core_cnt)++;
  493. c_miss = true;
  494. }
  495. }
  496. if (!c_miss)
  497. (*whole_socket_cnt)++;
  498. else {
  499. if (c_hit)
  500. (*part_socket_cnt)++;
  501. s_miss = true;
  502. }
  503. }
  504. if (!s_miss)
  505. (*whole_node_cnt)++;
  506. FREE_NULL_BITMAP(alloc_bitmap);
  507. /* translate abstract masks to actual hardware layout */
  508. _lllp_map_abstract_masks(1, &alloc_mask);
  509. #ifdef HAVE_NUMA
  510. if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
  511. _match_masks_to_ldom(1, &alloc_mask);
  512. }
  513. #endif
  514. str_mask = bit_fmt_hexmask(alloc_mask);
  515. FREE_NULL_BITMAP(alloc_mask);
  516. return str_mask;
  517. }
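/* Worked example (hypothetical 2-socket, 2-core, 2-thread node): if the step
 * was allocated all of socket 0 plus one core of socket 1, the counters come
 * back as whole_node_cnt=0, whole_socket_cnt=1, part_socket_cnt=1,
 * whole_core_cnt=3, part_core_cnt=0, whole_thread_cnt=6, and the returned
 * string is the hex mask of those six threads. */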
  518. /*
  519. * Given a job step request, return an equivalent local bitmap for this node
  520. * IN req - The job step launch request
  521. * OUT hw_sockets - number of actual sockets on this node
  522. * OUT hw_cores - number of actual cores per socket on this node
  523. * OUT hw_threads - number of actual threads per core on this node
  524. * RET: bitmap of processors available to this job step on this node
  525. * OR NULL on error
  526. */
  527. static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
  528. uint16_t *hw_sockets, uint16_t *hw_cores,
  529. uint16_t *hw_threads)
  530. {
  531. bitstr_t *req_map, *hw_map;
  532. slurm_cred_arg_t arg;
  533. uint16_t p, t, new_p, num_cpus, sockets, cores;
  534. int job_node_id;
  535. int start;
  536. char *str;
  537. *hw_sockets = conf->sockets;
  538. *hw_cores = conf->cores;
  539. *hw_threads = conf->threads;
  540. if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
  541. error("task/affinity: job lacks a credential");
  542. return NULL;
  543. }
  544. /* we need this node's ID in relation to the whole
  545. * job allocation, not just this job step */
  546. job_node_id = nodelist_find(arg.job_hostlist, conf->node_name);
  547. start = _get_local_node_info(&arg, job_node_id, &sockets, &cores);
  548. if (start < 0) {
  549. error("task/affinity: missing node %d in job credential",
  550. job_node_id);
  551. slurm_cred_free_args(&arg);
  552. return NULL;
  553. }
  554. debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u",
  555. sockets, cores, *hw_sockets, *hw_cores, *hw_threads);
  556. num_cpus = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores)));
  557. req_map = (bitstr_t *) bit_alloc(num_cpus);
  558. hw_map = (bitstr_t *) bit_alloc(conf->block_map_size);
  559. if (!req_map || !hw_map) {
  560. error("task/affinity: malloc error");
  561. FREE_NULL_BITMAP(req_map);
  562. FREE_NULL_BITMAP(hw_map);
  563. slurm_cred_free_args(&arg);
  564. return NULL;
  565. }
  566. /* Transfer core_bitmap data to local req_map.
  567. * The MOD function handles the case where fewer processors
  568. * physically exist than are configured (slurmd is out of
  569. * sync with the slurmctld daemon). */
  570. for (p = 0; p < (sockets * cores); p++) {
  571. if (bit_test(arg.step_core_bitmap, start+p))
  572. bit_set(req_map, (p % num_cpus));
  573. }
  574. str = (char *)bit_fmt_hexmask(req_map);
  575. debug3("task/affinity: job %u.%u CPU mask from slurmctld: %s",
  576. req->job_id, req->job_step_id, str);
  577. xfree(str);
  578. for (p = 0; p < num_cpus; p++) {
  579. if (bit_test(req_map, p) == 0)
  580. continue;
  581. /* If we are pretending we have a larger system than
  582. we really have this is needed to make sure we
  583. don't bust the bank.
  584. */
  585. new_p = p % conf->block_map_size;
  586. /* core_bitmap does not include threads, so we
  587. * add them here but limit them to what the job
  588. * requested */
  589. for (t = 0; t < (*hw_threads); t++) {
  590. uint16_t bit = new_p * (*hw_threads) + t;
  591. bit %= conf->block_map_size;
  592. bit_set(hw_map, bit);
  593. }
  594. }
  595. str = (char *)bit_fmt_hexmask(hw_map);
  596. debug3("task/affinity: job %u.%u CPU final mask for local node: %s",
  597. req->job_id, req->job_step_id, str);
  598. xfree(str);
  599. FREE_NULL_BITMAP(req_map);
  600. slurm_cred_free_args(&arg);
  601. return hw_map;
  602. }
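/* Worked example (hypothetical allocation): if the credential grants this
 * node its cores 2 and 3 and the node runs 2 threads per core, req_map gets
 * bits 2-3 and the returned hw_map gets bits 4-7 -- each allocated core
 * expanded to both of its threads, still in abstract block order. */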
  603. /* helper function for _expand_masks() */
  604. static void _blot_mask(bitstr_t *mask, uint16_t blot)
  605. {
  606. uint16_t i, size = 0;
  607. int prev = -1;
  608. if (!mask)
  609. return;
  610. size = bit_size(mask);
  611. for (i = 0; i < size; i++) {
  612. if (bit_test(mask, i)) {
  613. /* fill in this blot */
  614. uint16_t start = (i / blot) * blot;
  615. if (start != prev) {
  616. bit_nset(mask, start, start+blot-1);
  617. prev = start;
  618. }
  619. }
  620. }
  621. }
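/* Example: with blot == 2 (i.e. 2 threads per core), a mask holding only
 * bit 5 is expanded to bits 4-5, the complete core containing the selected
 * thread. */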
  622. /* helper function for _expand_masks()
  623. * for each task, consider which other bits are set in avail_map
  624. * on the same socket */
  625. static void _blot_mask_sockets(const uint32_t maxtasks, const uint32_t task,
  626. bitstr_t **masks, uint16_t hw_sockets,
  627. uint16_t hw_cores, uint16_t hw_threads,
  628. bitstr_t *avail_map)
  629. {
  630. uint16_t i, j, size = 0;
  631. int blot;
  632. if (!masks[task])
  633. return;
  634. blot = bit_size(avail_map) / hw_sockets;
  635. size = bit_size(masks[task]);
  636. for (i = 0; i < size; i++) {
  637. if (bit_test(masks[task], i)) {
  638. /* check if other bits are set in avail_map on this
  639. * socket and set each corresponding bit in masks */
  640. uint16_t start = (i / blot) * blot;
  641. for (j = start; j < start+blot; j++) {
  642. if (bit_test(avail_map, j))
  643. bit_set(masks[task], j);
  644. }
  645. }
  646. }
  647. }
  648. /* for each mask, expand the mask around the set bits to include the
  649. * complete resource to which the set bits are to be bound */
  650. static void _expand_masks(uint16_t cpu_bind_type, const uint32_t maxtasks,
  651. bitstr_t **masks, uint16_t hw_sockets,
  652. uint16_t hw_cores, uint16_t hw_threads,
  653. bitstr_t *avail_map)
  654. {
  655. uint32_t i;
  656. if (cpu_bind_type & CPU_BIND_TO_THREADS)
  657. return;
  658. if (cpu_bind_type & CPU_BIND_TO_CORES) {
  659. if (hw_threads < 2)
  660. return;
  661. for (i = 0; i < maxtasks; i++) {
  662. _blot_mask(masks[i], hw_threads);
  663. }
  664. return;
  665. }
  666. if (cpu_bind_type & CPU_BIND_TO_SOCKETS) {
  667. if (hw_threads*hw_cores < 2)
  668. return;
  669. for (i = 0; i < maxtasks; i++) {
  670. _blot_mask_sockets(maxtasks, i, masks, hw_sockets,
  671. hw_cores, hw_threads, avail_map);
  672. }
  673. return;
  674. }
  675. }
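/* Example: with CPU_BIND_TO_CORES on a 2-thread-per-core node, a per-task
 * mask holding a single thread is blotted out to both threads of that core;
 * with CPU_BIND_TO_SOCKETS it grows to every available CPU on that socket. */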
  676. /*
  677. * _task_layout_lllp_multi
  678. *
  679. * A variant of _task_layout_lllp_cyclic for use with allocations having
  680. * more than one CPU per task; it places the tasks as close together as
  681. * possible (filling a core before moving to the next socket for the extra CPUs)
  682. *
  683. */
  684. static int _task_layout_lllp_multi(launch_tasks_request_msg_t *req,
  685. uint32_t node_id, bitstr_t ***masks_p)
  686. {
  687. int last_taskcount = -1, taskcount = 0;
  688. uint16_t c, i, s, t, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
  689. int size, max_tasks = req->tasks_to_launch[(int)node_id];
  690. int max_cpus = max_tasks * req->cpus_per_task;
  691. bitstr_t *avail_map;
  692. bitstr_t **masks = NULL;
  693. info ("_task_layout_lllp_multi ");
  694. avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
  695. if (!avail_map)
  696. return SLURM_ERROR;
  697. *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
  698. masks = *masks_p;
  699. size = bit_set_count(avail_map);
  700. if (size < max_tasks) {
  701. error("task/affinity: only %d bits in avail_map for %d tasks!",
  702. size, max_tasks);
  703. FREE_NULL_BITMAP(avail_map);
  704. return SLURM_ERROR;
  705. }
  706. if (size < max_cpus) {
  707. /* Possible result of overcommit */
  708. i = size / max_tasks;
  709. info("task/affinity: reset cpus_per_task from %d to %d",
  710. req->cpus_per_task, i);
  711. req->cpus_per_task = i;
  712. }
  713. size = bit_size(avail_map);
  714. i = 0;
  715. while (taskcount < max_tasks) {
  716. if (taskcount == last_taskcount)
  717. fatal("_task_layout_lllp_multi failure");
  718. last_taskcount = taskcount;
  719. for (s = 0; s < hw_sockets; s++) {
  720. for (c = 0; c < hw_cores; c++) {
  721. for (t = 0; t < hw_threads; t++) {
  722. uint16_t bit = s*(hw_cores*hw_threads) +
  723. c*(hw_threads) + t;
  724. if (bit_test(avail_map, bit) == 0)
  725. continue;
  726. if (masks[taskcount] == NULL) {
  727. masks[taskcount] =
  728. bit_alloc(conf->block_map_size);
  729. }
  730. bit_set(masks[taskcount], bit);
  731. if (++i < req->cpus_per_task)
  732. continue;
  733. i = 0;
  734. if (++taskcount >= max_tasks)
  735. break;
  736. }
  737. if (taskcount >= max_tasks)
  738. break;
  739. }
  740. if (taskcount >= max_tasks)
  741. break;
  742. }
  743. }
  744. /* last step: expand the masks to bind each task
  745. * to the requested resource */
  746. _expand_masks(req->cpu_bind_type, max_tasks, masks,
  747. hw_sockets, hw_cores, hw_threads, avail_map);
  748. FREE_NULL_BITMAP(avail_map);
  749. return SLURM_SUCCESS;
  750. }
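/* Worked example (hypothetical node, 2 sockets x 2 cores x 1 thread):
 * 2 tasks with cpus_per_task = 2 are packed by the socket-major scan above,
 * giving task 0 both cores of socket 0 and task 1 both cores of socket 1,
 * rather than spreading one task's CPUs across sockets. */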
  751. /*
  752. * _task_layout_lllp_cyclic
  753. *
  754. * task_layout_lllp_cyclic creates a cyclic distribution at the
  755. * lowest level of logical processor which is either socket, core or
  756. * thread depending on the system architecture. The Cyclic algorithm
  757. * is the same as the Cyclic distribution performed in srun.
  758. *
  759. * Distribution at the lllp:
  760. * -m hostfile|plane|block|cyclic:block|cyclic
  761. *
  762. * The first distribution "hostfile|plane|block|cyclic" is computed
  763. * in srun. The second distribution "plane|block|cyclic" is computed
  764. * locally by each slurmd.
  765. *
  766. * The input to the lllp distribution algorithms is the gids (task
  767. * ids) generated for the local node.
  768. *
  769. * The output is a mapping of the gids onto logical processors
  770. * (thread/core/socket), which is expressed as cpu_bind masks.
  771. *
  772. */
  773. static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
  774. uint32_t node_id, bitstr_t ***masks_p)
  775. {
  776. int last_taskcount = -1, taskcount = 0;
  777. uint16_t c, i, s, t, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
  778. int size, max_tasks = req->tasks_to_launch[(int)node_id];
  779. int max_cpus = max_tasks * req->cpus_per_task;
  780. int avail_size;
  781. bitstr_t *avail_map;
  782. bitstr_t **masks = NULL;
  783. info ("_task_layout_lllp_cyclic ");
  784. avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
  785. if (!avail_map)
  786. return SLURM_ERROR;
  787. avail_size = bit_size(avail_map);
  788. *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
  789. masks = *masks_p;
  790. size = bit_set_count(avail_map);
  791. if (size < max_tasks) {
  792. error("task/affinity: only %d bits in avail_map for %d tasks!",
  793. size, max_tasks);
  794. FREE_NULL_BITMAP(avail_map);
  795. return SLURM_ERROR;
  796. }
  797. if (size < max_cpus) {
  798. /* Possible result of overcommit */
  799. i = size / max_tasks;
  800. info("task/affinity: reset cpus_per_task from %d to %d",
  801. req->cpus_per_task, i);
  802. req->cpus_per_task = i;
  803. }
  804. size = bit_size(avail_map);
  805. i = 0;
  806. while (taskcount < max_tasks) {
  807. if (taskcount == last_taskcount)
  808. fatal("_task_layout_lllp_cyclic failure");
  809. last_taskcount = taskcount;
  810. for (t = 0; t < hw_threads; t++) {
  811. for (c = 0; c < hw_cores; c++) {
  812. for (s = 0; s < hw_sockets; s++) {
  813. uint16_t bit = s*(hw_cores*hw_threads) +
  814. c*(hw_threads) + t;
  815. /* In case hardware and config differ */
  816. bit %= avail_size;
  817. if (bit_test(avail_map, bit) == 0)
  818. continue;
  819. if (masks[taskcount] == NULL) {
  820. masks[taskcount] =
  821. (bitstr_t *)
  822. bit_alloc(conf->
  823. block_map_size);
  824. }
  825. bit_set(masks[taskcount], bit);
  826. if (++i < req->cpus_per_task)
  827. continue;
  828. i = 0;
  829. if (++taskcount >= max_tasks)
  830. break;
  831. }
  832. if (taskcount >= max_tasks)
  833. break;
  834. }
  835. if (taskcount >= max_tasks)
  836. break;
  837. }
  838. }
  839. /* last step: expand the masks to bind each task
  840. * to the requested resource */
  841. _expand_masks(req->cpu_bind_type, max_tasks, masks,
  842. hw_sockets, hw_cores, hw_threads, avail_map);
  843. FREE_NULL_BITMAP(avail_map);
  844. return SLURM_SUCCESS;
  845. }
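/* Worked example (hypothetical node, 2 sockets x 2 cores x 1 thread):
 * 4 single-CPU tasks are assigned in socket-cyclic order by the loop above:
 * task 0 -> socket 0 core 0, task 1 -> socket 1 core 0,
 * task 2 -> socket 0 core 1, task 3 -> socket 1 core 1. */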
  846. /*
  847. * _task_layout_lllp_block
  848. *
  849. * task_layout_lllp_block will create a block distribution at the
  850. * lowest level of logical processor which is either socket, core or
  851. * thread depending on the system architecture. The Block algorithm
  852. * is the same as the Block distribution performed in srun.
  853. *
  854. * Distribution at the lllp:
  855. * -m hostfile|plane|block|cyclic:block|cyclic
  856. *
  857. * The first distribution "hostfile|plane|block|cyclic" is computed
  858. * in srun. The second distribution "plane|block|cyclic" is computed
  859. * locally by each slurmd.
  860. *
  861. * The input to the lllp distribution algorithms is the gids (task
  862. * ids) generated for the local node.
  863. *
  864. * The output is a mapping of the gids onto logical processors
  865. * (thread/core/socket), which is expressed as cpu_bind masks.
  866. *
  867. */
  868. static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
  869. uint32_t node_id, bitstr_t ***masks_p)
  870. {
  871. int c, i, j, t, size, last_taskcount = -1, taskcount = 0;
  872. uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
  873. int max_tasks = req->tasks_to_launch[(int)node_id];
  874. int max_cpus = max_tasks * req->cpus_per_task;
  875. int *task_array;
  876. bitstr_t *avail_map;
  877. bitstr_t **masks = NULL;
  878. info("_task_layout_lllp_block ");
  879. avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
  880. if (!avail_map) {
  881. return SLURM_ERROR;
  882. }
  883. size = bit_set_count(avail_map);
  884. if (size < max_tasks) {
  885. error("task/affinity: only %d bits in avail_map for %d tasks!",
  886. size, max_tasks);
  887. FREE_NULL_BITMAP(avail_map);
  888. return SLURM_ERROR;
  889. }
  890. if (size < max_cpus) {
  891. /* Possible result of overcommit */
  892. i = size / max_tasks;
  893. info("task/affinity: reset cpus_per_task from %d to %d",
  894. req->cpus_per_task, i);
  895. req->cpus_per_task = i;
  896. }
  897. size = bit_size(avail_map);
  898. *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
  899. masks = *masks_p;
  900. task_array = xmalloc(size * sizeof(int));
  901. if (!task_array) {
  902. error("In lllp_block: task_array memory error");
  903. FREE_NULL_BITMAP(avail_map);
  904. return SLURM_ERROR;
  905. }
  906. /* block distribution with oversubscription */
  907. c = 0;
  908. while(taskcount < max_tasks) {
  909. if (taskcount == last_taskcount) {
  910. fatal("_task_layout_lllp_block infinite loop");
  911. }
  912. last_taskcount = taskcount;
  913. /* the abstract map is already laid out in block order,
  914. * so just iterate over it
  915. */
  916. for (i = 0; i < size; i++) {
  917. /* skip unrequested threads */
  918. if (i%hw_threads >= hw_threads)
  919. continue;
  920. /* skip unavailable resources */
  921. if (bit_test(avail_map, i) == 0)
  922. continue;
  923. /* if multiple CPUs per task, only
  924. * count the task on the first CPU */
  925. if (c == 0)
  926. task_array[i] += 1;
  927. if (++c < req->cpus_per_task)
  928. continue;
  929. c = 0;
  930. if (++taskcount >= max_tasks)
  931. break;
  932. }
  933. }
  934. /* Distribute the tasks and create per-task masks that only
  935. * contain the first CPU. Note that unused resources
  936. * (task_array[i] == 0) will get skipped */
  937. taskcount = 0;
  938. for (i = 0; i < size; i++) {
  939. for (t = 0; t < task_array[i]; t++) {
  940. if (masks[taskcount] == NULL)
  941. masks[taskcount] = (bitstr_t *)bit_alloc(conf->block_map_size);
  942. bit_set(masks[taskcount++], i);
  943. }
  944. }
  945. /* now set additional CPUs for cpus_per_task > 1 */
  946. for (t=0; t<max_tasks && req->cpus_per_task>1; t++) {
  947. if (!masks[t])
  948. continue;
  949. c = 0;
  950. for (i = 0; i < size && c<req->cpus_per_task; i++) {
  951. if (bit_test(masks[t], i) == 0)
  952. continue;
  953. for (j=i+1,c=1; j<size && c<req->cpus_per_task;j++) {
  954. if (bit_test(avail_map, j) == 0)
  955. continue;
  956. bit_set(masks[t], j);
  957. c++;
  958. }
  959. if (c < req->cpus_per_task) {
  960. /* we haven't found all of the CPUs for this
  961. * task, so we'll wrap the search to cover the
  962. * whole node */
  963. for (j=0; j<i && c<req->cpus_per_task; j++) {
  964. if (bit_test(avail_map, j) == 0)
  965. continue;
  966. bit_set(masks[t], j);
  967. c++;
  968. }
  969. }
  970. }
  971. }
  972. xfree(task_array);
  973. /* last step: expand the masks to bind each task
  974. * to the requested resource */
  975. _expand_masks(req->cpu_bind_type, max_tasks, masks,
  976. hw_sockets, hw_cores, hw_threads, avail_map);
  977. FREE_NULL_BITMAP(avail_map);
  978. return SLURM_SUCCESS;
  979. }
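/* Worked example (hypothetical node, 2 sockets x 2 cores x 1 thread):
 * the same 4 single-CPU tasks are laid out in block order here:
 * task 0 -> socket 0 core 0, task 1 -> socket 0 core 1,
 * task 2 -> socket 1 core 0, task 3 -> socket 1 core 1. */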
  980. /*
  981. * _lllp_map_abstract_mask
  982. *
  983. * Map one abstract block mask to a physical machine mask
  984. *
  985. * IN - mask to map
  986. * RET - mapped mask (storage allocated in this routine)
  987. */
  988. static bitstr_t *_lllp_map_abstract_mask(bitstr_t *bitmask)
  989. {
  990. int i, bit;
  991. int num_bits = bit_size(bitmask);
  992. bitstr_t *newmask = NULL;
  993. newmask = (bitstr_t *) bit_alloc(num_bits);
  994. /* remap to physical machine */
  995. for (i = 0; i < num_bits; i++) {
  996. if (bit_test(bitmask,i)) {
  997. bit = BLOCK_MAP(i);
  998. if(bit < bit_size(newmask))
  999. bit_set(newmask, bit);
  1000. else
  1001. error("_lllp_map_abstract_mask: can't go from "
  1002. "%d -> %d since we only have %d bits",
  1003. i, bit, bit_size(newmask));
  1004. }
  1005. }
  1006. return newmask;
  1007. }
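/* Worked example (hypothetical block map {0,4,1,5,2,6,3,7}, i.e. the kernel
 * numbers the second thread of each core after all first threads): an
 * abstract mask with bits 0-1 set (both threads of core 0) maps to a machine
 * mask with bits 0 and 4 set. */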
  1008. /*
  1009. * _lllp_map_abstract_masks
  1010. *
  1011. * Map an array of abstract block masks to physical machine masks
  1012. *
  1013. * IN- maximum number of tasks
  1014. * IN/OUT- array of masks
  1015. */
  1016. static void _lllp_map_abstract_masks(const uint32_t maxtasks, bitstr_t **masks)
  1017. {
  1018. int i;
  1019. debug3("_lllp_map_abstract_masks");
  1020. for (i = 0; i < maxtasks; i++) {
  1021. bitstr_t *bitmask = masks[i];
  1022. if (bitmask) {
  1023. bitstr_t *newmask = _lllp_map_abstract_mask(bitmask);
  1024. FREE_NULL_BITMAP(bitmask);
  1025. masks[i] = newmask;
  1026. }
  1027. }
  1028. }
  1029. /*
  1030. * _lllp_generate_cpu_bind
  1031. *
  1032. * Generate the cpu_bind type and string given an array of bitstr_t masks
  1033. *
  1034. * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
  1035. * IN- maximum number of tasks
  1036. * IN- array of masks
  1037. */
  1038. static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
  1039. const uint32_t maxtasks, bitstr_t **masks)
  1040. {
  1041. int i, num_bits=0, masks_len;
  1042. bitstr_t *bitmask;
  1043. bitoff_t charsize;
  1044. char *masks_str = NULL;
  1045. char buf_type[100];
  1046. for (i = 0; i < maxtasks; i++) {
  1047. bitmask = masks[i];
  1048. if (bitmask) {
  1049. num_bits = bit_size(bitmask);
  1050. break;
  1051. }
  1052. }
  1053. charsize = (num_bits + 3) / 4; /* ASCII hex digits */
  1054. charsize += 3; /* "0x" and trailing "," */
  1055. masks_len = maxtasks * charsize + 1; /* number of masks + null */
  1056. debug3("_lllp_generate_cpu_bind %d %d %d", maxtasks, charsize,
  1057. masks_len);
  1058. masks_str = xmalloc(masks_len);
  1059. masks_len = 0;
  1060. for (i = 0; i < maxtasks; i++) {
  1061. char *str;
  1062. int curlen;
  1063. bitmask = masks[i];
  1064. if (bitmask == NULL) {
  1065. continue;
  1066. }
  1067. str = (char *)bit_fmt_hexmask(bitmask);
  1068. curlen = strlen(str) + 1;
  1069. if (masks_len > 0)
  1070. masks_str[masks_len-1]=',';
  1071. strncpy(&masks_str[masks_len], str, curlen);
  1072. masks_len += curlen;
  1073. xassert(masks_str[masks_len] == '\0');
  1074. xfree(str);
  1075. }
  1076. if (req->cpu_bind) {
  1077. xfree(req->cpu_bind);
  1078. }
  1079. if (masks_str[0] != '\0') {
  1080. req->cpu_bind = masks_str;
  1081. req->cpu_bind_type |= CPU_BIND_MASK;
  1082. } else {
  1083. req->cpu_bind = NULL;
  1084. req->cpu_bind_type &= ~CPU_BIND_VERBOSE;
  1085. }
  1086. /* clear mask generation bits */
  1087. req->cpu_bind_type &= ~CPU_BIND_TO_THREADS;
  1088. req->cpu_bind_type &= ~CPU_BIND_TO_CORES;
  1089. req->cpu_bind_type &= ~CPU_BIND_TO_SOCKETS;
  1090. req->cpu_bind_type &= ~CPU_BIND_TO_LDOMS;
  1091. slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
  1092. info("_lllp_generate_cpu_bind jobid [%u]: %s, %s",
  1093. req->job_id, buf_type, masks_str);
  1094. }
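/* Worked example (hypothetical masks): for two tasks whose final masks cover
 * CPUs 0-1 and CPUs 2-3, this sets CPU_BIND_MASK and builds a comma-separated
 * cpu_bind string such as "0x3,0xC" -- one hex mask per task in task-rank
 * order (the exact digit count depends on bit_fmt_hexmask() and the mask
 * width). */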