
/src/common/slurm_step_layout.c

https://github.com/cfenoy/slurm
C | 747 lines | 537 code | 75 blank | 135 comment
Possible License(s): GPL-2.0, AGPL-1.0
/*****************************************************************************\
 *  slurm_step_layout.c - functions to distribute tasks over nodes.
 *  $Id$
 *****************************************************************************
 *
 *  Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
 *  Written by Chris Holmes, <cholmes@hp.com>, who borrowed heavily
 *  from other parts of SLURM.
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of SLURM, a resource management program.
 *  For details, see <http://www.schedmd.com/slurmdocs/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
 *
 *  This file is patterned after hostlist.c, written by Mark Grondona and
 *  Copyright (C) 2002 The Regents of the University of California.
\*****************************************************************************/

#ifdef HAVE_CONFIG_H
#  include "config.h"
#  if HAVE_STRING_H
#    include <string.h>
#  endif
#else				/* !HAVE_CONFIG_H */
#  include <string.h>
#endif				/* HAVE_CONFIG_H */

#include <stdlib.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"

#include "src/common/slurm_step_layout.h"
#include "src/common/log.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/node_select.h"
#include "src/common/slurmdb_defs.h"

/* build maps for task layout on nodes */
static int _init_task_layout(slurm_step_layout_t *step_layout,
			     const char *arbitrary_nodes,
			     uint16_t *cpus_per_node, uint32_t *cpu_count_reps,
			     uint16_t cpus_per_task,
			     uint16_t task_dist, uint16_t plane_size);

static int _task_layout_block(slurm_step_layout_t *step_layout,
			      uint16_t *cpus);
static int _task_layout_cyclic(slurm_step_layout_t *step_layout,
			       uint16_t *cpus);
static int _task_layout_plane(slurm_step_layout_t *step_layout,
			      uint16_t *cpus);
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes);

/*
 * slurm_step_layout_create - determine how many tasks of a job will be
 *	run on each node. Distribution is influenced by the number of
 *	cpus on each host.
 * IN tlist - hostlist corresponding to task layout
 * IN cpus_per_node - cpus per node
 * IN cpu_count_reps - how many nodes have the same cpu count
 * IN num_hosts - number of hosts we have
 * IN num_tasks - number of tasks to distribute across these cpus
 * IN cpus_per_task - number of cpus per task
 * IN task_dist - type of distribution we are using
 * IN plane_size - plane size (only needed for the plane distribution)
 * RET a pointer to a slurm_step_layout_t structure
 * NOTE: allocates memory that should be xfreed by caller
 */
slurm_step_layout_t *slurm_step_layout_create(
	const char *tlist,
	uint16_t *cpus_per_node, uint32_t *cpu_count_reps,
	uint32_t num_hosts,
	uint32_t num_tasks,
	uint16_t cpus_per_task,
	uint16_t task_dist,
	uint16_t plane_size)
{
	char *arbitrary_nodes = NULL;
	slurm_step_layout_t *step_layout =
		xmalloc(sizeof(slurm_step_layout_t));
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	step_layout->task_dist = task_dist;
	if (task_dist == SLURM_DIST_ARBITRARY) {
		hostlist_t hl = NULL;
		char *buf = NULL;
		/* save the node list for the task layout later; the
		 * user-supplied list could be different than the job
		 * allocation */
		arbitrary_nodes = xstrdup(tlist);

		hl = hostlist_create(tlist);
		hostlist_uniq(hl);
		buf = hostlist_ranged_string_xmalloc(hl);
		num_hosts = hostlist_count(hl);
		hostlist_destroy(hl);
		step_layout->node_list = buf;
	} else {
		step_layout->node_list = xstrdup(tlist);
	}

	step_layout->task_cnt = num_tasks;
	if (cluster_flags & CLUSTER_FLAG_FE) {
		/* Limited job step support on front-end systems.
		 * All jobs execute through front-end on Blue Gene.
		 * Normally we would not permit execution of job steps,
		 * but can fake it by just allocating all tasks to
		 * one of the allocated nodes. */
		if (cluster_flags & CLUSTER_FLAG_BG)
			step_layout->node_cnt = num_hosts;
		else
			step_layout->node_cnt = 1;
	} else
		step_layout->node_cnt = num_hosts;

	if (_init_task_layout(step_layout, arbitrary_nodes,
			      cpus_per_node, cpu_count_reps,
			      cpus_per_task,
			      task_dist, plane_size) != SLURM_SUCCESS) {
		slurm_step_layout_destroy(step_layout);
		step_layout = NULL;
	}
	xfree(arbitrary_nodes);
	return step_layout;
}
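
/* Usage sketch (illustrative only, not part of the original file): build a
 * block layout of 8 tasks over a homogeneous 4-node, 4-cpu allocation.
 * The hostname range and local variable names here are hypothetical. */
#if 0
	uint16_t cpus_per_node[] = {4};
	uint32_t cpu_count_reps[] = {4};	/* all 4 nodes have 4 cpus */
	slurm_step_layout_t *sl =
		slurm_step_layout_create("host[1-4]", cpus_per_node,
					 cpu_count_reps, 4, 8, 1,
					 SLURM_DIST_BLOCK, 1);
	if (sl)		/* sl->tasks is now {2, 2, 2, 2} */
		slurm_step_layout_destroy(sl);
#endif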

/*
 * fake_slurm_step_layout_create - used when a job is not allocated through
 *	the controller. Does not set up anything that should really be
 *	used with a switch plugin, nor does it lay out tasks in any
 *	particular fashion.
 * IN tlist - hostlist corresponding to task layout
 * IN cpus_per_node - cpus per node, NULL if no allocation
 * IN cpu_count_reps - how many nodes have the same cpu count, NULL if no
 *	allocation
 * IN node_cnt - number of nodes we have
 * IN task_cnt - number of tasks to distribute across these cpus, 0
 *	if using cpus_per_node
 * RET a pointer to a slurm_step_layout_t structure
 * NOTE: allocates memory that should be xfreed by caller
 */
slurm_step_layout_t *fake_slurm_step_layout_create(
	const char *tlist,
	uint16_t *cpus_per_node,
	uint32_t *cpu_count_reps,
	uint32_t node_cnt,
	uint32_t task_cnt)
{
	uint32_t cpn = 1;
	int cpu_cnt = 0, cpu_inx = 0, i, j;
	slurm_step_layout_t *step_layout = NULL;

	if ((node_cnt <= 0) || ((task_cnt <= 0) && !cpus_per_node) || !tlist) {
		error("there is a problem with your fake_step_layout request\n"
		      "node_cnt = %u, task_cnt = %u, tlist = %s",
		      node_cnt, task_cnt, tlist);
		return NULL;
	}

	step_layout = xmalloc(sizeof(slurm_step_layout_t));
	step_layout->node_list = xstrdup(tlist);
	step_layout->node_cnt = node_cnt;
	step_layout->tasks = xmalloc(sizeof(uint16_t) * node_cnt);
	step_layout->tids = xmalloc(sizeof(uint32_t *) * node_cnt);

	step_layout->task_cnt = 0;
	for (i = 0; i < step_layout->node_cnt; i++) {
		if (cpus_per_node && cpu_count_reps) {
			/* we have allocation info: one task per cpu */
			step_layout->tasks[i] = cpus_per_node[cpu_inx];
			step_layout->tids[i] = xmalloc(sizeof(uint32_t) *
						       step_layout->tasks[i]);
			for (j = 0; j < step_layout->tasks[i]; j++)
				step_layout->tids[i][j] =
					step_layout->task_cnt++;

			if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) {
				/* move to next record */
				cpu_inx++;
				cpu_cnt = 0;
			}
		} else {
			/* spread the remaining tasks evenly over the
			 * remaining nodes (ceiling division) */
			cpn = ((task_cnt - step_layout->task_cnt) +
			       (node_cnt - i) - 1) / (node_cnt - i);
			if (step_layout->task_cnt >= task_cnt) {
				step_layout->tasks[i] = 0;
				step_layout->tids[i] = NULL;
			} else {
				step_layout->tasks[i] = cpn;
				step_layout->tids[i] =
					xmalloc(sizeof(uint32_t) * cpn);
				for (j = 0; j < cpn; j++) {
					step_layout->tids[i][j] =
						step_layout->task_cnt++;
					if (step_layout->task_cnt >=
					    task_cnt) {
						step_layout->tasks[i] = j + 1;
						break;
					}
				}
			}
		}
	}
	return step_layout;
}
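
/* Worked example (illustrative): task_cnt = 10 over node_cnt = 4 with no
 * allocation info.  The ceiling division above yields cpn = 3, 3, 2, 2,
 * so the tids become {0,1,2}, {3,4,5}, {6,7}, {8,9}. */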

/* copies structure for step layout */
extern slurm_step_layout_t *slurm_step_layout_copy(
	slurm_step_layout_t *step_layout)
{
	slurm_step_layout_t *layout;
	int i = 0;
	if (!step_layout)
		return NULL;

	layout = xmalloc(sizeof(slurm_step_layout_t));
	/* front_end is xfreed by slurm_step_layout_destroy(),
	 * so duplicate it here as well */
	layout->front_end = xstrdup(step_layout->front_end);
	layout->node_list = xstrdup(step_layout->node_list);
	layout->node_cnt = step_layout->node_cnt;
	layout->task_cnt = step_layout->task_cnt;
	layout->task_dist = step_layout->task_dist;

	layout->tasks = xmalloc(sizeof(uint16_t) * layout->node_cnt);
	memcpy(layout->tasks, step_layout->tasks,
	       (sizeof(uint16_t) * layout->node_cnt));

	layout->tids = xmalloc(sizeof(uint32_t *) * layout->node_cnt);
	for (i = 0; i < layout->node_cnt; i++) {
		layout->tids[i] = xmalloc(sizeof(uint32_t) * layout->tasks[i]);
		memcpy(layout->tids[i], step_layout->tids[i],
		       (sizeof(uint32_t) * layout->tasks[i]));
	}
	return layout;
}

extern void pack_slurm_step_layout(slurm_step_layout_t *step_layout,
				   Buf buffer, uint16_t protocol_version)
{
	uint32_t i = 0;

	if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
		if (step_layout)
			i = 1;

		pack16(i, buffer);
		if (!i)
			return;
		packstr(step_layout->front_end, buffer);
		packstr(step_layout->node_list, buffer);
		pack32(step_layout->node_cnt, buffer);
		pack32(step_layout->task_cnt, buffer);
		pack16(step_layout->task_dist, buffer);

		for (i = 0; i < step_layout->node_cnt; i++) {
			pack32_array(step_layout->tids[i],
				     step_layout->tasks[i],
				     buffer);
		}
	} else {
		error("pack_slurm_step_layout: protocol_version "
		      "%hu not supported", protocol_version);
	}
}

extern int unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer,
				    uint16_t protocol_version)
{
	uint16_t uint16_tmp;
	uint32_t num_tids, uint32_tmp;
	slurm_step_layout_t *step_layout = NULL;
	int i;

	if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
		safe_unpack16(&uint16_tmp, buffer);
		if (!uint16_tmp)
			return SLURM_SUCCESS;

		step_layout = xmalloc(sizeof(slurm_step_layout_t));
		*layout = step_layout;

		safe_unpackstr_xmalloc(&step_layout->front_end,
				       &uint32_tmp, buffer);
		safe_unpackstr_xmalloc(&step_layout->node_list,
				       &uint32_tmp, buffer);
		safe_unpack32(&step_layout->node_cnt, buffer);
		safe_unpack32(&step_layout->task_cnt, buffer);
		safe_unpack16(&step_layout->task_dist, buffer);

		/* tasks is an array of uint16_t, so size it accordingly */
		step_layout->tasks =
			xmalloc(sizeof(uint16_t) * step_layout->node_cnt);
		step_layout->tids = xmalloc(sizeof(uint32_t *)
					    * step_layout->node_cnt);
		for (i = 0; i < step_layout->node_cnt; i++) {
			safe_unpack32_array(&(step_layout->tids[i]),
					    &num_tids,
					    buffer);
			step_layout->tasks[i] = num_tids;
		}
	} else {
		error("unpack_slurm_step_layout: protocol_version "
		      "%hu not supported", protocol_version);
		goto unpack_error;
	}
	return SLURM_SUCCESS;

unpack_error:
	slurm_step_layout_destroy(step_layout);
	*layout = NULL;
	return SLURM_ERROR;
}
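
/* Illustrative round-trip sketch (not part of the original file): pack a
 * layout into a buffer and unpack it again.  Assumes the Buf helpers from
 * src/common/pack.h (init_buf, set_buf_offset, free_buf); error handling
 * is omitted for brevity. */
#if 0
	Buf buffer = init_buf(BUF_SIZE);
	slurm_step_layout_t *copy = NULL;

	pack_slurm_step_layout(step_layout, buffer, SLURM_PROTOCOL_VERSION);
	set_buf_offset(buffer, 0);	/* rewind before unpacking */
	unpack_slurm_step_layout(&copy, buffer, SLURM_PROTOCOL_VERSION);

	slurm_step_layout_destroy(copy);
	free_buf(buffer);
#endif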

/* destroys structure for step layout */
extern int slurm_step_layout_destroy(slurm_step_layout_t *step_layout)
{
	int i = 0;
	if (step_layout) {
		xfree(step_layout->front_end);
		xfree(step_layout->node_list);
		xfree(step_layout->tasks);
		for (i = 0; i < step_layout->node_cnt; i++) {
			xfree(step_layout->tids[i]);
		}
		xfree(step_layout->tids);
		xfree(step_layout);
	}

	return SLURM_SUCCESS;
}

int slurm_step_layout_host_id (slurm_step_layout_t *s, int taskid)
{
	int i, j;
	if (!s->tasks || !s->tids || (taskid > (s->task_cnt - 1)))
		return SLURM_ERROR;
	for (i = 0; i < s->node_cnt; i++)
		for (j = 0; j < s->tasks[i]; j++)
			if (s->tids[i][j] == taskid)
				return i;

	return SLURM_ERROR;
}

char *slurm_step_layout_host_name (slurm_step_layout_t *s, int taskid)
{
	int hostid = slurm_step_layout_host_id (s, taskid);

	if (hostid < 0)
		return NULL;

	return nodelist_nth_host(s->node_list, hostid);
}
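
/* Illustrative usage (not part of the original file): map a global task id
 * back to the node it runs on.  The returned name comes from
 * nodelist_nth_host() and must be released with free(). */
#if 0
	char *host = slurm_step_layout_host_name(step_layout, 5);
	if (host) {
		info("task 5 runs on %s", host);
		free(host);
	}
#endif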

/* build maps for task layout on nodes */
static int _init_task_layout(slurm_step_layout_t *step_layout,
			     const char *arbitrary_nodes,
			     uint16_t *cpus_per_node, uint32_t *cpu_count_reps,
			     uint16_t cpus_per_task,
			     uint16_t task_dist, uint16_t plane_size)
{
	int cpu_cnt = 0, cpu_inx = 0, i;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
/*	char *name = NULL; */
	uint16_t cpus[step_layout->node_cnt];

	if (step_layout->node_cnt == 0)
		return SLURM_ERROR;
	if (step_layout->tasks)	/* layout already completed */
		return SLURM_SUCCESS;

	if (((int)cpus_per_task < 1) || (cpus_per_task == (uint16_t)NO_VAL))
		cpus_per_task = 1;

	step_layout->plane_size = plane_size;

	step_layout->tasks = xmalloc(sizeof(uint16_t)
				     * step_layout->node_cnt);
	step_layout->tids  = xmalloc(sizeof(uint32_t *)
				     * step_layout->node_cnt);

	if (!(cluster_flags & CLUSTER_FLAG_BG)) {
		hostlist_t hl = hostlist_create(step_layout->node_list);
		/* make sure the number of nodes we think we have
		 * is the correct number */
		i = hostlist_count(hl);
		if (step_layout->node_cnt > i)
			step_layout->node_cnt = i;
		hostlist_destroy(hl);
	}
	debug("laying out the %u tasks on %u hosts %s dist %u",
	      step_layout->task_cnt, step_layout->node_cnt,
	      step_layout->node_list, task_dist);
	if (step_layout->node_cnt < 1) {
		error("no hostlist given, can't lay out tasks");
		return SLURM_ERROR;
	}

	for (i = 0; i < step_layout->node_cnt; i++) {
/*		name = hostlist_shift(hl); */
/*		if (!name) { */
/*			error("hostlist incomplete for this job request"); */
/*			hostlist_destroy(hl); */
/*			return SLURM_ERROR; */
/*		} */
/*		debug2("host %d = %s", i, name); */
/*		free(name); */
		cpus[i] = (cpus_per_node[cpu_inx] / cpus_per_task);
		if (cpus[i] == 0) {
			/* this can be a result of a heterogeneous allocation
			 * (e.g. 4 cpus on one node and 2 on the second with
			 * cpus_per_task=3) */
			cpus[i] = 1;
		}
		/* info("got %d cpus", cpus[i]); */
		if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) {
			/* move to next record */
			cpu_inx++;
			cpu_cnt = 0;
		}
	}

	if ((task_dist == SLURM_DIST_CYCLIC) ||
	    (task_dist == SLURM_DIST_CYCLIC_CYCLIC) ||
	    (task_dist == SLURM_DIST_CYCLIC_BLOCK))
		return _task_layout_cyclic(step_layout, cpus);
	else if ((task_dist == SLURM_DIST_ARBITRARY) &&
		 !(cluster_flags & CLUSTER_FLAG_FE))
		return _task_layout_hostfile(step_layout, arbitrary_nodes);
	else if (task_dist == SLURM_DIST_PLANE)
		return _task_layout_plane(step_layout, cpus);
	else
		return _task_layout_block(step_layout, cpus);
}
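
/* Worked example (illustrative): cpus_per_node and cpu_count_reps are a
 * run-length encoded description of the allocation.  With
 * cpus_per_node = {4, 2} and cpu_count_reps = {2, 3}, the loop above
 * expands to per-node cpu counts {4, 4, 2, 2, 2}; with cpus_per_task = 2
 * that becomes cpus[] = {2, 2, 1, 1, 1} task slots per node. */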

/* run tasks on the specific set of hosts listed in the hostfile
 * XXX: Need to handle over-subscribe.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	int i = 0, j, taskid = 0, task_cnt = 0;
	hostlist_iterator_t itr = NULL, itr_task = NULL;
	char *host = NULL;
	char *host_task = NULL;
	hostlist_t job_alloc_hosts = NULL;
	hostlist_t step_alloc_hosts = NULL;

	debug2("job list is %s", step_layout->node_list);
	job_alloc_hosts = hostlist_create(step_layout->node_list);
	itr = hostlist_iterator_create(job_alloc_hosts);
	if (!arbitrary_nodes) {
		error("no hostlist given for arbitrary dist");
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	step_alloc_hosts = hostlist_create(arbitrary_nodes);
	if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) {
		error("Asked for %u tasks have %d in the nodelist. "
		      "Check your nodelist, or set the -n option to be %d",
		      step_layout->task_cnt,
		      hostlist_count(step_alloc_hosts),
		      hostlist_count(step_alloc_hosts));
		return SLURM_ERROR;
	}
	itr_task = hostlist_iterator_create(step_alloc_hosts);
	while ((host = hostlist_next(itr))) {
		step_layout->tasks[i] = 0;
		while ((host_task = hostlist_next(itr_task))) {
			if (!strcmp(host, host_task)) {
				step_layout->tasks[i]++;
				task_cnt++;
			}
			free(host_task);
			if (task_cnt >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", host, step_layout->tasks[i]);
		if (step_layout->tasks[i] == 0)
			goto reset_hosts;
		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
					       * step_layout->tasks[i]);
		taskid = 0;
		j = 0;
		hostlist_iterator_reset(itr_task);
		while ((host_task = hostlist_next(itr_task))) {
			if (!strcmp(host, host_task)) {
				step_layout->tids[i][j] = taskid;
				j++;
			}
			taskid++;
			free(host_task);
			if (j >= step_layout->tasks[i])
				break;
		}
		i++;
	reset_hosts:
		hostlist_iterator_reset(itr_task);
		free(host);
		if (i > step_layout->task_cnt)
			break;
	}

	hostlist_iterator_destroy(itr);
	hostlist_iterator_destroy(itr_task);
	hostlist_destroy(job_alloc_hosts);
	hostlist_destroy(step_alloc_hosts);

	if (task_cnt != step_layout->task_cnt) {
		error("Asked for %u tasks but placed %d. Check your nodelist",
		      step_layout->task_cnt, task_cnt);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
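
/* Worked example (illustrative): with a job node list of "n[1-2]" and an
 * arbitrary hostfile of "n1,n2,n1" (3 tasks), node n1 receives tids {0, 2}
 * and node n2 receives tid {1}. */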

/* to effectively deal with heterogeneous nodes, we fake a cyclic
 * distribution to figure out how many tasks go on each node and
 * then make those assignments in a block fashion */
static int _task_layout_block(slurm_step_layout_t *step_layout, uint16_t *cpus)
{
	int i, j, taskid = 0;
	bool over_subscribe = false;

	/* figure out how many tasks go to each node */
	for (j = 0; (taskid < step_layout->task_cnt); j++) { /* cycle counter */
		bool space_remaining = false;
		for (i = 0; ((i < step_layout->node_cnt)
			     && (taskid < step_layout->task_cnt)); i++) {
			if ((j < cpus[i]) || over_subscribe) {
				taskid++;
				step_layout->tasks[i]++;
				if ((j + 1) < cpus[i])
					space_remaining = true;
			}
		}
		if (!space_remaining)
			over_subscribe = true;
	}

	/* now distribute the tasks */
	taskid = 0;
	for (i = 0; i < step_layout->node_cnt; i++) {
		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
					       * step_layout->tasks[i]);
		for (j = 0; j < step_layout->tasks[i]; j++) {
			step_layout->tids[i][j] = taskid;
			taskid++;
		}
	}
	return SLURM_SUCCESS;
}
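
/* Worked example (illustrative): cpus = {4, 2, 4, 2} and 10 tasks.  The
 * cyclic pass yields tasks per node of {3, 2, 3, 2}; the block pass then
 * assigns consecutive ids: node0 = {0,1,2}, node1 = {3,4},
 * node2 = {5,6,7}, node3 = {8,9}. */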

/* distribute tasks across available nodes: allocate tasks to nodes
 * in a cyclic fashion using available processors. once all available
 * processors are allocated, continue to allocate tasks, over-subscribing
 * nodes as needed. for example
 * cpus per node        4  2  4  2
 *                     -- -- -- --
 * task distribution:   0  1  2  3
 *                      4  5  6  7
 *                      8     9
 *                     10    11     all processors allocated now
 *                     12 13 14 15  etc.
 */
static int _task_layout_cyclic(slurm_step_layout_t *step_layout,
			       uint16_t *cpus)
{
	int i, j, taskid = 0;
	bool over_subscribe = false;

	for (j = 0; taskid < step_layout->task_cnt; j++) { /* cycle counter */
		bool space_remaining = false;
		for (i = 0; ((i < step_layout->node_cnt)
			     && (taskid < step_layout->task_cnt)); i++) {
			if ((j < cpus[i]) || over_subscribe) {
				xrealloc(step_layout->tids[i],
					 sizeof(uint32_t) *
					 (step_layout->tasks[i] + 1));
				step_layout->tids[i][step_layout->tasks[i]] =
					taskid;
				taskid++;
				step_layout->tasks[i]++;
				if ((j + 1) < cpus[i])
					space_remaining = true;
			}
		}
		if (!space_remaining)
			over_subscribe = true;
	}
	return SLURM_SUCCESS;
}

/*
 * The plane distribution results in a block cyclic of block size
 * "plane_size".
 * To effectively deal with heterogeneous nodes, we fake a cyclic
 * distribution to figure out how many tasks go on each node and
 * then make the assignments of task numbers to nodes using the
 * user-specified plane size.
 * For example:
 *	plane_size = 2, #tasks = 6, #nodes = 3
 *
 * Node#:                Node0  Node1  Node2
 *                       -----  -----  -----
 * # of allocated CPUs:    4      1      1
 *
 * task distribution:    0  1    2      3
 *                       4  5
 */
static int _task_layout_plane(slurm_step_layout_t *step_layout,
			      uint16_t *cpus)
{
	int i, j, k, taskid = 0;
	bool over_subscribe = false;
	uint32_t cur_task[step_layout->node_cnt];

	debug3("_task_layout_plane plane_size %u node_cnt %u task_cnt %u",
	       step_layout->plane_size,
	       step_layout->node_cnt, step_layout->task_cnt);

	if (step_layout->plane_size <= 0)
		return SLURM_ERROR;

	if (step_layout->tasks == NULL)
		return SLURM_ERROR;

	/* figure out how many tasks go to each node */
	for (j = 0; taskid < step_layout->task_cnt; j++) { /* cycle counter */
		bool space_remaining = false;
		for (i = 0; ((i < step_layout->node_cnt)
			     && (taskid < step_layout->task_cnt)); i++) {
			if ((j < cpus[i]) || over_subscribe) {
				taskid++;
				step_layout->tasks[i]++;
				if ((j + 1) < cpus[i])
					space_remaining = true;
			}
		}
		if (!space_remaining)
			over_subscribe = true;
	}

	/* now distribute the tasks */
	taskid = 0;
	for (i = 0; i < step_layout->node_cnt; i++) {
		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
					       * step_layout->tasks[i]);
		cur_task[i] = 0;
	}
	for (j = 0; taskid < step_layout->task_cnt; j++) { /* cycle counter */
		for (i = 0; ((i < step_layout->node_cnt)
			     && (taskid < step_layout->task_cnt)); i++) {
			/* assign a block of 'plane_size' tasks to this node */
			for (k = 0; ((k < step_layout->plane_size)
				     && (cur_task[i] < step_layout->tasks[i])
				     && (taskid < step_layout->task_cnt));
			     k++) {
				step_layout->tids[i][cur_task[i]] = taskid;
				taskid++;
				cur_task[i]++;
			}
		}
	}

	if (taskid != step_layout->task_cnt) {
		error("_task_layout_plane: Mismatch in task count (%d != %d)",
		      taskid, step_layout->task_cnt);
		return SLURM_ERROR;
	}

#if(0)
	/* debugging only */
	for (i = 0; i < step_layout->node_cnt; i++) {
		info("tasks[%d]: %u", i, step_layout->tasks[i]);
	}

	for (i = 0; i < step_layout->node_cnt; i++) {
		info("Host %d _plane_ # of tasks %u",
		     i, step_layout->tasks[i]);
		for (j = 0; j < step_layout->tasks[i]; j++) {
			info("Host %d _plane_ localid %d taskid %u",
			     i, j, step_layout->tids[i][j]);
		}
	}
#endif

	return SLURM_SUCCESS;
}

extern char *slurm_step_layout_type_name(task_dist_states_t task_dist)
{
	switch (task_dist) {
	case SLURM_DIST_CYCLIC:
		return "Cyclic";
		break;
	case SLURM_DIST_BLOCK:		/* distribute tasks filling node by
					 * node */
		return "Block";
		break;
	case SLURM_DIST_ARBITRARY:	/* arbitrary task distribution */
		return "Arbitrary";
		break;
	case SLURM_DIST_PLANE:		/* distribute tasks by filling up
					 * planes of lllp first and then by
					 * going across the nodes. See
					 * documentation for more
					 * information */
		return "Plane";
		break;
	case SLURM_DIST_CYCLIC_CYCLIC:	/* distribute tasks 1 per node:
					 * round robin: same for lowest
					 * level of logical processor (lllp) */
		return "CCyclic";
		break;
	case SLURM_DIST_CYCLIC_BLOCK:	/* cyclic for node and block for
					 * lllp */
		return "CBlock";
		break;
	case SLURM_DIST_BLOCK_CYCLIC:	/* block for node and cyclic for
					 * lllp */
		return "BCyclic";
		break;
	case SLURM_DIST_BLOCK_BLOCK:	/* block for node and block for
					 * lllp */
		return "BBlock";
		break;
	case SLURM_NO_LLLP_DIST:	/* No distribution specified for
					 * lllp */
	case SLURM_DIST_UNKNOWN:
	default:
		return "Unknown";
	}
}