
/src/plugins/task/cgroup/task_cgroup_cpuset.c

https://github.com/cfenoy/slurm
C | 931 lines | 626 code | 110 blank | 195 comment | 152 complexity | b77da860ae2ff5382251ccfe16bc6bba MD5
Possible License(s): GPL-2.0, AGPL-1.0
  1. /***************************************************************************** \
  2. * task_cgroup_cpuset.c - cpuset cgroup subsystem for task/cgroup
  3. *****************************************************************************
  4. * Copyright (C) 2009 CEA/DAM/DIF
  5. * Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
  6. * Portions copyright (C) 2012 Bull
  7. * Written by Martin Perry <martin.perry@bull.com>
  8. *
  9. * This file is part of SLURM, a resource management program.
  10. * For details, see <http://www.schedmd.com/slurmdocs/>.
  11. * Please also read the included file: DISCLAIMER.
  12. *
  13. * SLURM is free software; you can redistribute it and/or modify it under
  14. * the terms of the GNU General Public License as published by the Free
  15. * Software Foundation; either version 2 of the License, or (at your option)
  16. * any later version.
  17. *
  18. * In addition, as a special exception, the copyright holders give permission
  19. * to link the code of portions of this program with the OpenSSL library under
  20. * certain conditions as described in each individual source file, and
  21. * distribute linked combinations including the two. You must obey the GNU
  22. * General Public License in all respects for all of the code used other than
  23. * OpenSSL. If you modify file(s) with this exception, you may extend this
  24. * exception to your version of the file(s), but you are not obligated to do
  25. * so. If you do not wish to do so, delete this exception statement from your
  26. * version. If you delete this exception statement from all source files in
  27. * the program, then also delete it here.
  28. *
  29. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  30. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  31. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  32. * details.
  33. *
  34. * You should have received a copy of the GNU General Public License along
  35. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  36. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  37. \*****************************************************************************/
  38. #if HAVE_CONFIG_H
  39. #include "config.h"
  40. #endif
  41. #define _GNU_SOURCE
  42. #include <sched.h>
  43. #include <sys/types.h>
  44. #include "slurm/slurm_errno.h"
  45. #include "slurm/slurm.h"
  46. #include "src/slurmd/slurmstepd/slurmstepd_job.h"
  47. #include "src/slurmd/slurmd/slurmd.h"
  48. #include "src/common/cpu_frequency.h"
  49. #include "src/common/bitstring.h"
  50. #include "src/common/xstring.h"
  51. #include "src/common/xcgroup_read_config.h"
  52. #include "src/common/xcgroup.h"
  53. #include "task_cgroup.h"
  54. #ifdef HAVE_HWLOC
  55. #include <hwloc.h>
  56. #include <hwloc/glibc-sched.h>
  57. # if HWLOC_API_VERSION <= 0x00010000
  58. /* After this version the cpuset structure and all of its functions
  59. * changed to bitmaps. To keep working with older hwloc releases we
  60. * simply map the new bitmap names onto the old cpuset calls here,
  61. * rather than scattering ifdef's throughout the code.
  62. */
  63. typedef hwloc_cpuset_t hwloc_bitmap_t;
  64. static inline hwloc_bitmap_t hwloc_bitmap_alloc(void)
  65. {
  66. return hwloc_cpuset_alloc();
  67. }
  68. static inline void hwloc_bitmap_free(hwloc_bitmap_t bitmap)
  69. {
  70. hwloc_cpuset_free(bitmap);
  71. }
  72. static inline void hwloc_bitmap_or(
  73. hwloc_bitmap_t res, hwloc_bitmap_t bitmap1, hwloc_bitmap_t bitmap2)
  74. {
  75. hwloc_cpuset_or(res, bitmap1, bitmap2);
  76. }
  77. static inline int hwloc_bitmap_asprintf(char **str, hwloc_bitmap_t bitmap)
  78. {
  79. return hwloc_cpuset_asprintf(str, bitmap);
  80. }
  81. # endif
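/*
 * Note on the wrappers above: the rest of this file only uses the
 * bitmap-style names, so on an old hwloc a call such as
 * hwloc_bitmap_alloc() simply resolves to hwloc_cpuset_alloc()
 * through the static inline shims (illustrative note, the mapping
 * is one-to-one).
 */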
  82. #endif
  83. #ifndef PATH_MAX
  84. #define PATH_MAX 256
  85. #endif
  86. static char user_cgroup_path[PATH_MAX];
  87. static char job_cgroup_path[PATH_MAX];
  88. static char jobstep_cgroup_path[PATH_MAX];
  89. static xcgroup_ns_t cpuset_ns;
  90. static xcgroup_t user_cpuset_cg;
  91. static xcgroup_t job_cpuset_cg;
  92. static xcgroup_t step_cpuset_cg;
  93. static int _xcgroup_cpuset_init(xcgroup_t* cg);
  94. /*
  95. * convert an abstract (logical) core range into the machine (physical) one
  96. */
  97. static int _abs_to_mac(char* lrange, char** prange)
  98. {
  99. static int total_cores = -1, total_cpus = -1;
  100. bitstr_t* absmap = NULL;
  101. bitstr_t* macmap = NULL;
  102. int icore, ithread;
  103. int absid, macid;
  104. int rc = SLURM_SUCCESS;
  105. if (total_cores == -1) {
  106. total_cores = conf->sockets * conf->cores;
  107. total_cpus = conf->block_map_size;
  108. }
  109. /* allocate bitmap */
  110. absmap = bit_alloc(total_cores);
  111. macmap = bit_alloc(total_cpus);
  112. if (!absmap || !macmap) {
  113. rc = SLURM_ERROR;
  114. goto end_it;
  115. }
  116. /* string to bitmap conversion */
  117. if (bit_unfmt(absmap, lrange)) {
  118. rc = SLURM_ERROR;
  119. goto end_it;
  120. }
  121. /* mapping abstract id to machine id using conf->block_map */
  122. for (icore = 0; icore < total_cores; icore++) {
  123. if (bit_test(absmap, icore)) {
  124. for (ithread = 0; ithread<conf->threads; ithread++) {
  125. absid = icore*conf->threads + ithread;
  126. absid %= total_cpus;
  127. macid = conf->block_map[absid];
  128. macid %= total_cpus;
  129. bit_set(macmap, macid);
  130. }
  131. }
  132. }
  133. /* convert machine cpu bitmap to range string */
  134. *prange = (char*)xmalloc(total_cpus*6);
  135. bit_fmt(*prange, total_cpus*6, macmap);
  136. /* free unused bitmaps */
  137. end_it:
  138. FREE_NULL_BITMAP(absmap);
  139. FREE_NULL_BITMAP(macmap);
  140. if (rc != SLURM_SUCCESS)
  141. info("_abs_to_mac failed");
  142. return rc;
  143. }
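/*
 * A worked example of the mapping above, assuming a hypothetical node with
 * 2 sockets x 2 cores x 2 threads (total_cores = 4, total_cpus = 8) and a
 * block_map of {0,4,1,5,2,6,3,7}:
 *   abstract core range "0,2"
 *     icore = 0 -> absids 0,1 -> machine cpus block_map[0]=0, block_map[1]=4
 *     icore = 2 -> absids 4,5 -> machine cpus block_map[4]=2, block_map[5]=6
 *   so the machine range written to *prange is "0,2,4,6".
 * The block_map values are illustrative; the real map comes from slurmd's
 * hardware detection.
 */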
  144. /* When the cpuset subsystem is configured, at least
  145. * cpuset.cpus and cpuset.mems must be set or the cgroup
  146. * will not be usable at all.
  147. * We duplicate the ancestor configuration in the init step. */
  148. static int _xcgroup_cpuset_init(xcgroup_t* cg)
  149. {
  150. int fstatus,i;
  151. char* cpuset_metafiles[] = {
  152. "cpuset.cpus",
  153. "cpuset.mems"
  154. };
  155. char* cpuset_meta;
  156. char* cpuset_conf;
  157. size_t csize;
  158. xcgroup_t acg;
  159. char* acg_name;
  160. char* p;
  161. fstatus = XCGROUP_ERROR;
  162. /* load ancestor cg */
  163. acg_name = (char*) xstrdup(cg->name);
  164. p = rindex(acg_name,'/');
  165. if (p == NULL) {
  166. debug2("task/cgroup: unable to get ancestor path for "
  167. "cpuset cg '%s' : %m",cg->path);
  168. return fstatus;
  169. } else
  170. *p = '\0';
  171. if (xcgroup_load(cg->ns,&acg, acg_name) != XCGROUP_SUCCESS) {
  172. debug2("task/cgroup: unable to load ancestor for "
  173. "cpuset cg '%s' : %m",cg->path);
  174. return fstatus;
  175. }
  176. /* inherits ancestor params */
  177. for (i = 0 ; i < 2 ; i++) {
  178. cpuset_meta = cpuset_metafiles[i];
  179. if (xcgroup_get_param(&acg,cpuset_meta,
  180. &cpuset_conf,&csize)
  181. != XCGROUP_SUCCESS) {
  182. debug2("task/cgroup: assuming no cpuset cg "
  183. "support for '%s'",acg.path);
  184. xcgroup_destroy(&acg);
  185. return fstatus;
  186. }
  187. if (csize > 0)
  188. cpuset_conf[csize-1]='\0';
  189. if (xcgroup_set_param(cg,cpuset_meta,cpuset_conf)
  190. != XCGROUP_SUCCESS) {
  191. debug2("task/cgroup: unable to write %s configuration "
  192. "(%s) for cpuset cg '%s'",cpuset_meta,
  193. cpuset_conf,cg->path);
  194. xcgroup_destroy(&acg);
  195. xfree(cpuset_conf);
  196. return fstatus;
  197. }
  198. xfree(cpuset_conf);
  199. }
  200. xcgroup_destroy(&acg);
  201. return XCGROUP_SUCCESS;
  202. }
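/*
 * For illustration, assuming the ancestor cgroup exposes cpuset.cpus = "0-15"
 * and cpuset.mems = "0-1" (hypothetical values), the routine above copies
 * those two values into the freshly created child so that tasks can be
 * attached to it; callers later overwrite cpuset.cpus with the job's or
 * step's actual allocation where appropriate.
 */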
  203. #ifdef HAVE_HWLOC
  204. /*
  205. * Add cpuset for an object to the total cpuset for a task, using the
  206. * appropriate ancestor object cpuset if necessary
  207. *
  208. * obj = object to add
  209. * cpuset = cpuset for task
  210. */
  211. static void _add_cpuset(
  212. hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype,
  213. hwloc_obj_t obj, uint32_t taskid, int bind_verbose,
  214. hwloc_bitmap_t cpuset)
  215. {
  216. struct hwloc_obj *pobj;
  217. /* if the requested binding level is coarser than the detected granularity */
  218. /* use the ancestor cpuset instead of the object's own cpuset */
  219. if (hwloc_compare_types(hwtype,req_hwtype) > 0) {
  220. /* Walk up to the ancestor of type req_hwtype, or to the */
  221. /* first one above it if none is found (meaning of >0) */
  222. /* (useful for ldoms binding on machines without NUMA nodes) */
  223. pobj = obj->parent;
  224. while (pobj != NULL &&
  225. hwloc_compare_types(pobj->type, req_hwtype) > 0)
  226. pobj = pobj->parent;
  227. if (pobj != NULL) {
  228. if (bind_verbose)
  229. info("task/cgroup: task[%u] higher level %s "
  230. "found", taskid,
  231. hwloc_obj_type_string(pobj->type));
  232. hwloc_bitmap_or(cpuset, cpuset, pobj->allowed_cpuset);
  233. } else {
  234. /* should not be executed */
  235. if (bind_verbose)
  236. info("task/cgroup: task[%u] no higher level "
  237. "found", taskid);
  238. hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
  239. }
  240. } else
  241. hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
  242. }
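/*
 * Example of the ancestor case, with hypothetical values: if the detected
 * granularity is the core (hwtype = HWLOC_OBJ_CORE) but the user asked for
 * socket binding (req_hwtype = HWLOC_OBJ_SOCKET), the parent socket of the
 * selected core is looked up and its allowed_cpuset (say PUs 0-7) is OR'ed
 * into the task cpuset instead of only the core's own PUs (say 0-1).
 */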
  243. /*
  244. * Distribute cpus to the task using cyclic distribution across sockets
  245. */
  246. static int _task_cgroup_cpuset_dist_cyclic(
  247. hwloc_topology_t topology, hwloc_obj_type_t hwtype,
  248. hwloc_obj_type_t req_hwtype, slurmd_job_t *job, int bind_verbose,
  249. hwloc_bitmap_t cpuset)
  250. {
  251. hwloc_obj_t obj;
  252. uint32_t *obj_idx;
  253. uint32_t i, sock_idx, npskip, npdist, nsockets;
  254. uint32_t taskid = job->envtp->localid;
  255. if (bind_verbose)
  256. info("task/cgroup: task[%u] using cyclic distribution, "
  257. "task_dist %u", taskid, job->task_dist);
  258. nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
  259. HWLOC_OBJ_SOCKET);
  260. obj_idx = xmalloc(nsockets * sizeof(uint32_t));
  261. if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
  262. /* cores or threads granularity */
  263. npskip = taskid * job->cpus_per_task;
  264. npdist = job->cpus_per_task;
  265. } else {
  266. /* sockets or ldoms granularity */
  267. npskip = taskid;
  268. npdist = 1;
  269. }
  270. /* skip objs for lower taskids */
  271. i = 0;
  272. sock_idx = 0;
  273. while (i < npskip) {
  274. while ((sock_idx < nsockets) && (i < npskip)) {
  275. obj = hwloc_get_obj_below_by_type(
  276. topology, HWLOC_OBJ_SOCKET, sock_idx,
  277. hwtype, obj_idx[sock_idx]);
  278. if (obj != NULL) {
  279. obj_idx[sock_idx]++;
  280. i++;
  281. }
  282. sock_idx++;
  283. }
  284. if (i < npskip)
  285. sock_idx = 0;
  286. }
  287. /* distribute objs cyclically across sockets */
  288. i = npdist;
  289. while (i > 0) {
  290. while ((sock_idx < nsockets) && (i > 0)) {
  291. obj = hwloc_get_obj_below_by_type(
  292. topology, HWLOC_OBJ_SOCKET, sock_idx,
  293. hwtype, obj_idx[sock_idx]);
  294. if (obj != NULL) {
  295. obj_idx[sock_idx]++;
  296. _add_cpuset(hwtype, req_hwtype, obj, taskid,
  297. bind_verbose, cpuset);
  298. i--;
  299. }
  300. sock_idx++;
  301. }
  302. sock_idx = 0;
  303. }
  304. xfree(obj_idx);
  305. return XCGROUP_SUCCESS;
  306. }
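/*
 * A sketch of the cyclic distribution above, assuming a hypothetical node
 * with 2 sockets x 4 cores, core granularity and cpus_per_task = 2:
 *   task 0 skips nothing and gets socket0/core0 and socket1/core0
 *   task 1 skips those two objects and gets socket0/core1 and socket1/core1
 * i.e. consecutive objects for a task are spread round-robin across sockets.
 */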
  307. /*
  308. * Distribute cpus to the task using block distribution
  309. */
  310. static int _task_cgroup_cpuset_dist_block(
  311. hwloc_topology_t topology, hwloc_obj_type_t hwtype,
  312. hwloc_obj_type_t req_hwtype, uint32_t nobj,
  313. slurmd_job_t *job, int bind_verbose, hwloc_bitmap_t cpuset)
  314. {
  315. hwloc_obj_t obj;
  316. uint32_t i, pfirst,plast;
  317. uint32_t taskid = job->envtp->localid;
  318. int hwdepth;
  319. if (bind_verbose)
  320. info("task/cgroup: task[%u] using block distribution, "
  321. "task_dist %u", taskid, job->task_dist);
  322. if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
  323. /* cores or threads granularity */
  324. pfirst = taskid * job->cpus_per_task ;
  325. plast = pfirst + job->cpus_per_task - 1;
  326. } else {
  327. /* sockets or ldoms granularity */
  328. pfirst = taskid;
  329. plast = pfirst;
  330. }
  331. hwdepth = hwloc_get_type_depth(topology,hwtype);
  332. for (i = pfirst; i <= plast && i < nobj ; i++) {
  333. obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i);
  334. _add_cpuset(hwtype, req_hwtype, obj, taskid, bind_verbose,
  335. cpuset);
  336. }
  337. return XCGROUP_SUCCESS;
  338. }
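/*
 * A sketch of the block distribution above, with the same hypothetical
 * 2 sockets x 4 cores node, core granularity and cpus_per_task = 2:
 * hwloc enumerates cores at a given depth socket by socket, so
 *   task 0 gets cores 0-1 of socket 0
 *   task 1 gets cores 2-3 of socket 0
 *   task 2 gets cores 0-1 of socket 1, and so on
 * i.e. tasks are packed onto a socket before the next one is used.
 */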
  339. #endif
  340. extern int task_cgroup_cpuset_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
  341. {
  342. /* initialize user/job/jobstep cgroup relative paths */
  343. user_cgroup_path[0]='\0';
  344. job_cgroup_path[0]='\0';
  345. jobstep_cgroup_path[0]='\0';
  346. /* initialize cpuset cgroup namespace */
  347. if (xcgroup_ns_create(slurm_cgroup_conf, &cpuset_ns, "", "cpuset")
  348. != XCGROUP_SUCCESS) {
  349. error("task/cgroup: unable to create cpuset namespace");
  350. return SLURM_ERROR;
  351. }
  352. return SLURM_SUCCESS;
  353. }
  354. extern int task_cgroup_cpuset_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
  355. {
  356. if (user_cgroup_path[0] != '\0')
  357. xcgroup_destroy(&user_cpuset_cg);
  358. if (job_cgroup_path[0] != '\0')
  359. xcgroup_destroy(&job_cpuset_cg);
  360. if (jobstep_cgroup_path[0] != '\0')
  361. xcgroup_destroy(&step_cpuset_cg);
  362. user_cgroup_path[0]='\0';
  363. job_cgroup_path[0]='\0';
  364. jobstep_cgroup_path[0]='\0';
  365. xcgroup_ns_destroy(&cpuset_ns);
  366. return SLURM_SUCCESS;
  367. }
  368. extern int task_cgroup_cpuset_create(slurmd_job_t *job)
  369. {
  370. int rc;
  371. int fstatus = SLURM_ERROR;
  372. xcgroup_t cpuset_cg;
  373. uint32_t jobid = job->jobid;
  374. uint32_t stepid = job->stepid;
  375. uid_t uid = job->uid;
  376. uid_t gid = job->gid;
  377. char* user_alloc_cores = NULL;
  378. char* job_alloc_cores = NULL;
  379. char* step_alloc_cores = NULL;
  380. char* cpus = NULL;
  381. size_t cpus_size;
  382. char* slurm_cgpath ;
  383. xcgroup_t slurm_cg;
  384. /* create slurm root cg in this cg namespace */
  385. slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
  386. if ( slurm_cgpath == NULL ) {
  387. return SLURM_ERROR;
  388. }
  389. /* check that this cgroup has cpus allowed or initialize them */
  390. if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath)
  391. != XCGROUP_SUCCESS) {
  392. error("task/cgroup: unable to load slurm cpuset xcgroup");
  393. xfree(slurm_cgpath);
  394. return SLURM_ERROR;
  395. }
  396. rc = xcgroup_get_param(&slurm_cg,"cpuset.cpus",&cpus,&cpus_size);
  397. if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
  398. /* initialize the cpuset as it did not exist yet */
  399. if (_xcgroup_cpuset_init(&slurm_cg) !=
  400. XCGROUP_SUCCESS) {
  401. xfree(slurm_cgpath);
  402. xcgroup_destroy(&slurm_cg);
  403. return SLURM_ERROR;
  404. }
  405. }
  406. xfree(cpus);
  407. /* build user cgroup relative path if not set (should not be) */
  408. if (*user_cgroup_path == '\0') {
  409. if (snprintf(user_cgroup_path, PATH_MAX,
  410. "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
  411. error("unable to build uid %u cgroup relative "
  412. "path : %m", uid);
  413. xfree(slurm_cgpath);
  414. return SLURM_ERROR;
  415. }
  416. }
  417. xfree(slurm_cgpath);
  418. /* build job cgroup relative path if not already set (it should not be) */
  419. if (*job_cgroup_path == '\0') {
  420. if (snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
  421. user_cgroup_path,jobid) >= PATH_MAX) {
  422. error("task/cgroup: unable to build job %u cpuset "
  423. "cg relative path : %m",jobid);
  424. return SLURM_ERROR;
  425. }
  426. }
  427. /* build job step cgroup relative path if not already set (it should not be) */
  428. if (*jobstep_cgroup_path == '\0') {
  429. if (stepid == NO_VAL) {
  430. if (snprintf(jobstep_cgroup_path, PATH_MAX,
  431. "%s/step_batch", job_cgroup_path)
  432. >= PATH_MAX) {
  433. error("task/cgroup: unable to build job step"
  434. " %u.batch cpuset cg relative path: %m",
  435. jobid);
  436. return SLURM_ERROR;
  437. }
  438. } else {
  439. if (snprintf(jobstep_cgroup_path,
  440. PATH_MAX, "%s/step_%u",
  441. job_cgroup_path, stepid) >= PATH_MAX) {
  442. error("task/cgroup: unable to build job step"
  443. " %u.%u cpuset cg relative path: %m",
  444. jobid, stepid);
  445. return SLURM_ERROR;
  446. }
  447. }
  448. }
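/*
 * For illustration, with the cpuset hierarchy mounted under
 * /sys/fs/cgroup/cpuset, the relative paths built above typically end up
 * as directories like
 *   <slurm_cgpath>/uid_1000/job_42/step_0     (or .../step_batch)
 * the uid, job id and mount point shown here being hypothetical.
 */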
  449. /*
  450. * create cpuset root cg and lock it
  451. *
  452. * we will keep the lock until the end to avoid the effect of a release
  453. * agent that would remove an existing cgroup hierarchy while we are
  454. * setting it up. As soon as the step cgroup is created, we can release
  455. * the lock.
  456. * Indeed, consecutive slurm steps could otherwise have the cgroup removed
  457. * between its next EEXIST instantiation and the first addition of
  458. * a task. The release_agent will have to lock the root cpuset cgroup
  459. * to avoid this scenario.
  460. */
  461. if (xcgroup_create(&cpuset_ns,&cpuset_cg,"",0,0) != XCGROUP_SUCCESS) {
  462. error("task/cgroup: unable to create root cpuset xcgroup");
  463. return SLURM_ERROR;
  464. }
  465. if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
  466. xcgroup_destroy(&cpuset_cg);
  467. error("task/cgroup: unable to lock root cpuset cg");
  468. return SLURM_ERROR;
  469. }
  470. /*
  471. * build job and job steps allocated cores lists
  472. */
  473. debug("task/cgroup: job abstract cores are '%s'",
  474. job->job_alloc_cores);
  475. debug("task/cgroup: step abstract cores are '%s'",
  476. job->step_alloc_cores);
  477. if (_abs_to_mac(job->job_alloc_cores,
  478. &job_alloc_cores) != SLURM_SUCCESS) {
  479. error("task/cgroup: unable to build job physical cores");
  480. goto error;
  481. }
  482. if (_abs_to_mac(job->step_alloc_cores,
  483. &step_alloc_cores) != SLURM_SUCCESS) {
  484. error("task/cgroup: unable to build step physical cores");
  485. goto error;
  486. }
  487. debug("task/cgroup: job physical cores are '%s'",
  488. job_alloc_cores);
  489. debug("task/cgroup: step physical cores are '%s'",
  490. step_alloc_cores);
  491. /*
  492. * create user cgroup in the cpuset ns (it could already exist)
  493. */
  494. if (xcgroup_create(&cpuset_ns,&user_cpuset_cg,
  495. user_cgroup_path,
  496. getuid(),getgid()) != XCGROUP_SUCCESS) {
  497. goto error;
  498. }
  499. if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
  500. xcgroup_destroy(&user_cpuset_cg);
  501. goto error;
  502. }
  503. /*
  504. * check that the user's cpuset cgroup is consistent and add the job cores
  505. */
  506. rc = xcgroup_get_param(&user_cpuset_cg,"cpuset.cpus",&cpus,&cpus_size);
  507. if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
  508. /* initialize the cpuset as it did not exist yet */
  509. if (_xcgroup_cpuset_init(&user_cpuset_cg) !=
  510. XCGROUP_SUCCESS) {
  511. xcgroup_delete(&user_cpuset_cg);
  512. xcgroup_destroy(&user_cpuset_cg);
  513. goto error;
  514. }
  515. }
  516. user_alloc_cores = xstrdup(job_alloc_cores);
  517. if (cpus != NULL && cpus_size > 1) {
  518. cpus[cpus_size-1]='\0';
  519. xstrcat(user_alloc_cores,",");
  520. xstrcat(user_alloc_cores,cpus);
  521. }
  522. xcgroup_set_param(&user_cpuset_cg,"cpuset.cpus",user_alloc_cores);
  523. xfree(cpus);
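/*
 * Illustration of the concatenation above, with hypothetical values: if the
 * user cgroup already allowed cpus "0-3" (e.g. from another running job) and
 * this job's physical cores are "4-7", the value written to cpuset.cpus is
 * "4-7,0-3", i.e. the union of the previous allocation and the new one.
 */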
  524. /*
  525. * create job cgroup in the cpuset ns (it could already exist)
  526. */
  527. if (xcgroup_create(&cpuset_ns,&job_cpuset_cg,
  528. job_cgroup_path,
  529. getuid(),getgid()) != XCGROUP_SUCCESS) {
  530. xcgroup_destroy(&user_cpuset_cg);
  531. goto error;
  532. }
  533. if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
  534. xcgroup_destroy(&user_cpuset_cg);
  535. xcgroup_destroy(&job_cpuset_cg);
  536. goto error;
  537. }
  538. if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
  539. xcgroup_destroy(&user_cpuset_cg);
  540. xcgroup_destroy(&job_cpuset_cg);
  541. goto error;
  542. }
  543. xcgroup_set_param(&job_cpuset_cg,"cpuset.cpus",job_alloc_cores);
  544. /*
  545. * create step cgroup in the cpuset ns (it should not exist yet)
  546. * use the job user's uid/gid to allow the user to create task
  547. * cgroups inside the step cgroup, which is owned by root
  548. */
  549. if (xcgroup_create(&cpuset_ns,&step_cpuset_cg,
  550. jobstep_cgroup_path,
  551. uid,gid) != XCGROUP_SUCCESS) {
  552. /* do not delete user/job cgroup as */
  553. /* they can exist for other steps */
  554. xcgroup_destroy(&user_cpuset_cg);
  555. xcgroup_destroy(&job_cpuset_cg);
  556. goto error;
  557. }
  558. if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
  559. xcgroup_destroy(&user_cpuset_cg);
  560. xcgroup_destroy(&job_cpuset_cg);
  561. xcgroup_destroy(&step_cpuset_cg);
  562. goto error;
  563. }
  564. if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
  565. xcgroup_destroy(&user_cpuset_cg);
  566. xcgroup_destroy(&job_cpuset_cg);
  567. xcgroup_delete(&step_cpuset_cg);
  568. xcgroup_destroy(&step_cpuset_cg);
  569. goto error;
  570. }
  571. xcgroup_set_param(&step_cpuset_cg,"cpuset.cpus",step_alloc_cores);
  572. /* attach the slurmstepd to the step cpuset cgroup */
  573. pid_t pid = getpid();
  574. rc = xcgroup_add_pids(&step_cpuset_cg,&pid,1);
  575. if (rc != XCGROUP_SUCCESS) {
  576. error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
  577. step_cpuset_cg.path);
  578. fstatus = SLURM_ERROR;
  579. } else
  580. fstatus = SLURM_SUCCESS;
  581. /* validate the requested cpu frequency and set it */
  582. if (job->cpu_freq != NO_VAL) {
  583. cpu_freq_cgroup_validate(job, step_alloc_cores);
  584. }
  585. error:
  586. xcgroup_unlock(&cpuset_cg);
  587. xcgroup_destroy(&cpuset_cg);
  588. xfree(user_alloc_cores);
  589. xfree(job_alloc_cores);
  590. xfree(step_alloc_cores);
  591. return fstatus;
  592. }
  593. extern int task_cgroup_cpuset_attach_task(slurmd_job_t *job)
  594. {
  595. int fstatus = SLURM_ERROR;
  596. /* tasks are automatically attached as slurmstepd is in the step cg */
  597. fstatus = SLURM_SUCCESS;
  598. return fstatus;
  599. }
  600. /* affinity is set using sched_setaffinity so that the user is not */
  601. /* forced to manipulate the cgroup hierarchy to modify it */
  602. extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job)
  603. {
  604. int fstatus = SLURM_ERROR;
  605. #ifndef HAVE_HWLOC
  606. error("task/cgroup: plugin not compiled with hwloc support, "
  607. "skipping affinity.");
  608. return fstatus;
  609. #else
  610. hwloc_obj_type_t socket_or_node;
  611. uint32_t nldoms;
  612. uint32_t nsockets;
  613. uint32_t ncores;
  614. uint32_t npus;
  615. uint32_t nobj;
  616. uint32_t taskid = job->envtp->localid;
  617. uint32_t jntasks = job->node_tasks;
  618. uint32_t jnpus = jntasks * job->cpus_per_task;
  619. pid_t pid = job->envtp->task_pid;
  620. cpu_bind_type_t bind_type;
  621. int bind_verbose = 0;
  622. hwloc_topology_t topology;
  623. hwloc_bitmap_t cpuset;
  624. hwloc_obj_type_t hwtype;
  625. hwloc_obj_type_t req_hwtype;
  626. size_t tssize;
  627. cpu_set_t ts;
  628. bind_type = job->cpu_bind_type ;
  629. if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
  630. bind_type & CPU_BIND_VERBOSE)
  631. bind_verbose = 1 ;
  632. /* Allocate and initialize hwloc objects */
  633. hwloc_topology_init(&topology);
  634. cpuset = hwloc_bitmap_alloc();
  635. hwloc_topology_load(topology);
  636. if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
  637. hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) {
  638. /* One socket contains multiple NUMA-nodes
  639. * like AMD Opteron 6000 series etc.
  640. * In such a case, use the NUMA node instead of the socket. */
  641. socket_or_node = HWLOC_OBJ_NODE;
  642. } else {
  643. socket_or_node = HWLOC_OBJ_SOCKET;
  644. }
  645. if (bind_type & CPU_BIND_NONE) {
  646. if (bind_verbose)
  647. info("task/cgroup: task[%u] is requesting no affinity",
  648. taskid);
  649. return 0;
  650. } else if (bind_type & CPU_BIND_TO_THREADS) {
  651. if (bind_verbose)
  652. info("task/cgroup: task[%u] is requesting "
  653. "thread level binding",taskid);
  654. req_hwtype = HWLOC_OBJ_PU;
  655. } else if (bind_type & CPU_BIND_TO_CORES) {
  656. if (bind_verbose)
  657. info("task/cgroup: task[%u] is requesting "
  658. "core level binding",taskid);
  659. req_hwtype = HWLOC_OBJ_CORE;
  660. } else if (bind_type & CPU_BIND_TO_SOCKETS) {
  661. if (bind_verbose)
  662. info("task/cgroup: task[%u] is requesting "
  663. "socket level binding",taskid);
  664. req_hwtype = socket_or_node;
  665. } else if (bind_type & CPU_BIND_TO_LDOMS) {
  666. if (bind_verbose)
  667. info("task/cgroup: task[%u] is requesting "
  668. "ldom level binding",taskid);
  669. req_hwtype = HWLOC_OBJ_NODE;
  670. } else {
  671. if (bind_verbose)
  672. info("task/cgroup: task[%u] using core level binding"
  673. " by default",taskid);
  674. req_hwtype = HWLOC_OBJ_CORE;
  675. }
  676. /*
  677. * Perform the topology detection. It will only get allowed PUs.
  678. * At the same time, detect the granularity to use for binding.
  679. * The granularity can be relaxed from threads to cores if enough
  680. * cores are available: with hyperthread support, the ntasks-per-core
  681. * parameter can give each task access to more than one thread per
  682. * core.
  683. * Fall back to machine granularity if no finer-grained granularity
  684. * matching the request is found. This will result in no affinity
  685. * being applied.
  686. * The detected granularity will be used to find where to best place
  687. * the task, then the cpu_bind option will be used to relax the
  688. * affinity constraint and use more PUs. (i.e. use a core granularity
  689. * to dispatch the tasks across the sockets and then provide access
  690. * to each task to the cores of its socket.)
  691. */
  692. npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
  693. HWLOC_OBJ_PU);
  694. ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
  695. HWLOC_OBJ_CORE);
  696. nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
  697. socket_or_node);
  698. nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
  699. HWLOC_OBJ_NODE);
  700. hwtype = HWLOC_OBJ_MACHINE;
  701. nobj = 1;
  702. if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
  703. hwtype = HWLOC_OBJ_PU;
  704. nobj = npus;
  705. }
  706. if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
  707. hwtype = HWLOC_OBJ_CORE;
  708. nobj = ncores;
  709. }
  710. if (nsockets >= jntasks &&
  711. bind_type & CPU_BIND_TO_SOCKETS) {
  712. hwtype = socket_or_node;
  713. nobj = nsockets;
  714. }
  715. /*
  716. * HWLOC returns all the NUMA nodes available regardless of the
  717. * number of underlying sockets available (i.e. regardless of the allowed
  718. * resources), so there is no guarantee that each ldom will be populated
  719. * with usable sockets. Add a simple check that at least ensures that
  720. * we have as many sockets as ldoms before moving to ldom granularity
  721. */
  722. if (nldoms >= jntasks &&
  723. nsockets >= nldoms &&
  724. bind_type & CPU_BIND_TO_LDOMS) {
  725. hwtype = HWLOC_OBJ_NODE;
  726. nobj = nldoms;
  727. }
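/*
 * A worked example of the fallthrough above, assuming a hypothetical node
 * with 16 PUs, 8 cores, 2 sockets (= 2 ldoms), 4 tasks and cpus_per_task = 2
 * (jnpus = 8) and no explicit --cpu_bind level:
 *   npus   (16) >= jnpus (8) -> hwtype = PU
 *   ncores (8)  >= jnpus (8) -> hwtype = CORE
 *   sockets/ldoms are only selected when explicitly requested, so the
 *   binding granularity ends up being the core.
 */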
  728. /*
  729. * Bind the detected object to the taskid, respecting the
  730. * granularity, using the designated or default distribution
  731. * method (block or cyclic).
  732. * If not enough objects to do the job, revert to no affinity mode
  733. */
  734. if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) {
  735. info("task/cgroup: task[%u] disabling affinity because of %s "
  736. "granularity",taskid,hwloc_obj_type_string(hwtype));
  737. } else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 &&
  738. jnpus > nobj) {
  739. info("task/cgroup: task[%u] not enough %s objects, disabling "
  740. "affinity",taskid,hwloc_obj_type_string(hwtype));
  741. } else {
  742. char *str;
  743. if (bind_verbose) {
  744. info("task/cgroup: task[%u] using %s granularity",
  745. taskid,hwloc_obj_type_string(hwtype));
  746. }
  747. /* There are two "distributions," controlled by the
  748. * -m option of srun and friends. The first is the
  749. * distribution of tasks to nodes. The second is the
  750. * distribution of allocated cpus to tasks for
  751. * binding. This code is handling the second
  752. * distribution. Here's how the values get set, based
  753. * on the value of -m
  754. *
  755. * SLURM_DIST_CYCLIC = srun -m cyclic
  756. * SLURM_DIST_BLOCK = srun -m block
  757. * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic
  758. * SLURM_DIST_BLOCK_CYCLIC = srun -m block:cyclic
  759. *
  760. * In the first two cases, the user only specified the
  761. * first distribution. The second distribution
  762. * defaults to cyclic. In the second two cases, the
  763. * user explicitly requested a second distribution of
  764. * cyclic. So all these four cases correspond to a
  765. * second distribution of cyclic. So we want to call
  766. * _task_cgroup_cpuset_dist_cyclic.
  767. *
  768. * If the user explicitly specifies a second
  769. * distribution of block, or if
  770. * CR_CORE_DEFAULT_DIST_BLOCK is configured and the
  771. * user does not explicitly specify a second
  772. * distribution of cyclic, the second distribution is
  773. * block, and we need to call
  774. * _task_cgroup_cpuset_dist_block. In these cases,
  775. * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK
  776. * or SLURM_DIST_BLOCK_BLOCK.
  777. *
  778. * You can see the equivalent code for the
  779. * task/affinity plugin in
  780. * src/plugins/task/affinity/dist_tasks.c, around line 384.
  781. */
  782. switch (job->task_dist) {
  783. case SLURM_DIST_CYCLIC:
  784. case SLURM_DIST_BLOCK:
  785. case SLURM_DIST_CYCLIC_CYCLIC:
  786. case SLURM_DIST_BLOCK_CYCLIC:
  787. _task_cgroup_cpuset_dist_cyclic(
  788. topology, hwtype, req_hwtype,
  789. job, bind_verbose, cpuset);
  790. break;
  791. default:
  792. _task_cgroup_cpuset_dist_block(
  793. topology, hwtype, req_hwtype,
  794. nobj, job, bind_verbose, cpuset);
  795. }
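/* For example, "srun -m block:block" sets task_dist = SLURM_DIST_BLOCK_BLOCK,
 * which is not in the case list above and therefore falls through to the
 * default branch, i.e. the block distribution.
 */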
  796. hwloc_bitmap_asprintf(&str, cpuset);
  797. tssize = sizeof(cpu_set_t);
  798. if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset,
  799. &ts,tssize) == 0) {
  800. fstatus = SLURM_SUCCESS;
  801. if (sched_setaffinity(pid,tssize,&ts)) {
  802. error("task/cgroup: task[%u] unable to set "
  803. "taskset '%s'",taskid,str);
  804. fstatus = SLURM_ERROR;
  805. } else if (bind_verbose) {
  806. info("task/cgroup: task[%u] taskset '%s' is set"
  807. ,taskid,str);
  808. }
  809. } else {
  810. error("task/cgroup: task[%u] unable to build "
  811. "taskset '%s'",taskid,str);
  812. fstatus = SLURM_ERROR;
  813. }
  814. free(str);
  815. }
  816. /* Destroy hwloc objects */
  817. hwloc_bitmap_free(cpuset);
  818. hwloc_topology_destroy(topology);
  819. return fstatus;
  820. #endif
  821. }