
/src/plugins/select/cons_res/dist_tasks.c

https://github.com/cfenoy/slurm
/*****************************************************************************\
 *  dist_tasks - Assign task count to {socket,core,thread} or CPU
 *               resources
 *****************************************************************************
 *  Copyright (C) 2006-2008 Hewlett-Packard Development Company, L.P.
 *  Written by Susanne M. Balle, <susanne.balle@hp.com>
 *  CODE-OCEC-09-009. All rights reserved.
 *  Portions copyright (C) 2012 Bull
 *  Written by Martin Perry <martin.perry@bull.com>
 *
 *  This file is part of SLURM, a resource management program.
 *  For details, see <http://www.schedmd.com/slurmdocs/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "select_cons_res.h"
#include "dist_tasks.h"

#if(0)
/* Using CR_SOCKET or CR_SOCKET_MEMORY will not allocate a socket to more
 * than one job at a time, but it also will not grant a job access to more
 * CPUs on the socket than requested. If ALLOCATE_FULL_SOCKET is defined,
 * then a job will be given access to every core on each allocated socket.
 */
#define ALLOCATE_FULL_SOCKET 1
#endif
/* Max boards supported for best-fit across boards */
/* Larger board configurations may require new algorithm */
/* for acceptable performance */
#define MAX_BOARDS 8

/* Combination counts
 * comb_counts[n-1][k-1] = number of combinations of
 * k items from a set of n items
 *
 * Formula is n!/(k!(n-k)!)
 */
uint32_t comb_counts[MAX_BOARDS][MAX_BOARDS] =
        {{1,  0,  0,  0,  0,  0, 0, 0},
         {2,  1,  0,  0,  0,  0, 0, 0},
         {3,  3,  1,  0,  0,  0, 0, 0},
         {4,  6,  4,  1,  0,  0, 0, 0},
         {5, 10, 10,  5,  1,  0, 0, 0},
         {6, 15, 20, 15,  6,  1, 0, 0},
         {7, 21, 35, 35, 21,  7, 1, 0},
         {8, 28, 56, 70, 56, 28, 8, 1}};
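
/* Example: comb_counts[3][1] == 6 is C(4,2), the six pairs shown in
 * the _gen_combs() example below. */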
/* Generate all combinations of k integers from the
 * set of integers 0 to n-1.
 * Return combinations in comb_list.
 *
 * Example: For k = 2 and n = 4, there are six
 * combinations:
 * {0,1},{0,2},{0,3},{1,2},{1,3},{2,3}
 */
void _gen_combs(int *comb_list, int n, int k)
{
        int *comb = xmalloc(k * sizeof(int));
        int i, b;

        /* Setup comb for the initial combination */
        for (i = 0; i < k; ++i)
                comb[i] = i;
        b = 0;

        /* Generate all the other combinations */
        while (1) {
                for (i = 0; i < k; i++) {
                        comb_list[b + i] = comb[i];
                }
                b += k;
                i = k - 1;
                ++comb[i];
                /* Carry into lower positions. Stop at i == 0 rather than
                 * i >= 0 so that the final carry never writes to comb[-1];
                 * the termination test below handles the i == 0 case. */
                while ((i > 0) && (comb[i] >= n - k + 1 + i)) {
                        --i;
                        ++comb[i];
                }
                if (comb[0] > n - k)
                        break;  /* No more combinations */
                for (i = i + 1; i < k; ++i)
                        comb[i] = comb[i - 1] + 1;
        }
        xfree(comb);
}
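
/* Usage note: the caller must size comb_list to hold
 * comb_counts[n-1][k-1] * k ints.  For n = 4 and k = 2 that is
 * 6 * 2 = 12 ints, filled as {0,1, 0,2, 0,3, 1,2, 1,3, 2,3}. */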
/* _compute_c_b_task_dist - compute the number of tasks on each
 * of the nodes for the cyclic and block distributions. We need to do
 * this in the case of consumable resources so that we have an exact
 * count of the needed hardware resources, which will be used later to
 * update the per-node used-resource structures.
 *
 * The most common case is when we have more resources than needed. In
 * that case we just "take" what we need and "release" the remaining
 * resources for other jobs. In the case where we oversubscribe the
 * CPUs/logical processors we keep the initial set of
 * resources.
 *
 * IN/OUT job_ptr - pointer to job being scheduled. The per-node
 *                  job_res->cpus array is recomputed here.
 */
static int _compute_c_b_task_dist(struct job_record *job_ptr)
{
        bool over_subscribe = false;
        uint32_t n, i, tid, maxtasks, l;
        uint16_t *avail_cpus;
        job_resources_t *job_res = job_ptr->job_resrcs;

        if (!job_res || !job_res->cpus) {
                error("cons_res: _compute_c_b_task_dist given NULL job_res");
                return SLURM_ERROR;
        }

        maxtasks = job_res->ncpus;
        avail_cpus = job_res->cpus;
        job_res->cpus = xmalloc(job_res->nhosts * sizeof(uint16_t));

        /* ncpus is already set to the number of tasks if overcommit is
         * used */
        if (!job_ptr->details->overcommit &&
            (job_ptr->details->cpus_per_task > 1)) {
                if (job_ptr->details->ntasks_per_node == 0)
                        maxtasks = maxtasks / job_ptr->details->cpus_per_task;
                else
                        maxtasks = job_ptr->details->ntasks_per_node *
                                   job_res->nhosts;
        }

        /* Safeguard against a request that works out to zero tasks,
         * e.g. when the user specified fewer CPUs than cpus_per_task,
         * or didn't specify a count at all. */
        if (!maxtasks) {
                error("_compute_c_b_task_dist: request was for 0 tasks, "
                      "setting to 1");
                maxtasks = 1;
        }
        if (job_ptr->details->cpus_per_task == 0)
                job_ptr->details->cpus_per_task = 1;

        for (tid = 0, i = job_ptr->details->cpus_per_task; (tid < maxtasks);
             i += job_ptr->details->cpus_per_task) {    /* cycle counter */
                bool space_remaining = false;

                if (over_subscribe) {
                        /* 'over_subscribe' is a relief valve that guards
                         * against an infinite loop, and it *should* never
                         * come into play because maxtasks should never be
                         * greater than the total number of available cpus
                         */
                        error("cons_res: _compute_c_b_task_dist "
                              "oversubscribe");
                }

                for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
                        if ((i <= avail_cpus[n]) || over_subscribe) {
                                tid++;
                                for (l = 0;
                                     l < job_ptr->details->cpus_per_task;
                                     l++) {
                                        if (job_res->cpus[n] < avail_cpus[n])
                                                job_res->cpus[n]++;
                                }
                                if ((i + 1) <= avail_cpus[n])
                                        space_remaining = true;
                        }
                }
                if (!space_remaining)
                        over_subscribe = true;
        }
        xfree(avail_cpus);
        return SLURM_SUCCESS;
}
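
/* Worked example for _compute_c_b_task_dist(): with two nodes,
 * avail_cpus = {2,3}, cpus_per_task = 1 and five tasks to place
 * (maxtasks = 5), the cyclic loop fills job_res->cpus pass by pass:
 * {1,1}, then {2,2}, and the last task lands on node 1 only, giving
 * a final {2,3}. */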
/* distribute blocks (planes) of tasks cyclically */
static int _compute_plane_dist(struct job_record *job_ptr)
{
        bool over_subscribe = false;
        uint32_t n, i, p, tid, maxtasks, l;
        uint16_t *avail_cpus, plane_size = 1;
        job_resources_t *job_res = job_ptr->job_resrcs;

        if (!job_res || !job_res->cpus) {
                error("cons_res: _compute_plane_dist given NULL job_res");
                return SLURM_ERROR;
        }

        maxtasks = job_res->ncpus;
        avail_cpus = job_res->cpus;
        if (job_ptr->details->cpus_per_task > 1)
                maxtasks = maxtasks / job_ptr->details->cpus_per_task;
        if (job_ptr->details && job_ptr->details->mc_ptr)
                plane_size = job_ptr->details->mc_ptr->plane_size;
        if (plane_size <= 0) {
                error("cons_res: _compute_plane_dist received invalid "
                      "plane_size");
                return SLURM_ERROR;
        }

        job_res->cpus = xmalloc(job_res->nhosts * sizeof(uint16_t));
        for (tid = 0, i = 0; (tid < maxtasks); i++) {   /* cycle counter */
                bool space_remaining = false;

                if (over_subscribe) {
                        /* 'over_subscribe' is a relief valve that guards
                         * against an infinite loop, and it *should* never
                         * come into play because maxtasks should never be
                         * greater than the total number of available cpus
                         */
                        error("cons_res: _compute_plane_dist oversubscribe");
                }

                for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
                        for (p = 0; p < plane_size && (tid < maxtasks); p++) {
                                if ((job_res->cpus[n] < avail_cpus[n]) ||
                                    over_subscribe) {
                                        tid++;
                                        for (l = 0;
                                             l < job_ptr->details->cpus_per_task;
                                             l++) {
                                                if (job_res->cpus[n] <
                                                    avail_cpus[n])
                                                        job_res->cpus[n]++;
                                        }
                                }
                        }
                        if (job_res->cpus[n] < avail_cpus[n])
                                space_remaining = true;
                }
                if (!space_remaining)
                        over_subscribe = true;
        }
        xfree(avail_cpus);
        return SLURM_SUCCESS;
}
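
/* Worked example for _compute_plane_dist(): with plane_size = 2,
 * avail_cpus = {4,4}, cpus_per_task = 1 and six tasks to place, each
 * pass hands out blocks of two tasks per node: pass 1 yields
 * cpus = {2,2} (tasks 0-1 on node 0, tasks 2-3 on node 1) and pass 2
 * places the final block on node 0, giving {4,2}. */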
/* sync up core bitmap with new CPU count using a best-fit approach
 * on the available resources on each node
 *
 * "Best-fit" means:
 * 1st priority: Use smallest number of boards with sufficient
 *               available CPUs
 * 2nd priority: Use smallest number of sockets with sufficient
 *               available CPUs
 * 3rd priority: Use board combination with the smallest number
 *               of available CPUs
 * 4th priority: Use higher-numbered boards/sockets/cores first
 *
 * The CPU array contains the distribution of CPUs, which can include
 * virtual CPUs (hyperthreads)
 */
static void _block_sync_core_bitmap(struct job_record *job_ptr,
                                    const uint16_t cr_type)
{
        uint32_t c, s, i, j, n, b, z, size, csize, core_cnt;
        uint16_t cpus, num_bits, vpus = 1;
        job_resources_t *job_res = job_ptr->job_resrcs;
        bool alloc_cores = false, alloc_sockets = false;
        uint16_t ntasks_per_core = 0xffff;
        int count, cpu_min, b_min, elig, s_min, comb_idx, sock_idx;
        int elig_idx, comb_brd_idx, sock_list_idx, comb_min, board_num;
        int *boards_cpu_cnt;
        int *sort_brds_cpu_cnt;
        int *sockets_cpu_cnt;
        int *board_combs;
        int *socket_list;
        int *elig_brd_combs;
        int *elig_cpu_cnt;
        bool *sockets_used;
        uint16_t boards_nb;
        uint16_t nboards_nb;
        uint16_t sockets_nb;
        uint16_t ncores_nb;
        uint16_t nsockets_nb;
        uint16_t sock_per_brd;
        uint16_t sock_per_comb;
        uint16_t req_cpus, best_fit_cpus = 0;
        uint32_t best_fit_location = 0;
        uint64_t ncomb_brd;
        bool sufficient, best_fit_sufficient;

        /* qsort compare function for ascending int list */
        int _cmp_int_ascend(const void *a, const void *b)
        {
                return (*(int *)a - *(int *)b);
        }
        /* qsort compare function for descending int list */
        int _cmp_int_descend(const void *a, const void *b)
        {
                return (*(int *)b - *(int *)a);
        }
        /* qsort compare function for board combination socket list */
        int _cmp_sock(const void *a, const void *b)
        {
                return (sockets_cpu_cnt[*(int *)b] -
                        sockets_cpu_cnt[*(int *)a]);
        }
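
        /* Note: the three comparators above are nested functions, a GCC
         * extension; this lets _cmp_sock() read sockets_cpu_cnt from the
         * enclosing scope, at the cost of requiring GCC or a compatible
         * compiler. */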
        if (!job_res)
                return;

        if (cr_type & CR_CORE)
                alloc_cores = true;
#ifdef ALLOCATE_FULL_SOCKET
        if (cr_type & CR_SOCKET)
                alloc_sockets = true;
#else
        if (cr_type & CR_SOCKET)
                alloc_cores = true;
#endif

        if (job_ptr->details && job_ptr->details->mc_ptr) {
                multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
                if (mc_ptr->ntasks_per_core) {
                        ntasks_per_core = mc_ptr->ntasks_per_core;
                }
                if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
                    (mc_ptr->threads_per_core < ntasks_per_core)) {
                        ntasks_per_core = mc_ptr->threads_per_core;
                }
        }
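
        /* ntasks_per_core now holds the tighter of the two per-core task
         * limits.  It caps vpus below, so that, for example, a node with
         * 2 hardware threads per core but ntasks_per_core = 1 gets
         * vpus = 1 and each counted CPU maps onto a distinct core. */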
        size = bit_size(job_res->node_bitmap);
        csize = bit_size(job_res->core_bitmap);

        sockets_nb = select_node_record[0].sockets;
        sockets_cpu_cnt = xmalloc(sockets_nb * sizeof(int));
        sockets_used = xmalloc(sockets_nb * sizeof(bool));
        boards_nb = select_node_record[0].boards;
        boards_cpu_cnt = xmalloc(boards_nb * sizeof(int));
        sort_brds_cpu_cnt = xmalloc(boards_nb * sizeof(int));

        for (c = 0, i = 0, n = 0; n < size; n++) {
                if (bit_test(job_res->node_bitmap, n) == 0)
                        continue;

                core_cnt = 0;
                ncores_nb = select_node_record[n].cores;
                nsockets_nb = select_node_record[n].sockets;
                nboards_nb = select_node_record[n].boards;
                num_bits = nsockets_nb * ncores_nb;

                if ((c + num_bits) > csize)
                        fatal("cons_res: _block_sync_core_bitmap index error");

                cpus = job_res->cpus[i];
                vpus = MIN(select_node_record[n].vpus, ntasks_per_core);

                /* compute the number of cores still required on the node */
                req_cpus = cpus / vpus;
                if (cpus % vpus)
                        req_cpus++;

                if (nboards_nb > MAX_BOARDS) {
                        debug3("cons_res: node[%u]: exceeds max boards; "
                               "doing best-fit across sockets only", n);
                        nboards_nb = 1;
                }

                if (nsockets_nb > sockets_nb) {
                        sockets_nb = nsockets_nb;
                        xrealloc(sockets_cpu_cnt, sockets_nb * sizeof(int));
                        xrealloc(sockets_used, sockets_nb * sizeof(bool));
                }

                if (nboards_nb > boards_nb) {
                        boards_nb = nboards_nb;
                        xrealloc(boards_cpu_cnt, boards_nb * sizeof(int));
                        xrealloc(sort_brds_cpu_cnt, boards_nb * sizeof(int));
                }

                /* Count available cores on each socket and board */
                sock_per_brd = nsockets_nb / nboards_nb;
                for (b = 0; b < nboards_nb; b++) {
                        boards_cpu_cnt[b] = 0;
                        sort_brds_cpu_cnt[b] = 0;
                }
                for (s = 0; s < nsockets_nb; s++) {
                        sockets_cpu_cnt[s] = 0;
                        sockets_used[s] = false;
                        b = s / sock_per_brd;
                        for (j = c + (s * ncores_nb);
                             j < c + ((s + 1) * ncores_nb); j++) {
                                if (bit_test(job_res->core_bitmap, j)) {
                                        sockets_cpu_cnt[s]++;
                                        boards_cpu_cnt[b]++;
                                        sort_brds_cpu_cnt[b]++;
                                }
                        }
                }

                /* Sort boards in descending order of available core count */
                qsort(sort_brds_cpu_cnt, nboards_nb, sizeof(int),
                      _cmp_int_descend);
                /* Determine minimum number of boards required for the
                 * allocation (b_min) */
                count = 0;
                for (b = 0; b < nboards_nb; b++) {
                        count += sort_brds_cpu_cnt[b];
                        if (count >= req_cpus)
                                break;
                }
                b_min = b + 1;
                sock_per_comb = b_min * sock_per_brd;

                /* Allocate space for list of board combinations */
                ncomb_brd = comb_counts[nboards_nb - 1][b_min - 1];
                board_combs = xmalloc(ncomb_brd * b_min * sizeof(int));
                /* Generate all combinations of b_min boards on the node */
                _gen_combs(board_combs, nboards_nb, b_min);

                /* Determine which combinations have enough available cores
                 * for the allocation (eligible board combinations)
                 */
                elig_brd_combs = xmalloc(ncomb_brd * sizeof(int));
                elig_cpu_cnt = xmalloc(ncomb_brd * sizeof(int));
                elig = 0;
                for (comb_idx = 0; comb_idx < ncomb_brd; comb_idx++) {
                        count = 0;
                        for (comb_brd_idx = 0; comb_brd_idx < b_min;
                             comb_brd_idx++) {
                                board_num = board_combs[(comb_idx * b_min)
                                                        + comb_brd_idx];
                                count += boards_cpu_cnt[board_num];
                        }
                        if (count >= req_cpus) {
                                elig_brd_combs[elig] = comb_idx;
                                elig_cpu_cnt[elig] = count;
                                elig++;
                        }
                }

                /* Allocate space for list of sockets for each eligible board
                 * combination */
                socket_list = xmalloc(elig * sock_per_comb * sizeof(int));

                /* Generate a sorted list of sockets for each eligible board
                 * combination, and find the combination with the minimum
                 * number of sockets and minimum number of CPUs required for
                 * the allocation
                 */
                s_min = sock_per_comb;
                comb_min = 0;
                cpu_min = sock_per_comb * ncores_nb;
                for (elig_idx = 0; elig_idx < elig; elig_idx++) {
                        comb_idx = elig_brd_combs[elig_idx];
                        for (comb_brd_idx = 0; comb_brd_idx < b_min;
                             comb_brd_idx++) {
                                board_num = board_combs[(comb_idx * b_min)
                                                        + comb_brd_idx];
                                sock_list_idx = (elig_idx * sock_per_comb) +
                                                (comb_brd_idx * sock_per_brd);
                                for (sock_idx = 0; sock_idx < sock_per_brd;
                                     sock_idx++) {
                                        socket_list[sock_list_idx + sock_idx]
                                                = (board_num * sock_per_brd)
                                                  + sock_idx;
                                }
                        }
                        /* Sort this socket list in descending order of
                         * available core count */
                        qsort(&socket_list[elig_idx * sock_per_comb],
                              sock_per_comb, sizeof(int), _cmp_sock);
                        /* Determine minimum number of sockets required for
                         * the allocation from this socket list */
                        count = 0;
                        for (b = 0; b < sock_per_comb; b++) {
                                sock_idx = socket_list[
                                        (int)((elig_idx * sock_per_comb) + b)];
                                count += sockets_cpu_cnt[sock_idx];
                                if (count >= req_cpus)
                                        break;
                        }
                        b++;
                        /* Use the board combination with the minimum number
                         * of required sockets and minimum number of CPUs
                         */
                        if ((b < s_min) ||
                            (b == s_min &&
                             elig_cpu_cnt[elig_idx] <= cpu_min)) {
                                s_min = b;
                                comb_min = elig_idx;
                                cpu_min = elig_cpu_cnt[elig_idx];
                        }
                }

                debug3("cons_res: best_fit: node[%u]: required cpus: %u, "
                       "min req boards: %u,", n, cpus, b_min);
                debug3("cons_res: best_fit: node[%u]: min req sockets: %u, "
                       "min avail cores: %u", n, s_min, cpu_min);
                /* Re-sort the socket list for the best-fit board combination
                 * in ascending order of socket number */
                qsort(&socket_list[comb_min * sock_per_comb], sock_per_comb,
                      sizeof(int), _cmp_int_ascend);

                xfree(board_combs);
                xfree(elig_brd_combs);
                xfree(elig_cpu_cnt);

                /* select cores from the sockets of the best-fit board
                 * combination using a best-fit approach */
                while (cpus > 0) {
                        best_fit_cpus = 0;
                        best_fit_sufficient = false;

                        /* search for the best socket, starting from the last
                         * one to leave more room in the first one for system
                         * usage */
                        for (z = sock_per_comb - 1; (int) z >= (int) 0; z--) {
                                s = socket_list[(comb_min * sock_per_comb) + z];
                                sufficient = sockets_cpu_cnt[s] >= req_cpus;
                                if ((best_fit_cpus == 0) ||
                                    (sufficient && !best_fit_sufficient) ||
                                    (sufficient && (sockets_cpu_cnt[s] <
                                                    best_fit_cpus)) ||
                                    (!sufficient && (sockets_cpu_cnt[s] >
                                                     best_fit_cpus))) {
                                        best_fit_cpus = sockets_cpu_cnt[s];
                                        best_fit_location = s;
                                        best_fit_sufficient = sufficient;
                                }
                        }

                        /* check that we have found a usable socket */
                        if (best_fit_cpus == 0)
                                break;

                        debug3("cons_res: best_fit: using node[%u]: "
                               "board[%u]: socket[%u]: %u cores available",
                               n, best_fit_location / sock_per_brd,
                               best_fit_location,
                               sockets_cpu_cnt[best_fit_location]);

                        /* select socket cores from last to first */
                        /* socket[0]:Core[0] would be the last one */
                        sockets_used[best_fit_location] = true;
                        for (j = c + ((best_fit_location + 1) * ncores_nb) - 1;
                             (int) j >= (int) (c + (best_fit_location *
                                                    ncores_nb));
                             j--) {
                                /*
                                 * if there are no more cpus to select,
                                 * release the remaining cores unless
                                 * we are allocating whole sockets
                                 */
                                if (cpus == 0 && alloc_sockets) {
                                        if (bit_test(job_res->core_bitmap, j))
                                                core_cnt++;
                                        continue;
                                } else if (cpus == 0) {
                                        bit_clear(job_res->core_bitmap, j);
                                        continue;
                                }
                                /*
                                 * remove the core from the socket count and
                                 * from the cpus count using the
                                 * hyperthreading requirement
                                 */
                                if (bit_test(job_res->core_bitmap, j)) {
                                        sockets_cpu_cnt[best_fit_location]--;
                                        core_cnt++;
                                        if (cpus < vpus)
                                                cpus = 0;
                                        else
                                                cpus -= vpus;
                                }
                        }

                        /* loop again if more cpus are required */
                        if (cpus > 0)
                                continue;

                        /* release remaining cores of the unused sockets */
                        for (s = 0; s < nsockets_nb; s++) {
                                if (sockets_used[s])
                                        continue;
                                bit_nclear(job_res->core_bitmap,
                                           c + (s * ncores_nb),
                                           c + ((s + 1) * ncores_nb) - 1);
                        }
                }
                xfree(socket_list);
                if (cpus > 0) {
                        /* cpu count should NEVER be greater than the number
                         * of set bits in the core bitmap for a given node */
                        fatal("cons_res: cpus computation error");
                }

                /* adjust the cpus count of the current node */
                if ((alloc_cores || alloc_sockets) &&
                    (select_node_record[n].vpus > 1)) {
                        job_res->cpus[i] = core_cnt *
                                           select_node_record[n].vpus;
                }
                i++;

                /* move c to the next node in core_bitmap */
                c += num_bits;
        }

        xfree(boards_cpu_cnt);
        xfree(sort_brds_cpu_cnt);
        xfree(sockets_cpu_cnt);
        xfree(sockets_used);
}
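
/* Worked example of the best-fit priorities above: on a node with two
 * boards of two sockets each and per-socket free core counts
 * {4,2,3,3}, a request for 5 cores fits on one board (b_min = 1, both
 * boards offer 6 free cores).  Either board needs both of its sockets
 * (4+2 or 3+3), so s_min = 2 for both eligible combinations; the "<="
 * in the cpu_min comparison then makes the later, equally good
 * combination (board 1) win the tie, consistent with the 4th
 * priority of using higher-numbered boards first. */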
/* Sync up the core_bitmap with the CPU array using cyclic distribution
 *
 * The CPU array contains the distribution of CPUs, which can include
 * virtual CPUs (hyperthreads)
 */
static int _cyclic_sync_core_bitmap(struct job_record *job_ptr,
                                    const uint16_t cr_type)
{
        uint32_t c, i, j, s, n, *sock_start, *sock_end, size, csize, core_cnt;
        uint16_t cps = 0, cpus, vpus, sockets, sock_size;
        job_resources_t *job_res = job_ptr->job_resrcs;
        bitstr_t *core_map;
        bool *sock_used, alloc_cores = false, alloc_sockets = false;
        uint16_t ntasks_per_core = 0xffff;
        int error_code = SLURM_SUCCESS;

        if ((job_res == NULL) || (job_res->core_bitmap == NULL))
                return error_code;

        if (cr_type & CR_CORE)
                alloc_cores = true;
#ifdef ALLOCATE_FULL_SOCKET
        if (cr_type & CR_SOCKET)
                alloc_sockets = true;
#else
        if (cr_type & CR_SOCKET)
                alloc_cores = true;
#endif
        core_map = job_res->core_bitmap;
        if (job_ptr->details && job_ptr->details->mc_ptr) {
                multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
                if (mc_ptr->ntasks_per_core) {
                        ntasks_per_core = mc_ptr->ntasks_per_core;
                }
                if ((mc_ptr->threads_per_core != (uint16_t) NO_VAL) &&
                    (mc_ptr->threads_per_core < ntasks_per_core)) {
                        ntasks_per_core = mc_ptr->threads_per_core;
                }
        }

        sock_size = select_node_record[0].sockets;
        sock_start = xmalloc(sock_size * sizeof(uint32_t));
        sock_end = xmalloc(sock_size * sizeof(uint32_t));
        sock_used = xmalloc(sock_size * sizeof(bool));

        size = bit_size(job_res->node_bitmap);
        csize = bit_size(core_map);
        for (c = 0, i = 0, n = 0; n < size; n++) {
                if (bit_test(job_res->node_bitmap, n) == 0)
                        continue;
                sockets = select_node_record[n].sockets;
                cps = select_node_record[n].cores;
                vpus = MIN(select_node_record[n].vpus, ntasks_per_core);

                if (select_debug_flags & DEBUG_FLAG_CPU_BIND) {
                        info("DEBUG: job %u node %s vpus %u cpus %u",
                             job_ptr->job_id,
                             select_node_record[n].node_ptr->name,
                             vpus, job_res->cpus[i]);
                }

                if ((c + (sockets * cps)) > csize)
                        fatal("cons_res: _cyclic_sync_core_bitmap index error");

                if (sockets > sock_size) {
                        sock_size = sockets;
                        xrealloc(sock_start, sock_size * sizeof(uint32_t));
                        xrealloc(sock_end, sock_size * sizeof(uint32_t));
                        xrealloc(sock_used, sock_size * sizeof(bool));
                }

                for (s = 0; s < sockets; s++) {
                        sock_start[s] = c + (s * cps);
                        sock_end[s] = sock_start[s] + cps;
                }
                core_cnt = 0;
                cpus = job_res->cpus[i];
                while (cpus > 0) {
                        uint16_t prev_cpus = cpus;
                        for (s = 0; s < sockets && cpus > 0; s++) {
                                while (sock_start[s] < sock_end[s]) {
                                        if (bit_test(core_map,
                                                     sock_start[s])) {
                                                sock_used[s] = true;
                                                core_cnt++;
                                                break;
                                        } else
                                                sock_start[s]++;
                                }
                                if (sock_start[s] == sock_end[s])
                                        /* this socket is unusable */
                                        continue;
                                if (cpus < vpus)
                                        cpus = 0;
                                else
                                        cpus -= vpus;
                                sock_start[s]++;
                        }
                        if (prev_cpus == cpus) {
                                /* we're stuck! */
                                job_ptr->priority = 0;
                                job_ptr->state_reason = WAIT_HELD;
                                error("cons_res: sync loop not progressing, "
                                      "holding job %u", job_ptr->job_id);
                                error_code = SLURM_ERROR;
                                goto fini;
                        }
                }
                /* clear the rest of the cores in each socket
                 * FIXME: do we need min_core/min_socket checks here? */
                for (s = 0; s < sockets; s++) {
                        if (sock_start[s] == sock_end[s])
                                continue;
                        if (!alloc_sockets || !sock_used[s]) {
                                bit_nclear(core_map, sock_start[s],
                                           sock_end[s] - 1);
                        }
                        if ((select_node_record[n].vpus > 1) &&
                            (alloc_sockets || alloc_cores) && sock_used[s]) {
                                for (j = sock_start[s]; j < sock_end[s]; j++) {
                                        if (bit_test(core_map, j))
                                                core_cnt++;
                                }
                        }
                }
                if ((alloc_cores || alloc_sockets) &&
                    (select_node_record[n].vpus > 1)) {
                        job_res->cpus[i] = core_cnt *
                                           select_node_record[n].vpus;
                }
                i++;
                /* advance 'c' to the beginning of the next node */
                c += sockets * cps;
        }
fini:   xfree(sock_start);
        xfree(sock_end);
        xfree(sock_used);
        return error_code;
}
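
/* Worked example for _cyclic_sync_core_bitmap(): on a node with
 * 2 sockets of 4 free cores, job_res->cpus[i] = 6 and vpus = 2 (so 3
 * cores are needed), cores are claimed round-robin across sockets:
 * core 0 (socket 0), core 4 (socket 1), then core 1 (socket 0), at
 * which point cpus reaches 0; the unclaimed cores are then cleared
 * from core_map unless whole sockets are being allocated. */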
/* To effectively deal with heterogeneous nodes, we fake a cyclic
 * distribution to figure out how many cpus are needed on each node.
 *
 * This routine is a slightly modified "version" of the routine
 * _task_layout_block in src/common/dist_tasks.c. We do not need to
 * assign tasks to job->hostid[] and job->tids[][] at this point, so
 * the cpu allocation is the same for cyclic and block.
 *
 * For consumable resources support we need to determine which
 * "node/CPU/core/thread" tuples will be allocated for a given job.
 * In the past we assumed that we only allocated one task per CPU (at
 * that point the lowest level of logical processor) and didn't allow
 * the use of overcommit. We have changed this philosophy and are now
 * allowing people to overcommit their resources and expect the system
 * administrator to enable the task/affinity plug-in, which will then
 * bind all of a job's tasks to its allocated resources, thereby
 * avoiding interference between co-allocated running jobs.
 *
 * In the consumable resources environment we need to determine the
 * layout schema within slurmctld.
 *
 * We have a core_bitmap of all available cores. All we're doing here
 * is removing cores that are not needed based on the task count, and
 * the choice of cores to remove is based on the distribution:
 * - "cyclic" removes cores "evenly", starting from the last socket,
 * - "block" removes cores from the "last" socket(s)
 * - "plane" removes cores "in chunks"
 */
extern int cr_dist(struct job_record *job_ptr, const uint16_t cr_type)
{
        int error_code, cr_cpu = 1;

        if (job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) {
                /* the job has been allocated an EXCLUSIVE set of nodes,
                 * so it gets all of the bits in the core_bitmap and
                 * all of the available CPUs in the cpus array */
                int size = bit_size(job_ptr->job_resrcs->core_bitmap);
                bit_nset(job_ptr->job_resrcs->core_bitmap, 0, size - 1);
                return SLURM_SUCCESS;
        }

        if (job_ptr->details->task_dist == SLURM_DIST_PLANE) {
                /* perform a plane distribution on the 'cpus' array */
                error_code = _compute_plane_dist(job_ptr);
                if (error_code != SLURM_SUCCESS) {
                        error("cons_res: cr_dist: Error in "
                              "_compute_plane_dist");
                        return error_code;
                }
        } else {
                /* perform a cyclic distribution on the 'cpus' array */
                error_code = _compute_c_b_task_dist(job_ptr);
                if (error_code != SLURM_SUCCESS) {
                        error("cons_res: cr_dist: Error in "
                              "_compute_c_b_task_dist");
                        return error_code;
                }
        }

        /* now sync up the core_bitmap with the allocated 'cpus' array
         * based on the given distribution AND resource setting */
        if ((cr_type & CR_CORE) || (cr_type & CR_SOCKET))
                cr_cpu = 0;

        if (cr_cpu) {
                _block_sync_core_bitmap(job_ptr, cr_type);
                return SLURM_SUCCESS;
        }

        /*
         * If SelectTypeParameters requests a block distribution for
         * cores by default, use that distribution when no particular
         * core distribution is specified.
         * Note: the cyclic core distribution, which is the default, is
         * handled by the next code block.
         */
        if (slurmctld_conf.select_type_param & CR_CORE_DEFAULT_DIST_BLOCK) {
                switch (job_ptr->details->task_dist) {
                case SLURM_DIST_ARBITRARY:
                case SLURM_DIST_BLOCK:
                case SLURM_DIST_CYCLIC:
                case SLURM_DIST_UNKNOWN:
                        _block_sync_core_bitmap(job_ptr, cr_type);
                        return SLURM_SUCCESS;
                }
        }

        /* Determine the number of logical processors per node needed
         * for this job. Make sure below matches the layouts in
         * lllp_distribution in plugins/task/affinity/dist_task.c (FIXME) */
        switch (job_ptr->details->task_dist) {
        case SLURM_DIST_BLOCK_BLOCK:
        case SLURM_DIST_CYCLIC_BLOCK:
        case SLURM_DIST_PLANE:
                _block_sync_core_bitmap(job_ptr, cr_type);
                break;
        case SLURM_DIST_ARBITRARY:
        case SLURM_DIST_BLOCK:
        case SLURM_DIST_CYCLIC:
        case SLURM_DIST_BLOCK_CYCLIC:
        case SLURM_DIST_CYCLIC_CYCLIC:
        case SLURM_DIST_UNKNOWN:
                error_code = _cyclic_sync_core_bitmap(job_ptr, cr_type);
                break;
        default:
                error("select/cons_res: invalid task_dist entry");
                return SLURM_ERROR;
        }
        return error_code;
}