/test/e2e/autoscaling/cluster_size_autoscaling.go
- /*
- Copyright 2016 The Kubernetes Authors.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
- package autoscaling
- import (
- "bytes"
- "fmt"
- "io/ioutil"
- "math"
- "net/http"
- "os/exec"
- "regexp"
- "strconv"
- "strings"
- "time"
- "k8s.io/api/core/v1"
- policy "k8s.io/api/policy/v1beta1"
- "k8s.io/api/scheduling/v1alpha1"
- "k8s.io/apimachinery/pkg/api/errors"
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
- "k8s.io/apimachinery/pkg/fields"
- "k8s.io/apimachinery/pkg/labels"
- utilerrors "k8s.io/apimachinery/pkg/util/errors"
- "k8s.io/apimachinery/pkg/util/intstr"
- "k8s.io/apimachinery/pkg/util/sets"
- "k8s.io/apimachinery/pkg/util/uuid"
- "k8s.io/apimachinery/pkg/util/wait"
- clientset "k8s.io/client-go/kubernetes"
- api "k8s.io/kubernetes/pkg/apis/core"
- "k8s.io/kubernetes/test/e2e/framework"
- "k8s.io/kubernetes/test/e2e/scheduling"
- testutils "k8s.io/kubernetes/test/utils"
- imageutils "k8s.io/kubernetes/test/utils/image"
- "github.com/golang/glog"
- . "github.com/onsi/ginkgo"
- . "github.com/onsi/gomega"
- )
- const (
- defaultTimeout = 3 * time.Minute
- resizeTimeout = 5 * time.Minute
- manualResizeTimeout = 6 * time.Minute
- scaleUpTimeout = 5 * time.Minute
- scaleUpTriggerTimeout = 2 * time.Minute
- scaleDownTimeout = 20 * time.Minute
- podTimeout = 2 * time.Minute
- nodesRecoverTimeout = 5 * time.Minute
- rcCreationRetryTimeout = 4 * time.Minute
- rcCreationRetryDelay = 20 * time.Second
- makeSchedulableTimeout = 10 * time.Minute
- makeSchedulableDelay = 20 * time.Second
- freshStatusLimit = 20 * time.Second
- gkeEndpoint = "https://test-container.sandbox.googleapis.com"
- gkeUpdateTimeout = 15 * time.Minute
- gkeNodepoolNameKey = "cloud.google.com/gke-nodepool"
- disabledTaint = "DisabledForAutoscalingTest"
- criticalAddonsOnlyTaint = "CriticalAddonsOnly"
- newNodesForScaledownTests = 2
- unhealthyClusterThreshold = 4
- caNoScaleUpStatus = "NoActivity"
- caOngoingScaleUpStatus = "InProgress"
- timestampFormat = "2006-01-02 15:04:05 -0700 MST"
- expendablePriorityClassName = "expendable-priority"
- highPriorityClassName = "high-priority"
- )
- var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
- f := framework.NewDefaultFramework("autoscaling")
- var c clientset.Interface
- var nodeCount int
- var coreCount int64
- var memAllocatableMb int
- var originalSizes map[string]int
- BeforeEach(func() {
- c = f.ClientSet
- framework.SkipUnlessProviderIs("gce", "gke")
- originalSizes = make(map[string]int)
- sum := 0
- for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
- size, err := framework.GroupSize(mig)
- framework.ExpectNoError(err)
- By(fmt.Sprintf("Initial size of %s: %d", mig, size))
- originalSizes[mig] = size
- sum += size
- }
- // Give instances time to spin up
- framework.ExpectNoError(framework.WaitForReadyNodes(c, sum, scaleUpTimeout))
- nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
- nodeCount = len(nodes.Items)
- coreCount = 0
- for _, node := range nodes.Items {
- quantity := node.Status.Capacity[v1.ResourceCPU]
- coreCount += quantity.Value()
- }
- By(fmt.Sprintf("Initial number of schedulable nodes: %v", nodeCount))
- Expect(nodeCount).NotTo(BeZero())
- mem := nodes.Items[0].Status.Allocatable[v1.ResourceMemory]
- memAllocatableMb = int((&mem).Value() / 1024 / 1024)
- Expect(nodeCount).Should(Equal(sum))
- if framework.ProviderIs("gke") {
- val, err := isAutoscalerEnabled(5)
- framework.ExpectNoError(err)
- if !val {
- err = enableAutoscaler("default-pool", 3, 5)
- framework.ExpectNoError(err)
- }
- Expect(getNAPNodePoolsNumber()).Should(Equal(0))
- }
- })
- AfterEach(func() {
- if framework.ProviderIs("gke") {
- By("Remove changes introduced by NAP tests")
- removeNAPNodePools()
- disableAutoprovisioning()
- }
- By(fmt.Sprintf("Restoring initial size of the cluster"))
- setMigSizes(originalSizes)
- expectedNodes := 0
- for _, size := range originalSizes {
- expectedNodes += size
- }
- framework.ExpectNoError(framework.WaitForReadyNodes(c, expectedNodes, scaleDownTimeout))
- nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
- framework.ExpectNoError(err)
- s := time.Now()
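- // Retry making every node schedulable: if any node still reports a CriticalAddonsOnlyError,
- // sleep and restart the whole pass, giving up after makeSchedulableTimeout.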
- makeSchedulableLoop:
- for start := time.Now(); time.Since(start) < makeSchedulableTimeout; time.Sleep(makeSchedulableDelay) {
- for _, n := range nodes.Items {
- err = makeNodeSchedulable(c, &n, true)
- switch err.(type) {
- case CriticalAddonsOnlyError:
- continue makeSchedulableLoop
- default:
- framework.ExpectNoError(err)
- }
- }
- break
- }
- glog.Infof("Made nodes schedulable again in %v", time.Since(s).String())
- })
- It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- By("Creating unschedulable pod")
- ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, defaultTimeout)
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
- By("Waiting for scale up hoping it won't happen")
- // Verify that the appropriate event was generated
- eventFound := false
- EventsLoop:
- for start := time.Now(); time.Since(start) < scaleUpTimeout; time.Sleep(20 * time.Second) {
- By("Waiting for NotTriggerScaleUp event")
- events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(metav1.ListOptions{})
- framework.ExpectNoError(err)
- for _, e := range events.Items {
- if e.InvolvedObject.Kind == "Pod" && e.Reason == "NotTriggerScaleUp" && strings.Contains(e.Message, "it wouldn't fit if a new node is added") {
- By("NotTriggerScaleUp event found")
- eventFound = true
- break EventsLoop
- }
- }
- }
- Expect(eventFound).Should(Equal(true))
- // Verify that cluster size is not changed
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size <= nodeCount }, time.Second))
- })
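- // simpleScaleUpTest reserves more memory than the current nodes can hold and then
- // expects the cluster to grow by at least one node, tolerating `unready` unready nodes.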
- simpleScaleUpTest := func(unready int) {
- ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
- // Verify that cluster size is increased
- framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
- func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout, unready))
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- }
- It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
- func() { simpleScaleUpTest(0) })
- It("Should scale up GPU pool from 0 [Feature:ClusterSizeAutoscalingGpu]", func() {
- framework.SkipUnlessProviderIs("gke")
- const gpuPoolName = "gpu-pool"
- addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
- defer deleteNodePool(gpuPoolName)
- installNvidiaDriversDaemonSet()
- By("Enable autoscaler")
- framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
- defer disableAutoscaler(gpuPoolName, 0, 1)
- Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
- By("Schedule a pod which requires GPU")
- framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
- Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
- })
- It("Should scale up GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
- framework.SkipUnlessProviderIs("gke")
- const gpuPoolName = "gpu-pool"
- addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
- defer deleteNodePool(gpuPoolName)
- installNvidiaDriversDaemonSet()
- By("Schedule a single pod which requires GPU")
- framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
- By("Enable autoscaler")
- framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
- defer disableAutoscaler(gpuPoolName, 0, 2)
- Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
- framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false)
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
- Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
- })
- It("Should not scale GPU pool up if pod does not require GPUs [Feature:ClusterSizeAutoscalingGpu]", func() {
- framework.SkipUnlessProviderIs("gke")
- const gpuPoolName = "gpu-pool"
- addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
- defer deleteNodePool(gpuPoolName)
- installNvidiaDriversDaemonSet()
- By("Enable autoscaler")
- framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
- defer disableAutoscaler(gpuPoolName, 0, 1)
- Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
- By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
- ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
- // Verify that cluster size is increased
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
- // Expect gpu pool to stay intact
- Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
- })
- It("Should scale down GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
- framework.SkipUnlessProviderIs("gke")
- const gpuPoolName = "gpu-pool"
- addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
- defer deleteNodePool(gpuPoolName)
- installNvidiaDriversDaemonSet()
- By("Schedule a single pod which requires GPU")
- framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
- By("Enable autoscaler")
- framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
- defer disableAutoscaler(gpuPoolName, 0, 1)
- Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
- framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc")
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount }, scaleDownTimeout))
- Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
- })
- It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
- func() {
- framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleUpTest(1) })
- })
- It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- // Wait for the situation to stabilize - CA should be running and have up-to-date node readiness info.
- status, err := waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
- return s.ready == s.target && s.ready <= nodeCount
- }, scaleUpTriggerTimeout)
- framework.ExpectNoError(err)
- unmanagedNodes := nodeCount - status.ready
- By("Schedule more pods than can fit and wait for cluster to scale-up")
- ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
- status, err = waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
- return s.status == caOngoingScaleUpStatus
- }, scaleUpTriggerTimeout)
- framework.ExpectNoError(err)
- target := status.target
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- By("Expect no more scale-up to be happening after all pods are scheduled")
- status, err = getScaleUpStatus(c)
- framework.ExpectNoError(err)
- if status.target != target {
- glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
- }
- Expect(status.timestamp.Add(freshStatusLimit).Before(time.Now())).Should(Equal(false))
- Expect(status.status).Should(Equal(caNoScaleUpStatus))
- Expect(status.ready).Should(Equal(status.target))
- Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
- })
- It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- framework.SkipUnlessProviderIs("gke")
- By("Creating new node-pool with n1-standard-4 machines")
- const extraPoolName = "extra-pool"
- addNodePool(extraPoolName, "n1-standard-4", 1)
- defer deleteNodePool(extraPoolName)
- extraNodes := getPoolInitialSize(extraPoolName)
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
- glog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).")
- By("Getting memory available on new nodes, so we can account for it when creating RC")
- nodes := getPoolNodes(f, extraPoolName)
- Expect(len(nodes)).Should(Equal(extraNodes))
- extraMemMb := 0
- for _, node := range nodes {
- mem := node.Status.Capacity[v1.ResourceMemory]
- extraMemMb += int((&mem).Value() / 1024 / 1024)
- }
- By("Reserving 0.1x more memory than the cluster holds to trigger scale up")
- totalMemoryReservation := int(1.1 * float64(nodeCount*memAllocatableMb+extraMemMb))
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
- ReserveMemory(f, "memory-reservation", 100, totalMemoryReservation, false, defaultTimeout)
- // Verify that cluster size is increased
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size >= nodeCount+extraNodes+1 }, scaleUpTimeout))
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- })
- It("should disable node pool autoscaling [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- framework.SkipUnlessProviderIs("gke")
- By("Creating new node-pool with n1-standard-4 machines")
- const extraPoolName = "extra-pool"
- addNodePool(extraPoolName, "n1-standard-4", 1)
- defer deleteNodePool(extraPoolName)
- extraNodes := getPoolInitialSize(extraPoolName)
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
- framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
- framework.ExpectNoError(disableAutoscaler(extraPoolName, 1, 2))
- })
- It("should increase cluster size if pods are pending due to host port conflict [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- scheduling.CreateHostPortPods(f, "host-port", nodeCount+2, false)
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "host-port")
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size >= nodeCount+2 }, scaleUpTimeout))
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- })
- It("should increase cluster size if pods are pending due to pod anti-affinity [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- pods := nodeCount
- newPods := 2
- labels := map[string]string{
- "anti-affinity": "yes",
- }
- By("starting a pod with anti-affinity on each node")
- framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "some-pod")
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- By("scheduling extra pods with anti-affinity to existing ones")
- framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, newPods, "extra-pod", labels, labels))
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "extra-pod")
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
- })
- It("should increase cluster size if pod requesting EmptyDir volume is pending [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- By("creating pods")
- pods := nodeCount
- newPods := 1
- labels := map[string]string{
- "anti-affinity": "yes",
- }
- framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "some-pod")
- By("waiting for all pods before triggering scale up")
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- By("creating a pod requesting EmptyDir")
- framework.ExpectNoError(runVolumeAntiAffinityPods(f, f.Namespace.Name, newPods, "extra-pod", labels, labels, emptyDirVolumes))
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "extra-pod")
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
- })
- It("should increase cluster size if pod requesting volume is pending [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- framework.SkipUnlessProviderIs("gce", "gke")
- volumeLabels := labels.Set{
- framework.VolumeSelectorKey: f.Namespace.Name,
- }
- selector := metav1.SetAsLabelSelector(volumeLabels)
- By("creating volume & pvc")
- diskName, err := framework.CreatePDWithRetry()
- framework.ExpectNoError(err)
- pvConfig := framework.PersistentVolumeConfig{
- NamePrefix: "gce-",
- Labels: volumeLabels,
- PVSource: v1.PersistentVolumeSource{
- GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{
- PDName: diskName,
- FSType: "ext3",
- ReadOnly: false,
- },
- },
- Prebind: nil,
- }
- emptyStorageClass := ""
- pvcConfig := framework.PersistentVolumeClaimConfig{
- Selector: selector,
- StorageClassName: &emptyStorageClass,
- }
- pv, pvc, err := framework.CreatePVPVC(c, pvConfig, pvcConfig, f.Namespace.Name, false)
- framework.ExpectNoError(err)
- framework.ExpectNoError(framework.WaitOnPVandPVC(c, f.Namespace.Name, pv, pvc))
- defer func() {
- errs := framework.PVPVCCleanup(c, f.Namespace.Name, pv, pvc)
- if len(errs) > 0 {
- framework.Failf("failed to delete PVC and/or PV. Errors: %v", utilerrors.NewAggregate(errs))
- }
- pv, pvc = nil, nil
- if diskName != "" {
- framework.ExpectNoError(framework.DeletePDWithRetry(diskName))
- }
- }()
- By("creating pods")
- pods := nodeCount
- labels := map[string]string{
- "anti-affinity": "yes",
- }
- framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
- defer func() {
- framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "some-pod")
- glog.Infof("RC and pods not using volume deleted")
- }()
- By("waiting for all pods before triggering scale up")
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- By("creating a pod requesting PVC")
- pvcPodName := "pvc-pod"
- newPods := 1
- volumes := buildVolumes(pv, pvc)
- framework.ExpectNoError(runVolumeAntiAffinityPods(f, f.Namespace.Name, newPods, pvcPodName, labels, labels, volumes))
- defer func() {
- framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, pvcPodName)
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- }()
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
- })
- It("should add node to the particular mig [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- labelKey := "cluster-autoscaling-test.special-node"
- labelValue := "true"
- By("Finding the smallest MIG")
- minMig := ""
- minSize := nodeCount
- for mig, size := range originalSizes {
- if size <= minSize {
- minMig = mig
- minSize = size
- }
- }
- if minSize == 0 {
- newSizes := make(map[string]int)
- for mig, size := range originalSizes {
- newSizes[mig] = size
- }
- newSizes[minMig] = 1
- setMigSizes(newSizes)
- }
- removeLabels := func(nodesToClean sets.String) {
- By("Removing labels from nodes")
- for node := range nodesToClean {
- framework.RemoveLabelOffNode(c, node, labelKey)
- }
- }
- nodes, err := framework.GetGroupNodes(minMig)
- framework.ExpectNoError(err)
- nodesSet := sets.NewString(nodes...)
- defer removeLabels(nodesSet)
- By(fmt.Sprintf("Annotating nodes of the smallest MIG(%s): %v", minMig, nodes))
- for node := range nodesSet {
- framework.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue)
- }
- scheduling.CreateNodeSelectorPods(f, "node-selector", minSize+1, map[string]string{labelKey: labelValue}, false)
- By("Waiting for new node to appear and annotating it")
- framework.WaitForGroupSize(minMig, int32(minSize+1))
- // Verify that cluster size is increased
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
- newNodes, err := framework.GetGroupNodes(minMig)
- framework.ExpectNoError(err)
- newNodesSet := sets.NewString(newNodes...)
- newNodesSet.Delete(nodes...)
- if len(newNodesSet) > 1 {
- By(fmt.Sprintf("Spotted following new nodes in %s: %v", minMig, newNodesSet))
- glog.Infof("Usually only 1 new node is expected, investigating")
- glog.Infof("Kubectl:%s\n", framework.RunKubectlOrDie("get", "nodes", "-o", "json"))
- if output, err := exec.Command("gcloud", "compute", "instances", "list",
- "--project="+framework.TestContext.CloudConfig.ProjectID,
- "--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil {
- glog.Infof("Gcloud compute instances list: %s", output)
- } else {
- glog.Errorf("Failed to get instances list: %v", err)
- }
- for newNode := range newNodesSet {
- if output, err := execCmd("gcloud", "compute", "instances", "describe",
- newNode,
- "--project="+framework.TestContext.CloudConfig.ProjectID,
- "--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil {
- glog.Infof("Gcloud compute instances describe: %s", output)
- } else {
- glog.Errorf("Failed to get instances describe: %v", err)
- }
- }
- // TODO: possibly remove broken node from newNodesSet to prevent removeLabel from crashing.
- // However at this moment we DO WANT it to crash so that we don't check all test runs for the
- // rare behavior, but only the broken ones.
- }
- By(fmt.Sprintf("New nodes: %v\n", newNodesSet))
- registeredNodes := sets.NewString()
- for nodeName := range newNodesSet {
- node, err := f.ClientSet.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
- if err == nil && node != nil {
- registeredNodes.Insert(nodeName)
- } else {
- glog.Errorf("Failed to get node %v: %v", nodeName, err)
- }
- }
- By(fmt.Sprintf("Setting labels for registered new nodes: %v", registeredNodes.List()))
- for node := range registeredNodes {
- framework.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue)
- }
- defer removeLabels(registeredNodes)
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- framework.ExpectNoError(framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "node-selector"))
- })
- It("should scale up correct target pool [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- framework.SkipUnlessProviderIs("gke")
- By("Creating new node-pool with n1-standard-4 machines")
- const extraPoolName = "extra-pool"
- addNodePool(extraPoolName, "n1-standard-4", 1)
- defer deleteNodePool(extraPoolName)
- extraNodes := getPoolInitialSize(extraPoolName)
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
- framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
- defer disableAutoscaler(extraPoolName, 1, 2)
- extraPods := extraNodes + 1
- totalMemoryReservation := int(float64(extraPods) * 1.5 * float64(memAllocatableMb))
- By(fmt.Sprintf("Creating rc with %v pods too big to fit default-pool but fitting extra-pool", extraPods))
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
- ReserveMemory(f, "memory-reservation", extraPods, totalMemoryReservation, false, defaultTimeout)
- // Apparently the GKE master is restarted a couple of minutes after the node pool is added,
- // resetting all the timers in the scale-down code. Adding 5 extra minutes to work around
- // this issue.
- // TODO: Remove the extra time when GKE restart is fixed.
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes+1, scaleUpTimeout+5*time.Minute))
- })
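- // simpleScaleDownTest protects kube-system pods with PDBs, manually grows every MIG by
- // 2+unready nodes, and then expects the autoscaler to remove at least one of them.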
- simpleScaleDownTest := func(unready int) {
- cleanup, err := addKubeSystemPdbs(f)
- defer cleanup()
- framework.ExpectNoError(err)
- By("Manually increase cluster size")
- increasedSize := 0
- newSizes := make(map[string]int)
- for key, val := range originalSizes {
- newSizes[key] = val + 2 + unready
- increasedSize += val + 2 + unready
- }
- setMigSizes(newSizes)
- framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
- func(size int) bool { return size >= increasedSize }, manualResizeTimeout, unready))
- By("Some node should be removed")
- framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
- func(size int) bool { return size < increasedSize }, scaleDownTimeout, unready))
- }
- It("should correctly scale down after a node is not needed [Feature:ClusterSizeAutoscalingScaleDown]",
- func() { simpleScaleDownTest(0) })
- It("should correctly scale down after a node is not needed and one node is broken [Feature:ClusterSizeAutoscalingScaleDown]",
- func() {
- framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleDownTest(1) })
- })
- It("should correctly scale down after a node is not needed when there is non autoscaled pool[Feature:ClusterSizeAutoscalingScaleDown]", func() {
- framework.SkipUnlessProviderIs("gke")
- increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
- const extraPoolName = "extra-pool"
- addNodePool(extraPoolName, "n1-standard-1", 3)
- defer deleteNodePool(extraPoolName)
- extraNodes := getPoolInitialSize(extraPoolName)
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size >= increasedSize+extraNodes }, scaleUpTimeout))
- By("Some node should be removed")
- // Apparently the GKE master is restarted a couple of minutes after the node pool is added,
- // resetting all the timers in the scale-down code. Adding 10 extra minutes to work around
- // this issue.
- // TODO: Remove the extra time when GKE restart is fixed.
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size < increasedSize+extraNodes }, scaleDownTimeout+10*time.Minute))
- })
- It("should be able to scale down when rescheduling a pod is required and pdb allows for it[Feature:ClusterSizeAutoscalingScaleDown]", func() {
- runDrainTest(f, originalSizes, f.Namespace.Name, 1, 1, func(increasedSize int) {
- By("Some node should be removed")
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size < increasedSize }, scaleDownTimeout))
- })
- })
- It("shouldn't be able to scale down when rescheduling a pod is required, but pdb doesn't allow drain[Feature:ClusterSizeAutoscalingScaleDown]", func() {
- runDrainTest(f, originalSizes, f.Namespace.Name, 1, 0, func(increasedSize int) {
- By("No nodes should be removed")
- time.Sleep(scaleDownTimeout)
- nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
- Expect(len(nodes.Items)).Should(Equal(increasedSize))
- })
- })
- It("should be able to scale down by draining multiple pods one by one as dictated by pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
- runDrainTest(f, originalSizes, f.Namespace.Name, 2, 1, func(increasedSize int) {
- By("Some node should be removed")
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size < increasedSize }, scaleDownTimeout))
- })
- })
- It("should be able to scale down by draining system pods with pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
- runDrainTest(f, originalSizes, "kube-system", 2, 1, func(increasedSize int) {
- By("Some node should be removed")
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size < increasedSize }, scaleDownTimeout))
- })
- })
- It("Should be able to scale a node group up from 0[Feature:ClusterSizeAutoscalingScaleUp]", func() {
- // Provider-specific setup
- if framework.ProviderIs("gke") {
- // GKE-specific setup
- By("Add a new node pool with 0 nodes and min size 0")
- const extraPoolName = "extra-pool"
- addNodePool(extraPoolName, "n1-standard-4", 0)
- defer deleteNodePool(extraPoolName)
- framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1))
- defer disableAutoscaler(extraPoolName, 0, 1)
- } else {
- // on GCE, run only if there are already at least 2 node groups
- framework.SkipUnlessAtLeast(len(originalSizes), 2, "At least 2 node groups are needed for scale-to-0 tests")
- By("Manually scale smallest node group to 0")
- minMig := ""
- minSize := nodeCount
- for mig, size := range originalSizes {
- if size <= minSize {
- minMig = mig
- minSize = size
- }
- }
- framework.ExpectNoError(framework.ResizeGroup(minMig, int32(0)))
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount-minSize, resizeTimeout))
- }
- By("Make remaining nodes unschedulable")
- nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
- "spec.unschedulable": "false",
- }.AsSelector().String()})
- framework.ExpectNoError(err)
- for _, node := range nodes.Items {
- err = makeNodeUnschedulable(f.ClientSet, &node)
- defer func(n v1.Node) {
- makeNodeSchedulable(f.ClientSet, &n, false)
- }(node)
- framework.ExpectNoError(err)
- }
- By("Run a scale-up test")
- ReserveMemory(f, "memory-reservation", 1, 100, false, 1*time.Second)
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
- // Verify that cluster size is increased
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size >= len(nodes.Items)+1 }, scaleUpTimeout))
- framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
- })
- // The scale-to-0 test is split into two functions (for GKE & GCE).
- // The reason is that the scenario is exactly the same,
- // but setup & verification use different APIs.
- //
- // Scenario:
- // (GKE only) add an extra node pool with size 1 & enable autoscaling for it
- // (GCE only) find the smallest MIG & resize it to 1
- // manually drain the single node from this node pool/MIG
- // wait for cluster size to decrease
- // verify the targeted node pool/MIG is of size 0
- gkeScaleToZero := func() {
- // GKE-specific setup
- By("Add a new node pool with size 1 and min size 0")
- const extraPoolName = "extra-pool"
- addNodePool(extraPoolName, "n1-standard-4", 1)
- defer deleteNodePool(extraPoolName)
- extraNodes := getPoolInitialSize(extraPoolName)
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
- framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1))
- defer disableAutoscaler(extraPoolName, 0, 1)
- ngNodes := getPoolNodes(f, extraPoolName)
- Expect(len(ngNodes)).To(Equal(extraNodes))
- for _, node := range ngNodes {
- By(fmt.Sprintf("Target node for scale-down: %s", node.Name))
- }
- for _, node := range ngNodes {
- drainNode(f, node)
- }
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size <= nodeCount }, scaleDownTimeout))
- // GKE-specific check
- newSize := getPoolSize(f, extraPoolName)
- Expect(newSize).Should(Equal(0))
- }
- gceScaleToZero := func() {
- // non-GKE only
- By("Find smallest node group and manually scale it to a single node")
- minMig := ""
- minSize := nodeCount
- for mig, size := range originalSizes {
- if size <= minSize {
- minMig = mig
- minSize = size
- }
- }
- framework.ExpectNoError(framework.ResizeGroup(minMig, int32(1)))
- framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount-minSize+1, resizeTimeout))
- ngNodes, err := framework.GetGroupNodes(minMig)
- framework.ExpectNoError(err)
- Expect(len(ngNodes)).To(Equal(1))
- node, err := f.ClientSet.CoreV1().Nodes().Get(ngNodes[0], metav1.GetOptions{})
- By(fmt.Sprintf("Target node for scale-down: %s", node.Name))
- framework.ExpectNoError(err)
- // this part is identical
- drainNode(f, node)
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size < nodeCount-minSize+1 }, scaleDownTimeout))
- // non-GKE only
- newSize, err := framework.GroupSize(minMig)
- framework.ExpectNoError(err)
- Expect(newSize).Should(Equal(0))
- }
- It("Should be able to scale a node group down to 0[Feature:ClusterSizeAutoscalingScaleDown]", func() {
- if framework.ProviderIs("gke") { // In GKE, we can just add a node pool
- gkeScaleToZero()
- } else if len(originalSizes) >= 2 {
- gceScaleToZero()
- } else {
- framework.Skipf("At least 2 node groups are needed for scale-to-0 tests")
- }
- })
- It("Shouldn't perform scale up operation and should list unhealthy status if most of the cluster is broken[Feature:ClusterSizeAutoscalingScaleUp]", func() {
- clusterSize := nodeCount
- for clusterSize < unhealthyClusterThreshold+1 {
- clusterSize = manuallyIncreaseClusterSize(f, originalSizes)
- }
- By("Block network connectivity to some nodes to simulate unhealthy cluster")
- nodesToBreakCount := int(math.Floor(math.Max(float64(unhealthyClusterThreshold), 0.5*float64(clusterSize))))
- nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
- "spec.unschedulable": "false",
- }.AsSelector().String()})
- framework.ExpectNoError(err)
- Expect(nodesToBreakCount).To(BeNumerically("<=", len(nodes.Items)))
- nodesToBreak := nodes.Items[:nodesToBreakCount]
- // TestUnderTemporaryNetworkFailure only removes connectivity to a single node
- // and accepts a func() callback. The loop is expanded into a recursive call
- // to avoid duplicating TestUnderTemporaryNetworkFailure.
- var testFunction func()
- testFunction = func() {
- if len(nodesToBreak) > 0 {
- ntb := &nodesToBreak[0]
- nodesToBreak = nodesToBreak[1:]
- framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
- } else {
- ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, defaultTimeout)
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
- time.Sleep(scaleUpTimeout)
- currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
- framework.Logf("Currently available nodes: %v, nodes available at the start of test: %v, disabled nodes: %v", len(currentNodes.Items), len(nodes.Items), nodesToBreakCount)
- Expect(len(currentNodes.Items)).Should(Equal(len(nodes.Items) - nodesToBreakCount))
- status, err := getClusterwideStatus(c)
- framework.Logf("Clusterwide status: %v", status)
- framework.ExpectNoError(err)
- Expect(status).Should(Equal("Unhealthy"))
- }
- }
- testFunction()
- // Give nodes time to recover from network failure
- framework.ExpectNoError(framework.WaitForReadyNodes(c, len(nodes.Items), nodesRecoverTimeout))
- })
- It("should add new node and new node pool on too big pod, scale down to 1 and scale down to 0 [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
- framework.SkipUnlessProviderIs("gke")
- framework.ExpectNoError(enableAutoprovisioning(""))
- By("Create first pod")
- cleanupFunc1 := ReserveMemory(f, "memory-reservation1", 1, int(1.1*float64(memAllocatableMb)), true, defaultTimeout)
- defer func() {
- if cleanupFunc1 != nil {
- cleanupFunc1()
- }
- }()
- By("Waiting for scale up")
- // Verify that cluster size increased.
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount+1 }, defaultTimeout))
- By("Check if NAP group was created")
- Expect(getNAPNodePoolsNumber()).Should(Equal(1))
- By("Create second pod")
- cleanupFunc2 := ReserveMemory(f, "memory-reservation2", 1, int(1.1*float64(memAllocatableMb)), true, defaultTimeout)
- defer func() {
- if cleanupFunc2 != nil {
- cleanupFunc2()
- }
- }()
- By("Waiting for scale up")
- // Verify that cluster size increased.
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount+2 }, defaultTimeout))
- By("Delete first pod")
- cleanupFunc1()
- cleanupFunc1 = nil
- By("Waiting for scale down to 1")
- // Verify that cluster size decreased.
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount+1 }, scaleDownTimeout))
- By("Delete second pod")
- cleanupFunc2()
- cleanupFunc2 = nil
- By("Waiting for scale down to 0")
- // Verify that cluster size decreased.
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount }, scaleDownTimeout))
- By("Waiting for NAP group remove")
- framework.ExpectNoError(waitTillAllNAPNodePoolsAreRemoved())
- By("Check if NAP group was removeed")
- Expect(getNAPNodePoolsNumber()).Should(Equal(0))
- })
- It("shouldn't add new node group if not needed [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
- framework.SkipUnlessProviderIs("gke")
- framework.ExpectNoError(enableAutoprovisioning(""))
- By("Create pods")
- // Create nodeCount+1 pods, each reserving 0.7 of a node's allocatable memory. One more node will have to be created.
- cleanupFunc := ReserveMemory(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout)
- defer cleanupFunc()
- By("Waiting for scale up")
- // Verify that cluster size increased.
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
- By("Check if NAP group was created hoping id didn't happen")
- Expect(getNAPNodePoolsNumber()).Should(Equal(0))
- })
- It("shouldn't scale up if cores limit too low, should scale up after limit is changed [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
- framework.SkipUnlessProviderIs("gke")
- By(fmt.Sprintf("Set core limit to %d", coreCount))
- framework.ExpectNoError(enableAutoprovisioning(fmt.Sprintf(`"resource_limits":{"name":"cpu", "minimum":2, "maximum":%d}, "resource_limits":{"name":"memory", "minimum":0, "maximum":10000000}`, coreCount)))
- // Create a pod reserving 1.1x a node's allocatable memory. A bigger node will have to be created.
- cleanupFunc := ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, time.Second)
- defer cleanupFunc()
- By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String()))
- time.Sleep(scaleUpTimeout)
- // Verify that cluster size is not changed
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount }, time.Second))
- By("Change resource limits")
- framework.ExpectNoError(enableAutoprovisioning(fmt.Sprintf(`"resource_limits":{"name":"cpu", "minimum":2, "maximum":%d}, "resource_limits":{"name":"memory", "minimum":0, "maximum":10000000}`, coreCount+5)))
- By("Wait for scale up")
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
- By("Check if NAP group was created")
- Expect(getNAPNodePoolsNumber()).Should(Equal(1))
- })
- It("should create new node if there is no node for node selector [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
- framework.SkipUnlessProviderIs("gke")
- framework.ExpectNoError(enableAutoprovisioning(""))
- // Create a pod reserving 0.7 of a node's allocatable memory, with a node selector that no existing node satisfies.
- cleanupFunc := ReserveMemoryWithSelector(f, "memory-reservation", 1, int(0.7*float64(memAllocatableMb)), true, scaleUpTimeout, map[string]string{"test": "test"})
- defer cleanupFunc()
- By("Waiting for scale up")
- // Verify that cluster size increased.
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount+1 }, defaultTimeout))
- By("Check if NAP group was created")
- Expect(getNAPNodePoolsNumber()).Should(Equal(1))
- })
- It("shouldn't scale up when expendable pod is created [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
- framework.SkipUnlessProviderIs("gce")
- defer createPriorityClasses(f)()
- // Create nodeCount+1 expendable pods, each reserving 0.7 of a node's allocatable memory, so one more node would be needed to fit them.
- cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), false, time.Second, expendablePriorityClassName)
- defer cleanupFunc()
- By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String()))
- time.Sleep(scaleUpTimeout)
- // Verify that cluster size is not changed
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount }, time.Second))
- })
- It("should scale up when non expendable pod is created [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
- framework.SkipUnlessProviderIs("gce")
- defer createPriorityClasses(f)()
- // Create nodeCount+1 pods, each reserving 0.7 of a node's allocatable memory. One more node will have to be created.
- cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, highPriorityClassName)
- defer cleanupFunc()
- // Verify that cluster size increased.
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size > nodeCount }, time.Second))
- })
- It("shouldn't scale up when expendable pod is preempted [Feature:ClusterSizeAutoscalingScaleUp]", func() {
- // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
- framework.SkipUnlessProviderIs("gce")
- defer createPriorityClasses(f)()
- // Create nodeCount expendable pods, each reserving 0.7 of a node's allocatable memory - one pod per node.
- cleanupFunc1 := ReserveMemoryWithPriority(f, "memory-reservation1", nodeCount, int(float64(nodeCount)*float64(0.7)*float64(memAllocatableMb)), true, defaultTimeout, expendablePriorityClassName)
- defer cleanupFunc1()
- // Create nodeCount high-priority pods with the same memory reservation - one pod per node. Pods created here should preempt the expendable pods created above.
- cleanupFunc2 := ReserveMemoryWithPriority(f, "memory-reservation2", nodeCount, int(float64(nodeCount)*float64(0.7)*float64(memAllocatableMb)), true, defaultTimeout, highPriorityClassName)
- defer cleanupFunc2()
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount }, time.Second))
- })
- It("should scale down when expendable pod is running [Feature:ClusterSizeAutoscalingScaleDown]", func() {
- // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
- framework.SkipUnlessProviderIs("gce")
- defer createPriorityClasses(f)()
- increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
- // Create increasedSize expendable pods, each reserving 0.7 of a node's allocatable memory - one pod per node.
- cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", increasedSize, int(float64(increasedSize)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, expendablePriorityClassName)
- defer cleanupFunc()
- By("Waiting for scale down")
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == nodeCount }, scaleDownTimeout))
- })
- It("shouldn't scale down when non expendable pod is running [Feature:ClusterSizeAutoscalingScaleDown]", func() {
- // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
- framework.SkipUnlessProviderIs("gce")
- defer createPriorityClasses(f)()
- increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
- // Create increasedSize high-priority pods, each reserving 0.7 of a node's allocatable memory - one pod per node.
- cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", increasedSize, int(float64(increasedSize)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, highPriorityClassName)
- defer cleanupFunc()
- By(fmt.Sprintf("Waiting for scale down hoping it won't happen, sleep for %s", scaleDownTimeout.String()))
- time.Sleep(scaleDownTimeout)
- framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
- func(size int) bool { return size == increasedSize }, time.Second))
- })
- })
- func installNvidiaDriversDaemonSet() {
- By("Add daemonset which installs nvidia drivers")
- // The link differs from the one in the GKE documentation; as discussed with @mindprince, this one should be used.
- framework.RunKubectlOrDie("apply", "-f", "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml")
- }
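- // execCmd logs the command line and returns an exec.Cmd; callers invoke Output()/Run() themselves.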
- func execCmd(args ...string) *exec.Cmd {
- glog.Infof("Executing: %s", strings.Join(args, " "))
- return exec.Command(args[0], args[1:]...)
- }
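- // runDrainTest manually grows the cluster, runs podsPerNode replicated pods on every schedulable
- // node in the given namespace, guards them with a PodDisruptionBudget that tolerates at most
- // pdbSize disrupted pods, and finally calls verifyFunction with the increased cluster size.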
- func runDrainTest(f *framework.Framework, migSizes map[string]int, namespace string, podsPerNode, pdbSize int, verifyFunction func(int)) {
- increasedSize := manuallyIncreaseClusterSize(f, migSizes)
- nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
- "spec.unschedulable": "false",
- }.AsSelector().String()})
- framework.ExpectNoError(err)
- numPods := len(nodes.Items) * podsPerNode
- testID := string(uuid.NewUUID()) // So that we can label and find pods
- labelMap := map[string]string{"test_id": testID}
- framework.ExpectNoError(runReplicatedPodOnEachNode(f, nodes.Items, namespace, podsPerNode, "reschedulable-pods", labelMap, 0))
- defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, namespace, "reschedulable-pods")
- By("Create a PodDisruptionBudget")
- minAvailable := intstr.FromInt(numPods - pdbSize)
- pdb := &policy.PodDisruptionBudget{
- ObjectMeta: metav1.ObjectMeta{
- Name: "test_pdb",
- Namespace: namespace,
- },
- Spec: policy.PodDisruptionBudgetSpec{
- Selector: &metav1.LabelSelector{MatchLabels: labelMap},
- MinAvailable: &minAvailable,
- },
- }
- _, err = f.ClientSet.PolicyV1beta1().PodDisruptionBudgets(namespace).Create(pdb)
- defer func() {
- f.ClientSet.PolicyV1beta1().PodDisruptionBudgets(namespace).Delete(pdb.Name, &metav1.DeleteOptions{})
- }()
- framework.ExpectNoError(err)
- verifyFunction(increasedSize)
- }
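- // getGKEURL builds a GKE API URL for the given API version and resource suffix,
- // appending a fresh OAuth access token obtained from `gcloud auth print-access-token`.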
- func getGKEURL(apiVersion string, suffix string) string {
- out, err := execCmd("gcloud", "auth", "print-access-token").Output()
- framework.ExpectNoError(err)
- token := strings.Replace(string(out), "\n", "", -1)
- return fmt.Sprintf("%s/%s/%s?access_token=%s",
- gkeEndpoint,
- apiVersion,
- suffix,
- token)
- }
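- // getGKEClusterURL returns the API URL of the cluster under test, using the locations
- // endpoint for regional clusters and the zones endpoint otherwise.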
- func getGKEClusterURL(apiVersion string) string {
- if isRegionalCluster() {
- // TODO(bskiba): Use locations API for all clusters once it's graduated to v1.
- return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/locations/%s/clusters/%s",
- framework.TestContext.CloudConfig.ProjectID,
- framework.TestContext.CloudConfig.Region,
- framework.TestContext.CloudConfig.Cluster))
- } else {
- return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/zones/%s/clusters/%s",
- framework.TestContext.CloudConfig.ProjectID,
- framework.TestContext.CloudConfig.Zone,
- framework.TestContext.CloudConfig.Cluster))
- }
- }
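- // getCluster fetches the cluster description over HTTP and returns the raw body,
- // failing on any non-200 response.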
- func getCluster(apiVersion string) (string, error) {
- resp, err := http.Get(getGKEClusterURL(apiVersion))
- if err != nil {
- return "", err
- }
- defer resp.Body.Close()
- body, err := ioutil.ReadAll(resp.Body)
- if err != nil {
- return "", err
- }
- if resp.StatusCode != http.StatusOK {
- return "", fmt.Errorf("error: %s %s", resp.Status, body)
- }
- return string(body), nil
- }
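- // isAutoscalerEnabled reports whether the cluster description advertises the expected
- // maxNodeCount for the target pool, which is how these tests detect that autoscaling is on.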
- func isAutoscalerEnabled(expectedMaxNodeCountInTargetPool int) (bool, error) {
- apiVersion := "v1"
- if isRegionalCluster() {
- apiVersion = "v1beta1"
- }
- strBody, err := getCluster(apiVersion)
- if err != nil {
- return false, err
- }
- if strings.Contains(strBody, "\"maxNodeCount\": "+strconv.Itoa(expectedMaxNodeCountInTargetPool)) {
- return true, nil
- }
- return false, nil
- }
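- // getClusterLocation returns the --region or --zone flag matching the cluster's topology.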
- func getClusterLocation() string {
- if isRegionalCluster() {
- return "--region=" + framework.TestContext.CloudConfig.Region
- } else {
- return "--zone=" + framework.TestContext.CloudConfig.Zone
- }
- }
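- // getGcloudCommandFromTrack prefixes the gcloud invocation with the beta/alpha release
- // track when requested and appends the cluster location and project flags.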
- func getGcloudCommandFromTrack(commandTrack string, args []string) []string {
- command := []string{"gcloud"}
- if commandTrack == "beta" || commandTrack == "alpha" {
- command = append(command, commandTrack)
- }
- command = append(command, args...)
- command = append(command, getClusterLocation())
- command = append(command, "--project="+framework.TestContext.CloudConfig.ProjectID)
- return command
- }
- func getGcloudCommand(args []string) []string {
- track := ""
- if isRegionalCluster() {
- track = "beta"
- }
- return getGcloudCommandFromTrack(track, args)
- }
- func isRegionalCluster() bool {
- // TODO(bskiba): Use an appropri