
/test/e2e/autoscaling/cluster_size_autoscaling.go

https://gitlab.com/unofficial-mirrors/kubernetes
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package autoscaling
  14. import (
  15. "bytes"
  16. "fmt"
  17. "io/ioutil"
  18. "math"
  19. "net/http"
  20. "os/exec"
  21. "regexp"
  22. "strconv"
  23. "strings"
  24. "time"
  25. "k8s.io/api/core/v1"
  26. policy "k8s.io/api/policy/v1beta1"
  27. "k8s.io/api/scheduling/v1alpha1"
  28. "k8s.io/apimachinery/pkg/api/errors"
  29. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  30. "k8s.io/apimachinery/pkg/fields"
  31. "k8s.io/apimachinery/pkg/labels"
  32. utilerrors "k8s.io/apimachinery/pkg/util/errors"
  33. "k8s.io/apimachinery/pkg/util/intstr"
  34. "k8s.io/apimachinery/pkg/util/sets"
  35. "k8s.io/apimachinery/pkg/util/uuid"
  36. "k8s.io/apimachinery/pkg/util/wait"
  37. clientset "k8s.io/client-go/kubernetes"
  38. api "k8s.io/kubernetes/pkg/apis/core"
  39. "k8s.io/kubernetes/test/e2e/framework"
  40. "k8s.io/kubernetes/test/e2e/scheduling"
  41. testutils "k8s.io/kubernetes/test/utils"
  42. imageutils "k8s.io/kubernetes/test/utils/image"
  43. "github.com/golang/glog"
  44. . "github.com/onsi/ginkgo"
  45. . "github.com/onsi/gomega"
  46. )
  47. const (
  48. defaultTimeout = 3 * time.Minute
  49. resizeTimeout = 5 * time.Minute
  50. manualResizeTimeout = 6 * time.Minute
  51. scaleUpTimeout = 5 * time.Minute
  52. scaleUpTriggerTimeout = 2 * time.Minute
  53. scaleDownTimeout = 20 * time.Minute
  54. podTimeout = 2 * time.Minute
  55. nodesRecoverTimeout = 5 * time.Minute
  56. rcCreationRetryTimeout = 4 * time.Minute
  57. rcCreationRetryDelay = 20 * time.Second
  58. makeSchedulableTimeout = 10 * time.Minute
  59. makeSchedulableDelay = 20 * time.Second
  60. freshStatusLimit = 20 * time.Second
  61. gkeEndpoint = "https://test-container.sandbox.googleapis.com"
  62. gkeUpdateTimeout = 15 * time.Minute
  63. gkeNodepoolNameKey = "cloud.google.com/gke-nodepool"
  64. disabledTaint = "DisabledForAutoscalingTest"
  65. criticalAddonsOnlyTaint = "CriticalAddonsOnly"
  66. newNodesForScaledownTests = 2
  67. unhealthyClusterThreshold = 4
  68. caNoScaleUpStatus = "NoActivity"
  69. caOngoingScaleUpStatus = "InProgress"
  70. timestampFormat = "2006-01-02 15:04:05 -0700 MST"
  71. expendablePriorityClassName = "expendable-priority"
  72. highPriorityClassName = "high-priority"
  73. )
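// The suite below exercises the cluster autoscaler end to end on GCE/GKE: tests reserve memory, GPUs, host ports,
// volumes or anti-affinity to force scale-ups, and drain or idle nodes to force scale-downs.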
  74. var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
  75. f := framework.NewDefaultFramework("autoscaling")
  76. var c clientset.Interface
  77. var nodeCount int
  78. var coreCount int64
  79. var memAllocatableMb int
  80. var originalSizes map[string]int
  81. BeforeEach(func() {
  82. c = f.ClientSet
  83. framework.SkipUnlessProviderIs("gce", "gke")
  84. originalSizes = make(map[string]int)
  85. sum := 0
  86. for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
  87. size, err := framework.GroupSize(mig)
  88. framework.ExpectNoError(err)
  89. By(fmt.Sprintf("Initial size of %s: %d", mig, size))
  90. originalSizes[mig] = size
  91. sum += size
  92. }
  93. // Give instances time to spin up
  94. framework.ExpectNoError(framework.WaitForReadyNodes(c, sum, scaleUpTimeout))
  95. nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
  96. nodeCount = len(nodes.Items)
  97. coreCount = 0
  98. for _, node := range nodes.Items {
  99. quantity := node.Status.Capacity[v1.ResourceCPU]
  100. coreCount += quantity.Value()
  101. }
  102. By(fmt.Sprintf("Initial number of schedulable nodes: %v", nodeCount))
  103. Expect(nodeCount).NotTo(BeZero())
  104. mem := nodes.Items[0].Status.Allocatable[v1.ResourceMemory]
  105. memAllocatableMb = int((&mem).Value() / 1024 / 1024)
  106. Expect(nodeCount).Should(Equal(sum))
  107. if framework.ProviderIs("gke") {
  108. val, err := isAutoscalerEnabled(5)
  109. framework.ExpectNoError(err)
  110. if !val {
  111. err = enableAutoscaler("default-pool", 3, 5)
  112. framework.ExpectNoError(err)
  113. }
  114. Expect(getNAPNodePoolsNumber()).Should(Equal(0))
  115. }
  116. })
  117. AfterEach(func() {
  118. if framework.ProviderIs("gke") {
  119. By("Remove changes introduced by NAP tests")
  120. removeNAPNodePools()
  121. disableAutoprovisioning()
  122. }
  123. By("Restoring initial size of the cluster")
  124. setMigSizes(originalSizes)
  125. expectedNodes := 0
  126. for _, size := range originalSizes {
  127. expectedNodes += size
  128. }
  129. framework.ExpectNoError(framework.WaitForReadyNodes(c, expectedNodes, scaleDownTimeout))
  130. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
  131. framework.ExpectNoError(err)
  132. s := time.Now()
  133. makeSchedulableLoop:
  134. for start := time.Now(); time.Since(start) < makeSchedulableTimeout; time.Sleep(makeSchedulableDelay) {
  135. for _, n := range nodes.Items {
  136. err = makeNodeSchedulable(c, &n, true)
  137. switch err.(type) {
  138. case CriticalAddonsOnlyError:
  139. continue makeSchedulableLoop
  140. default:
  141. framework.ExpectNoError(err)
  142. }
  143. }
  144. break
  145. }
  146. glog.Infof("Made nodes schedulable again in %v", time.Since(s).String())
  147. })
  148. It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  149. By("Creating unschedulable pod")
  150. ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, defaultTimeout)
  151. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
  152. By("Waiting for scale up hoping it won't happen")
  153. // Verify that the appropriate event was generated
  154. eventFound := false
  155. EventsLoop:
  156. for start := time.Now(); time.Since(start) < scaleUpTimeout; time.Sleep(20 * time.Second) {
  157. By("Waiting for NotTriggerScaleUp event")
  158. events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(metav1.ListOptions{})
  159. framework.ExpectNoError(err)
  160. for _, e := range events.Items {
  161. if e.InvolvedObject.Kind == "Pod" && e.Reason == "NotTriggerScaleUp" && strings.Contains(e.Message, "it wouldn't fit if a new node is added") {
  162. By("NotTriggerScaleUp event found")
  163. eventFound = true
  164. break EventsLoop
  165. }
  166. }
  167. }
  168. Expect(eventFound).Should(Equal(true))
  169. // Verify that cluster size is not changed
  170. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  171. func(size int) bool { return size <= nodeCount }, time.Second))
  172. })
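// simpleScaleUpTest reserves the cluster's total allocatable memory across 100 pods, which cannot fit on the
// existing nodes, and expects the cluster to grow by at least one node while tolerating the given number of
// unready nodes.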
  173. simpleScaleUpTest := func(unready int) {
  174. ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
  175. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
  176. // Verify that cluster size is increased
  177. framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
  178. func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout, unready))
  179. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  180. }
  181. It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
  182. func() { simpleScaleUpTest(0) })
  183. It("Should scale up GPU pool from 0 [Feature:ClusterSizeAutoscalingGpu]", func() {
  184. framework.SkipUnlessProviderIs("gke")
  185. const gpuPoolName = "gpu-pool"
  186. addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
  187. defer deleteNodePool(gpuPoolName)
  188. installNvidiaDriversDaemonSet()
  189. By("Enable autoscaler")
  190. framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
  191. defer disableAutoscaler(gpuPoolName, 0, 1)
  192. Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
  193. By("Schedule a pod which requires GPU")
  194. framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
  195. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  196. func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
  197. Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
  198. })
  199. It("Should scale up GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
  200. framework.SkipUnlessProviderIs("gke")
  201. const gpuPoolName = "gpu-pool"
  202. addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
  203. defer deleteNodePool(gpuPoolName)
  204. installNvidiaDriversDaemonSet()
  205. By("Schedule a single pod which requires GPU")
  206. framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
  207. By("Enable autoscaler")
  208. framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
  209. defer disableAutoscaler(gpuPoolName, 0, 2)
  210. Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
  211. framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false)
  212. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  213. func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
  214. Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
  215. })
  216. It("Should not scale GPU pool up if pod does not require GPUs [Feature:ClusterSizeAutoscalingGpu]", func() {
  217. framework.SkipUnlessProviderIs("gke")
  218. const gpuPoolName = "gpu-pool"
  219. addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
  220. defer deleteNodePool(gpuPoolName)
  221. installNvidiaDriversDaemonSet()
  222. By("Enable autoscaler")
  223. framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
  224. defer disableAutoscaler(gpuPoolName, 0, 1)
  225. Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
  226. By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
  227. ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
  228. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
  229. // Verify that cluster size is increased
  230. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  231. func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
  232. // Expect gpu pool to stay intact
  233. Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
  234. })
  235. It("Should scale down GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
  236. framework.SkipUnlessProviderIs("gke")
  237. const gpuPoolName = "gpu-pool"
  238. addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
  239. defer deleteNodePool(gpuPoolName)
  240. installNvidiaDriversDaemonSet()
  241. By("Schedule a single pod which requires GPU")
  242. framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
  243. By("Enable autoscaler")
  244. framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
  245. defer disableAutoscaler(gpuPoolName, 0, 1)
  246. Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
  247. framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc")
  248. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  249. func(size int) bool { return size == nodeCount }, scaleDownTimeout))
  250. Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
  251. })
  252. It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
  253. func() {
  254. framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleUpTest(1) })
  255. })
  256. It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  257. // Wait for the situation to stabilize - CA should be running and have up-to-date node readiness info.
  258. status, err := waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
  259. return s.ready == s.target && s.ready <= nodeCount
  260. }, scaleUpTriggerTimeout)
  261. framework.ExpectNoError(err)
  262. unmanagedNodes := nodeCount - status.ready
  263. By("Schedule more pods than can fit and wait for cluster to scale-up")
  264. ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
  265. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
  266. status, err = waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
  267. return s.status == caOngoingScaleUpStatus
  268. }, scaleUpTriggerTimeout)
  269. framework.ExpectNoError(err)
  270. target := status.target
  271. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  272. By("Expect no more scale-up to be happening after all pods are scheduled")
  273. status, err = getScaleUpStatus(c)
  274. framework.ExpectNoError(err)
  275. if status.target != target {
  276. glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
  277. }
  278. Expect(status.timestamp.Add(freshStatusLimit).Before(time.Now())).Should(Equal(false))
  279. Expect(status.status).Should(Equal(caNoScaleUpStatus))
  280. Expect(status.ready).Should(Equal(status.target))
  281. Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
  282. })
  283. It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  284. framework.SkipUnlessProviderIs("gke")
  285. By("Creating new node-pool with n1-standard-4 machines")
  286. const extraPoolName = "extra-pool"
  287. addNodePool(extraPoolName, "n1-standard-4", 1)
  288. defer deleteNodePool(extraPoolName)
  289. extraNodes := getPoolInitialSize(extraPoolName)
  290. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
  291. glog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).")
  292. By("Getting memory available on new nodes, so we can account for it when creating RC")
  293. nodes := getPoolNodes(f, extraPoolName)
  294. Expect(len(nodes)).Should(Equal(extraNodes))
  295. extraMemMb := 0
  296. for _, node := range nodes {
  297. mem := node.Status.Capacity[v1.ResourceMemory]
  298. extraMemMb += int((&mem).Value() / 1024 / 1024)
  299. }
  300. By("Reserving 0.1x more memory than the cluster holds to trigger scale up")
  301. totalMemoryReservation := int(1.1 * float64(nodeCount*memAllocatableMb+extraMemMb))
  302. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
  303. ReserveMemory(f, "memory-reservation", 100, totalMemoryReservation, false, defaultTimeout)
  304. // Verify that cluster size is increased
  305. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  306. func(size int) bool { return size >= nodeCount+extraNodes+1 }, scaleUpTimeout))
  307. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  308. })
  309. It("should disable node pool autoscaling [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  310. framework.SkipUnlessProviderIs("gke")
  311. By("Creating new node-pool with n1-standard-4 machines")
  312. const extraPoolName = "extra-pool"
  313. addNodePool(extraPoolName, "n1-standard-4", 1)
  314. defer deleteNodePool(extraPoolName)
  315. extraNodes := getPoolInitialSize(extraPoolName)
  316. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
  317. framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
  318. framework.ExpectNoError(disableAutoscaler(extraPoolName, 1, 2))
  319. })
  320. It("should increase cluster size if pods are pending due to host port conflict [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  321. scheduling.CreateHostPortPods(f, "host-port", nodeCount+2, false)
  322. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "host-port")
  323. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  324. func(size int) bool { return size >= nodeCount+2 }, scaleUpTimeout))
  325. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  326. })
  327. It("should increase cluster size if pods are pending due to pod anti-affinity [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  328. pods := nodeCount
  329. newPods := 2
  330. labels := map[string]string{
  331. "anti-affinity": "yes",
  332. }
  333. By("starting a pod with anti-affinity on each node")
  334. framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
  335. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "some-pod")
  336. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  337. By("scheduling extra pods with anti-affinity to existing ones")
  338. framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, newPods, "extra-pod", labels, labels))
  339. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "extra-pod")
  340. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  341. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
  342. })
  343. It("should increase cluster size if pod requesting EmptyDir volume is pending [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  344. By("creating pods")
  345. pods := nodeCount
  346. newPods := 1
  347. labels := map[string]string{
  348. "anti-affinity": "yes",
  349. }
  350. framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
  351. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "some-pod")
  352. By("waiting for all pods before triggering scale up")
  353. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  354. By("creating a pod requesting EmptyDir")
  355. framework.ExpectNoError(runVolumeAntiAffinityPods(f, f.Namespace.Name, newPods, "extra-pod", labels, labels, emptyDirVolumes))
  356. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "extra-pod")
  357. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  358. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
  359. })
  360. It("should increase cluster size if pod requesting volume is pending [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  361. framework.SkipUnlessProviderIs("gce", "gke")
  362. volumeLabels := labels.Set{
  363. framework.VolumeSelectorKey: f.Namespace.Name,
  364. }
  365. selector := metav1.SetAsLabelSelector(volumeLabels)
  366. By("creating volume & pvc")
  367. diskName, err := framework.CreatePDWithRetry()
  368. framework.ExpectNoError(err)
  369. pvConfig := framework.PersistentVolumeConfig{
  370. NamePrefix: "gce-",
  371. Labels: volumeLabels,
  372. PVSource: v1.PersistentVolumeSource{
  373. GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{
  374. PDName: diskName,
  375. FSType: "ext3",
  376. ReadOnly: false,
  377. },
  378. },
  379. Prebind: nil,
  380. }
  381. emptyStorageClass := ""
  382. pvcConfig := framework.PersistentVolumeClaimConfig{
  383. Selector: selector,
  384. StorageClassName: &emptyStorageClass,
  385. }
  386. pv, pvc, err := framework.CreatePVPVC(c, pvConfig, pvcConfig, f.Namespace.Name, false)
  387. framework.ExpectNoError(err)
  388. framework.ExpectNoError(framework.WaitOnPVandPVC(c, f.Namespace.Name, pv, pvc))
  389. defer func() {
  390. errs := framework.PVPVCCleanup(c, f.Namespace.Name, pv, pvc)
  391. if len(errs) > 0 {
  392. framework.Failf("failed to delete PVC and/or PV. Errors: %v", utilerrors.NewAggregate(errs))
  393. }
  394. pv, pvc = nil, nil
  395. if diskName != "" {
  396. framework.ExpectNoError(framework.DeletePDWithRetry(diskName))
  397. }
  398. }()
  399. By("creating pods")
  400. pods := nodeCount
  401. labels := map[string]string{
  402. "anti-affinity": "yes",
  403. }
  404. framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
  405. defer func() {
  406. framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "some-pod")
  407. glog.Infof("RC and pods not using volume deleted")
  408. }()
  409. By("waiting for all pods before triggering scale up")
  410. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  411. By("creating a pod requesting PVC")
  412. pvcPodName := "pvc-pod"
  413. newPods := 1
  414. volumes := buildVolumes(pv, pvc)
  415. framework.ExpectNoError(runVolumeAntiAffinityPods(f, f.Namespace.Name, newPods, pvcPodName, labels, labels, volumes))
  416. defer func() {
  417. framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, pvcPodName)
  418. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  419. }()
  420. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  421. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
  422. })
  423. It("should add node to the particular mig [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  424. labelKey := "cluster-autoscaling-test.special-node"
  425. labelValue := "true"
  426. By("Finding the smallest MIG")
  427. minMig := ""
  428. minSize := nodeCount
  429. for mig, size := range originalSizes {
  430. if size <= minSize {
  431. minMig = mig
  432. minSize = size
  433. }
  434. }
  435. if minSize == 0 {
  436. newSizes := make(map[string]int)
  437. for mig, size := range originalSizes {
  438. newSizes[mig] = size
  439. }
  440. newSizes[minMig] = 1
  441. setMigSizes(newSizes)
  442. }
  443. removeLabels := func(nodesToClean sets.String) {
  444. By("Removing labels from nodes")
  445. for node := range nodesToClean {
  446. framework.RemoveLabelOffNode(c, node, labelKey)
  447. }
  448. }
  449. nodes, err := framework.GetGroupNodes(minMig)
  450. framework.ExpectNoError(err)
  451. nodesSet := sets.NewString(nodes...)
  452. defer removeLabels(nodesSet)
  453. By(fmt.Sprintf("Annotating nodes of the smallest MIG(%s): %v", minMig, nodes))
  454. for node := range nodesSet {
  455. framework.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue)
  456. }
  457. scheduling.CreateNodeSelectorPods(f, "node-selector", minSize+1, map[string]string{labelKey: labelValue}, false)
  458. By("Waiting for new node to appear and annotating it")
  459. framework.WaitForGroupSize(minMig, int32(minSize+1))
  460. // Verify that cluster size is increased
  461. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  462. func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
  463. newNodes, err := framework.GetGroupNodes(minMig)
  464. framework.ExpectNoError(err)
  465. newNodesSet := sets.NewString(newNodes...)
  466. newNodesSet.Delete(nodes...)
  467. if len(newNodesSet) > 1 {
  468. By(fmt.Sprintf("Spotted following new nodes in %s: %v", minMig, newNodesSet))
  469. glog.Infof("Usually only 1 new node is expected, investigating")
  470. glog.Infof("Kubectl:%s\n", framework.RunKubectlOrDie("get", "nodes", "-o", "json"))
  471. if output, err := exec.Command("gcloud", "compute", "instances", "list",
  472. "--project="+framework.TestContext.CloudConfig.ProjectID,
  473. "--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil {
  474. glog.Infof("Gcloud compute instances list: %s", output)
  475. } else {
  476. glog.Errorf("Failed to get instances list: %v", err)
  477. }
  478. for newNode := range newNodesSet {
  479. if output, err := execCmd("gcloud", "compute", "instances", "describe",
  480. newNode,
  481. "--project="+framework.TestContext.CloudConfig.ProjectID,
  482. "--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil {
  483. glog.Infof("Gcloud compute instances describe: %s", output)
  484. } else {
  485. glog.Errorf("Failed to get instances describe: %v", err)
  486. }
  487. }
  488. // TODO: possibly remove broken node from newNodesSet to prevent removeLabel from crashing.
  489. // However at this moment we DO WANT it to crash so that we don't check all test runs for the
  490. // rare behavior, but only the broken ones.
  491. }
  492. By(fmt.Sprintf("New nodes: %v\n", newNodesSet))
  493. registeredNodes := sets.NewString()
  494. for nodeName := range newNodesSet {
  495. node, err := f.ClientSet.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
  496. if err == nil && node != nil {
  497. registeredNodes.Insert(nodeName)
  498. } else {
  499. glog.Errorf("Failed to get node %v: %v", nodeName, err)
  500. }
  501. }
  502. By(fmt.Sprintf("Setting labels for registered new nodes: %v", registeredNodes.List()))
  503. for node := range registeredNodes {
  504. framework.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue)
  505. }
  506. defer removeLabels(registeredNodes)
  507. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  508. framework.ExpectNoError(framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "node-selector"))
  509. })
  510. It("should scale up correct target pool [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  511. framework.SkipUnlessProviderIs("gke")
  512. By("Creating new node-pool with n1-standard-4 machines")
  513. const extraPoolName = "extra-pool"
  514. addNodePool(extraPoolName, "n1-standard-4", 1)
  515. defer deleteNodePool(extraPoolName)
  516. extraNodes := getPoolInitialSize(extraPoolName)
  517. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
  518. framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
  519. defer disableAutoscaler(extraPoolName, 1, 2)
  520. extraPods := extraNodes + 1
  521. totalMemoryReservation := int(float64(extraPods) * 1.5 * float64(memAllocatableMb))
  522. By(fmt.Sprintf("Creating rc with %v pods too big to fit default-pool but fitting extra-pool", extraPods))
  523. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
  524. ReserveMemory(f, "memory-reservation", extraPods, totalMemoryReservation, false, defaultTimeout)
  525. // Apparently the GKE master is restarted a couple of minutes after the node pool is added,
  526. // resetting all the timers in the scale-down code. Adding 5 extra minutes to work around
  527. // this issue.
  528. // TODO: Remove the extra time when GKE restart is fixed.
  529. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes+1, scaleUpTimeout+5*time.Minute))
  530. })
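// simpleScaleDownTest adds PodDisruptionBudgets for kube-system pods, manually grows every MIG by 2+unready
// nodes, and then waits for the autoscaler to remove at least one of the now-unneeded nodes.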
  531. simpleScaleDownTest := func(unready int) {
  532. cleanup, err := addKubeSystemPdbs(f)
  533. defer cleanup()
  534. framework.ExpectNoError(err)
  535. By("Manually increase cluster size")
  536. increasedSize := 0
  537. newSizes := make(map[string]int)
  538. for key, val := range originalSizes {
  539. newSizes[key] = val + 2 + unready
  540. increasedSize += val + 2 + unready
  541. }
  542. setMigSizes(newSizes)
  543. framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
  544. func(size int) bool { return size >= increasedSize }, manualResizeTimeout, unready))
  545. By("Some node should be removed")
  546. framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
  547. func(size int) bool { return size < increasedSize }, scaleDownTimeout, unready))
  548. }
  549. It("should correctly scale down after a node is not needed [Feature:ClusterSizeAutoscalingScaleDown]",
  550. func() { simpleScaleDownTest(0) })
  551. It("should correctly scale down after a node is not needed and one node is broken [Feature:ClusterSizeAutoscalingScaleDown]",
  552. func() {
  553. framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleDownTest(1) })
  554. })
  555. It("should correctly scale down after a node is not needed when there is non autoscaled pool[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  556. framework.SkipUnlessProviderIs("gke")
  557. increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
  558. const extraPoolName = "extra-pool"
  559. addNodePool(extraPoolName, "n1-standard-1", 3)
  560. defer deleteNodePool(extraPoolName)
  561. extraNodes := getPoolInitialSize(extraPoolName)
  562. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  563. func(size int) bool { return size >= increasedSize+extraNodes }, scaleUpTimeout))
  564. By("Some node should be removed")
  565. // Apparently the GKE master is restarted a couple of minutes after the node pool is added,
  566. // resetting all the timers in the scale-down code. Adding 10 extra minutes to work around
  567. // this issue.
  568. // TODO: Remove the extra time when GKE restart is fixed.
  569. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  570. func(size int) bool { return size < increasedSize+extraNodes }, scaleDownTimeout+10*time.Minute))
  571. })
  572. It("should be able to scale down when rescheduling a pod is required and pdb allows for it[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  573. runDrainTest(f, originalSizes, f.Namespace.Name, 1, 1, func(increasedSize int) {
  574. By("Some node should be removed")
  575. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  576. func(size int) bool { return size < increasedSize }, scaleDownTimeout))
  577. })
  578. })
  579. It("shouldn't be able to scale down when rescheduling a pod is required, but pdb doesn't allow drain[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  580. runDrainTest(f, originalSizes, f.Namespace.Name, 1, 0, func(increasedSize int) {
  581. By("No nodes should be removed")
  582. time.Sleep(scaleDownTimeout)
  583. nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
  584. Expect(len(nodes.Items)).Should(Equal(increasedSize))
  585. })
  586. })
  587. It("should be able to scale down by draining multiple pods one by one as dictated by pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  588. runDrainTest(f, originalSizes, f.Namespace.Name, 2, 1, func(increasedSize int) {
  589. By("Some node should be removed")
  590. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  591. func(size int) bool { return size < increasedSize }, scaleDownTimeout))
  592. })
  593. })
  594. It("should be able to scale down by draining system pods with pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  595. runDrainTest(f, originalSizes, "kube-system", 2, 1, func(increasedSize int) {
  596. By("Some node should be removed")
  597. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  598. func(size int) bool { return size < increasedSize }, scaleDownTimeout))
  599. })
  600. })
  601. It("Should be able to scale a node group up from 0[Feature:ClusterSizeAutoscalingScaleUp]", func() {
  602. // Provider-specific setup
  603. if framework.ProviderIs("gke") {
  604. // GKE-specific setup
  605. By("Add a new node pool with 0 nodes and min size 0")
  606. const extraPoolName = "extra-pool"
  607. addNodePool(extraPoolName, "n1-standard-4", 0)
  608. defer deleteNodePool(extraPoolName)
  609. framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1))
  610. defer disableAutoscaler(extraPoolName, 0, 1)
  611. } else {
  612. // on GCE, run only if there are already at least 2 node groups
  613. framework.SkipUnlessAtLeast(len(originalSizes), 2, "At least 2 node groups are needed for scale-to-0 tests")
  614. By("Manually scale smallest node group to 0")
  615. minMig := ""
  616. minSize := nodeCount
  617. for mig, size := range originalSizes {
  618. if size <= minSize {
  619. minMig = mig
  620. minSize = size
  621. }
  622. }
  623. framework.ExpectNoError(framework.ResizeGroup(minMig, int32(0)))
  624. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount-minSize, resizeTimeout))
  625. }
  626. By("Make remaining nodes unschedulable")
  627. nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  628. "spec.unschedulable": "false",
  629. }.AsSelector().String()})
  630. framework.ExpectNoError(err)
  631. for _, node := range nodes.Items {
  632. err = makeNodeUnschedulable(f.ClientSet, &node)
  633. defer func(n v1.Node) {
  634. makeNodeSchedulable(f.ClientSet, &n, false)
  635. }(node)
  636. framework.ExpectNoError(err)
  637. }
  638. By("Run a scale-up test")
  639. ReserveMemory(f, "memory-reservation", 1, 100, false, 1*time.Second)
  640. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
  641. // Verify that cluster size is increased
  642. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  643. func(size int) bool { return size >= len(nodes.Items)+1 }, scaleUpTimeout))
  644. framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
  645. })
  646. // The scale-to-0 test is split into two functions (one for GKE, one for GCE).
  647. // The reason is that the scenario is exactly the same,
  648. // but setup & verification use different APIs.
  649. //
  650. // Scenario:
  651. // (GKE only) add an extra node pool with size 1 & enable autoscaling for it
  652. // (GCE only) find the smallest MIG & resize it to 1
  653. // manually drain the single node from this node pool/MIG
  654. // wait for cluster size to decrease
  655. // verify the targeted node pool/MIG is of size 0
  656. gkeScaleToZero := func() {
  657. // GKE-specific setup
  658. By("Add a new node pool with size 1 and min size 0")
  659. const extraPoolName = "extra-pool"
  660. addNodePool(extraPoolName, "n1-standard-4", 1)
  661. defer deleteNodePool(extraPoolName)
  662. extraNodes := getPoolInitialSize(extraPoolName)
  663. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+extraNodes, resizeTimeout))
  664. framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1))
  665. defer disableAutoscaler(extraPoolName, 0, 1)
  666. ngNodes := getPoolNodes(f, extraPoolName)
  667. Expect(len(ngNodes)).To(Equal(extraNodes))
  668. for _, node := range ngNodes {
  669. By(fmt.Sprintf("Target node for scale-down: %s", node.Name))
  670. }
  671. for _, node := range ngNodes {
  672. drainNode(f, node)
  673. }
  674. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  675. func(size int) bool { return size <= nodeCount }, scaleDownTimeout))
  676. // GKE-specific check
  677. newSize := getPoolSize(f, extraPoolName)
  678. Expect(newSize).Should(Equal(0))
  679. }
  680. gceScaleToZero := func() {
  681. // non-GKE only
  682. By("Find smallest node group and manually scale it to a single node")
  683. minMig := ""
  684. minSize := nodeCount
  685. for mig, size := range originalSizes {
  686. if size <= minSize {
  687. minMig = mig
  688. minSize = size
  689. }
  690. }
  691. framework.ExpectNoError(framework.ResizeGroup(minMig, int32(1)))
  692. framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount-minSize+1, resizeTimeout))
  693. ngNodes, err := framework.GetGroupNodes(minMig)
  694. framework.ExpectNoError(err)
  695. Expect(len(ngNodes) == 1).To(BeTrue())
  696. node, err := f.ClientSet.CoreV1().Nodes().Get(ngNodes[0], metav1.GetOptions{})
  697. By(fmt.Sprintf("Target node for scale-down: %s", node.Name))
  698. framework.ExpectNoError(err)
  699. // this part is identical
  700. drainNode(f, node)
  701. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  702. func(size int) bool { return size < nodeCount-minSize+1 }, scaleDownTimeout))
  703. // non-GKE only
  704. newSize, err := framework.GroupSize(minMig)
  705. framework.ExpectNoError(err)
  706. Expect(newSize).Should(Equal(0))
  707. }
  708. It("Should be able to scale a node group down to 0[Feature:ClusterSizeAutoscalingScaleDown]", func() {
  709. if framework.ProviderIs("gke") { // In GKE, we can just add a node pool
  710. gkeScaleToZero()
  711. } else if len(originalSizes) >= 2 {
  712. gceScaleToZero()
  713. } else {
  714. framework.Skipf("At least 2 node groups are needed for scale-to-0 tests")
  715. }
  716. })
  717. It("Shouldn't perform scale up operation and should list unhealthy status if most of the cluster is broken[Feature:ClusterSizeAutoscalingScaleUp]", func() {
  718. clusterSize := nodeCount
  719. for clusterSize < unhealthyClusterThreshold+1 {
  720. clusterSize = manuallyIncreaseClusterSize(f, originalSizes)
  721. }
  722. By("Block network connectivity to some nodes to simulate unhealthy cluster")
  723. nodesToBreakCount := int(math.Floor(math.Max(float64(unhealthyClusterThreshold), 0.5*float64(clusterSize))))
  724. nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  725. "spec.unschedulable": "false",
  726. }.AsSelector().String()})
  727. framework.ExpectNoError(err)
  728. Expect(nodesToBreakCount <= len(nodes.Items)).To(BeTrue())
  729. nodesToBreak := nodes.Items[:nodesToBreakCount]
  730. // TestUnderTemporaryNetworkFailure only removes connectivity to a single node
  731. // and accepts a func() callback. The loop is expanded into a recursive call
  732. // to avoid duplicating TestUnderTemporaryNetworkFailure.
  733. var testFunction func()
  734. testFunction = func() {
  735. if len(nodesToBreak) > 0 {
  736. ntb := &nodesToBreak[0]
  737. nodesToBreak = nodesToBreak[1:]
  738. framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
  739. } else {
  740. ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, defaultTimeout)
  741. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "memory-reservation")
  742. time.Sleep(scaleUpTimeout)
  743. currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
  744. framework.Logf("Currently available nodes: %v, nodes available at the start of test: %v, disabled nodes: %v", len(currentNodes.Items), len(nodes.Items), nodesToBreakCount)
  745. Expect(len(currentNodes.Items)).Should(Equal(len(nodes.Items) - nodesToBreakCount))
  746. status, err := getClusterwideStatus(c)
  747. framework.Logf("Clusterwide status: %v", status)
  748. framework.ExpectNoError(err)
  749. Expect(status).Should(Equal("Unhealthy"))
  750. }
  751. }
  752. testFunction()
  753. // Give nodes time to recover from network failure
  754. framework.ExpectNoError(framework.WaitForReadyNodes(c, len(nodes.Items), nodesRecoverTimeout))
  755. })
  756. It("should add new node and new node pool on too big pod, scale down to 1 and scale down to 0 [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
  757. framework.SkipUnlessProviderIs("gke")
  758. framework.ExpectNoError(enableAutoprovisioning(""))
  759. By("Create first pod")
  760. cleanupFunc1 := ReserveMemory(f, "memory-reservation1", 1, int(1.1*float64(memAllocatableMb)), true, defaultTimeout)
  761. defer func() {
  762. if cleanupFunc1 != nil {
  763. cleanupFunc1()
  764. }
  765. }()
  766. By("Waiting for scale up")
  767. // Verify that cluster size increased.
  768. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  769. func(size int) bool { return size == nodeCount+1 }, defaultTimeout))
  770. By("Check if NAP group was created")
  771. Expect(getNAPNodePoolsNumber()).Should(Equal(1))
  772. By("Create second pod")
  773. cleanupFunc2 := ReserveMemory(f, "memory-reservation2", 1, int(1.1*float64(memAllocatableMb)), true, defaultTimeout)
  774. defer func() {
  775. if cleanupFunc2 != nil {
  776. cleanupFunc2()
  777. }
  778. }()
  779. By("Waiting for scale up")
  780. // Verify that cluster size increased.
  781. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  782. func(size int) bool { return size == nodeCount+2 }, defaultTimeout))
  783. By("Delete first pod")
  784. cleanupFunc1()
  785. cleanupFunc1 = nil
  786. By("Waiting for scale down to 1")
  787. // Verify that cluster size decreased.
  788. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  789. func(size int) bool { return size == nodeCount+1 }, scaleDownTimeout))
  790. By("Delete second pod")
  791. cleanupFunc2()
  792. cleanupFunc2 = nil
  793. By("Waiting for scale down to 0")
  794. // Verify that cluster size decreased.
  795. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  796. func(size int) bool { return size == nodeCount }, scaleDownTimeout))
  797. By("Waiting for NAP group remove")
  798. framework.ExpectNoError(waitTillAllNAPNodePoolsAreRemoved())
  799. By("Check if NAP group was removeed")
  800. Expect(getNAPNodePoolsNumber()).Should(Equal(0))
  801. })
  802. It("shouldn't add new node group if not needed [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
  803. framework.SkipUnlessProviderIs("gke")
  804. framework.ExpectNoError(enableAutoprovisioning(""))
  805. By("Create pods")
  806. // Create nodeCount+1 pods, each allocating 0.7 of a node's allocatable memory. One more node will have to be created.
  807. cleanupFunc := ReserveMemory(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout)
  808. defer cleanupFunc()
  809. By("Waiting for scale up")
  810. // Verify that cluster size increased.
  811. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  812. func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
  813. By("Check if NAP group was created hoping id didn't happen")
  814. Expect(getNAPNodePoolsNumber()).Should(Equal(0))
  815. })
  816. It("shouldn't scale up if cores limit too low, should scale up after limit is changed [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
  817. framework.SkipUnlessProviderIs("gke")
  818. By(fmt.Sprintf("Set core limit to %d", coreCount))
  819. framework.ExpectNoError(enableAutoprovisioning(fmt.Sprintf(`"resource_limits":{"name":"cpu", "minimum":2, "maximum":%d}, "resource_limits":{"name":"memory", "minimum":0, "maximum":10000000}`, coreCount)))
  820. // Create a pod allocating 1.1x a present node's allocatable memory. A bigger node will have to be created.
  821. cleanupFunc := ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, time.Second)
  822. defer cleanupFunc()
  823. By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String()))
  824. time.Sleep(scaleUpTimeout)
  825. // Verify that cluster size is not changed
  826. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  827. func(size int) bool { return size == nodeCount }, time.Second))
  828. By("Change resource limits")
  829. framework.ExpectNoError(enableAutoprovisioning(fmt.Sprintf(`"resource_limits":{"name":"cpu", "minimum":2, "maximum":%d}, "resource_limits":{"name":"memory", "minimum":0, "maximum":10000000}`, coreCount+5)))
  830. By("Wait for scale up")
  831. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  832. func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
  833. By("Check if NAP group was created")
  834. Expect(getNAPNodePoolsNumber()).Should(Equal(1))
  835. })
  836. It("should create new node if there is no node for node selector [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
  837. framework.SkipUnlessProviderIs("gke")
  838. framework.ExpectNoError(enableAutoprovisioning(""))
  839. // Create a pod with a node selector, allocating 0.7 of a node's allocatable memory.
  840. cleanupFunc := ReserveMemoryWithSelector(f, "memory-reservation", 1, int(0.7*float64(memAllocatableMb)), true, scaleUpTimeout, map[string]string{"test": "test"})
  841. defer cleanupFunc()
  842. By("Waiting for scale up")
  843. // Verify that cluster size increased.
  844. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  845. func(size int) bool { return size == nodeCount+1 }, defaultTimeout))
  846. By("Check if NAP group was created")
  847. Expect(getNAPNodePoolsNumber()).Should(Equal(1))
  848. })
  849. It("shouldn't scale up when expendable pod is created [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  850. // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
  851. framework.SkipUnlessProviderIs("gce")
  852. defer createPriorityClasses(f)()
  853. // Create nodeCount+1 pods, each allocating 0.7 of a node's allocatable memory. One more node would be needed to fit them all.
  854. cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), false, time.Second, expendablePriorityClassName)
  855. defer cleanupFunc()
  856. By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String()))
  857. time.Sleep(scaleUpTimeout)
  858. // Verify that cluster size is not changed
  859. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  860. func(size int) bool { return size == nodeCount }, time.Second))
  861. })
  862. It("should scale up when non expendable pod is created [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  863. // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
  864. framework.SkipUnlessProviderIs("gce")
  865. defer createPriorityClasses(f)()
  866. // Create nodeCount+1 pods, each allocating 0.7 of a node's allocatable memory. One more node will have to be created.
  867. cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, highPriorityClassName)
  868. defer cleanupFunc()
  869. // Verify that cluster size increased
  870. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  871. func(size int) bool { return size > nodeCount }, time.Second))
  872. })
  873. It("shouldn't scale up when expendable pod is preempted [Feature:ClusterSizeAutoscalingScaleUp]", func() {
  874. // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
  875. framework.SkipUnlessProviderIs("gce")
  876. defer createPriorityClasses(f)()
  877. // Create nodeCount pods, each allocating 0.7 of a node's allocatable memory - one pod per node.
  878. cleanupFunc1 := ReserveMemoryWithPriority(f, "memory-reservation1", nodeCount, int(float64(nodeCount)*float64(0.7)*float64(memAllocatableMb)), true, defaultTimeout, expendablePriorityClassName)
  879. defer cleanupFunc1()
  880. // Create nodeCount pods, each allocating 0.7 of a node's allocatable memory - one pod per node. Pods created here should preempt the pods created above.
  881. cleanupFunc2 := ReserveMemoryWithPriority(f, "memory-reservation2", nodeCount, int(float64(nodeCount)*float64(0.7)*float64(memAllocatableMb)), true, defaultTimeout, highPriorityClassName)
  882. defer cleanupFunc2()
  883. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  884. func(size int) bool { return size == nodeCount }, time.Second))
  885. })
  886. It("should scale down when expendable pod is running [Feature:ClusterSizeAutoscalingScaleDown]", func() {
  887. // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
  888. framework.SkipUnlessProviderIs("gce")
  889. defer createPriorityClasses(f)()
  890. increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
  891. // Create increasedSize pods allocating 0.7 allocatable on present nodes - one pod per node.
  892. cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", increasedSize, int(float64(increasedSize)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, expendablePriorityClassName)
  893. defer cleanupFunc()
  894. By("Waiting for scale down")
  895. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  896. func(size int) bool { return size == nodeCount }, scaleDownTimeout))
  897. })
  898. It("shouldn't scale down when non expendable pod is running [Feature:ClusterSizeAutoscalingScaleDown]", func() {
  899. // TODO(krzysztof_jastrzebski): Start running this test on GKE when Pod Priority and Preemption is in beta.
  900. framework.SkipUnlessProviderIs("gce")
  901. defer createPriorityClasses(f)()
  902. increasedSize := manuallyIncreaseClusterSize(f, originalSizes)
  903. // Create increasedSize pods allocating 0.7 allocatable on present nodes - one pod per node.
  904. cleanupFunc := ReserveMemoryWithPriority(f, "memory-reservation", increasedSize, int(float64(increasedSize)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout, highPriorityClassName)
  905. defer cleanupFunc()
  906. By(fmt.Sprintf("Waiting for scale down hoping it won't happen, sleep for %s", scaleDownTimeout.String()))
  907. time.Sleep(scaleDownTimeout)
  908. framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
  909. func(size int) bool { return size == increasedSize }, time.Second))
  910. })
  911. })
  912. func installNvidiaDriversDaemonSet() {
  913. By("Add daemonset which installs nvidia drivers")
  914. // the link differs from the one in the GKE documentation; per discussion with @mindprince, this one should be used
  915. framework.RunKubectlOrDie("apply", "-f", "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml")
  916. }
  917. func execCmd(args ...string) *exec.Cmd {
  918. glog.Infof("Executing: %s", strings.Join(args, " "))
  919. return exec.Command(args[0], args[1:]...)
  920. }
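// runDrainTest grows the cluster, runs podsPerNode replicated pods on every schedulable node, guards them with a
// PodDisruptionBudget that allows pdbSize disruptions, and then calls verifyFunction with the increased cluster size.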
  921. func runDrainTest(f *framework.Framework, migSizes map[string]int, namespace string, podsPerNode, pdbSize int, verifyFunction func(int)) {
  922. increasedSize := manuallyIncreaseClusterSize(f, migSizes)
  923. nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  924. "spec.unschedulable": "false",
  925. }.AsSelector().String()})
  926. framework.ExpectNoError(err)
  927. numPods := len(nodes.Items) * podsPerNode
  928. testID := string(uuid.NewUUID()) // So that we can label and find pods
  929. labelMap := map[string]string{"test_id": testID}
  930. framework.ExpectNoError(runReplicatedPodOnEachNode(f, nodes.Items, namespace, podsPerNode, "reschedulable-pods", labelMap, 0))
  931. defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, namespace, "reschedulable-pods")
  932. By("Create a PodDisruptionBudget")
  933. minAvailable := intstr.FromInt(numPods - pdbSize)
  934. pdb := &policy.PodDisruptionBudget{
  935. ObjectMeta: metav1.ObjectMeta{
  936. Name: "test_pdb",
  937. Namespace: namespace,
  938. },
  939. Spec: policy.PodDisruptionBudgetSpec{
  940. Selector: &metav1.LabelSelector{MatchLabels: labelMap},
  941. MinAvailable: &minAvailable,
  942. },
  943. }
  944. _, err = f.ClientSet.PolicyV1beta1().PodDisruptionBudgets(namespace).Create(pdb)
  945. defer func() {
  946. f.ClientSet.PolicyV1beta1().PodDisruptionBudgets(namespace).Delete(pdb.Name, &metav1.DeleteOptions{})
  947. }()
  948. framework.ExpectNoError(err)
  949. verifyFunction(increasedSize)
  950. }
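// getGKEURL builds a GKE REST endpoint URL authenticated with the caller's current gcloud access token.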
  951. func getGKEURL(apiVersion string, suffix string) string {
  952. out, err := execCmd("gcloud", "auth", "print-access-token").Output()
  953. framework.ExpectNoError(err)
  954. token := strings.Replace(string(out), "\n", "", -1)
  955. return fmt.Sprintf("%s/%s/%s?access_token=%s",
  956. gkeEndpoint,
  957. apiVersion,
  958. suffix,
  959. token)
  960. }
  961. func getGKEClusterURL(apiVersion string) string {
  962. if isRegionalCluster() {
  963. // TODO(bskiba): Use locations API for all clusters once it's graduated to v1.
  964. return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/locations/%s/clusters/%s",
  965. framework.TestContext.CloudConfig.ProjectID,
  966. framework.TestContext.CloudConfig.Region,
  967. framework.TestContext.CloudConfig.Cluster))
  968. } else {
  969. return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/zones/%s/clusters/%s",
  970. framework.TestContext.CloudConfig.ProjectID,
  971. framework.TestContext.CloudConfig.Zone,
  972. framework.TestContext.CloudConfig.Cluster))
  973. }
  974. }
  975. func getCluster(apiVersion string) (string, error) {
  976. resp, err := http.Get(getGKEClusterURL(apiVersion))
  977. if err != nil {
  978. return "", err
  979. }
  980. defer resp.Body.Close()
  981. body, err := ioutil.ReadAll(resp.Body)
  982. if err != nil {
  983. return "", err
  984. }
  985. if resp.StatusCode != http.StatusOK {
  986. return "", fmt.Errorf("error: %s %s", resp.Status, body)
  987. }
  988. return string(body), nil
  989. }
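// isAutoscalerEnabled fetches the cluster description and reports whether any node pool advertises the expected maxNodeCount.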
  990. func isAutoscalerEnabled(expectedMaxNodeCountInTargetPool int) (bool, error) {
  991. apiVersion := "v1"
  992. if isRegionalCluster() {
  993. apiVersion = "v1beta1"
  994. }
  995. strBody, err := getCluster(apiVersion)
  996. if err != nil {
  997. return false, err
  998. }
  999. if strings.Contains(strBody, "\"maxNodeCount\": "+strconv.Itoa(expectedMaxNodeCountInTargetPool)) {
  1000. return true, nil
  1001. }
  1002. return false, nil
  1003. }
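// getClusterLocation returns the --region or --zone flag matching the cluster under test; it is appended to the
// gcloud commands assembled by the helpers below.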
  1004. func getClusterLocation() string {
  1005. if isRegionalCluster() {
  1006. return "--region=" + framework.TestContext.CloudConfig.Region
  1007. } else {
  1008. return "--zone=" + framework.TestContext.CloudConfig.Zone
  1009. }
  1010. }
  1011. func getGcloudCommandFromTrack(commandTrack string, args []string) []string {
  1012. command := []string{"gcloud"}
  1013. if commandTrack == "beta" || commandTrack == "alpha" {
  1014. command = append(command, commandTrack)
  1015. }
  1016. command = append(command, args...)
  1017. command = append(command, getClusterLocation())
  1018. command = append(command, "--project="+framework.TestContext.CloudConfig.ProjectID)
  1019. return command
  1020. }
  1021. func getGcloudCommand(args []string) []string {
  1022. track := ""
  1023. if isRegionalCluster() {
  1024. track = "beta"
  1025. }
  1026. return getGcloudCommandFromTrack(track, args)
  1027. }
  1028. func isRegionalCluster() bool {
  1029. // TODO(bskiba): Use an appropri