
/test/e2e/framework/util.go

https://bitbucket.org/Jake-Qu/kubernetes-mirror
  1. /*
  2. Copyright 2014 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package framework
  14. import (
  15. "bytes"
  16. "context"
  17. "encoding/json"
  18. "errors"
  19. "fmt"
  20. "io"
  21. "io/ioutil"
  22. "math/rand"
  23. "net"
  24. "net/http"
  25. "net/url"
  26. "os"
  27. "os/exec"
  28. "path"
  29. "path/filepath"
  30. "regexp"
  31. "sort"
  32. "strconv"
  33. "strings"
  34. "sync"
  35. "syscall"
  36. "text/tabwriter"
  37. "time"
  38. "github.com/golang/glog"
  39. "golang.org/x/crypto/ssh"
  40. "golang.org/x/net/websocket"
  41. "google.golang.org/api/googleapi"
  42. . "github.com/onsi/ginkgo"
  43. . "github.com/onsi/gomega"
  44. gomegatypes "github.com/onsi/gomega/types"
  45. apps "k8s.io/api/apps/v1"
  46. batch "k8s.io/api/batch/v1"
  47. "k8s.io/api/core/v1"
  48. extensions "k8s.io/api/extensions/v1beta1"
  49. apierrs "k8s.io/apimachinery/pkg/api/errors"
  50. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  51. "k8s.io/apimachinery/pkg/fields"
  52. "k8s.io/apimachinery/pkg/labels"
  53. "k8s.io/apimachinery/pkg/runtime"
  54. "k8s.io/apimachinery/pkg/runtime/schema"
  55. "k8s.io/apimachinery/pkg/types"
  56. "k8s.io/apimachinery/pkg/util/sets"
  57. "k8s.io/apimachinery/pkg/util/uuid"
  58. "k8s.io/apimachinery/pkg/util/wait"
  59. utilyaml "k8s.io/apimachinery/pkg/util/yaml"
  60. "k8s.io/apimachinery/pkg/watch"
  61. "k8s.io/client-go/discovery"
  62. "k8s.io/client-go/dynamic"
  63. restclient "k8s.io/client-go/rest"
  64. "k8s.io/client-go/tools/clientcmd"
  65. clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
  66. utilfeature "k8s.io/apiserver/pkg/util/feature"
  67. clientset "k8s.io/client-go/kubernetes"
  68. scaleclient "k8s.io/client-go/scale"
  69. "k8s.io/kubernetes/pkg/api/legacyscheme"
  70. podutil "k8s.io/kubernetes/pkg/api/v1/pod"
  71. appsinternal "k8s.io/kubernetes/pkg/apis/apps"
  72. batchinternal "k8s.io/kubernetes/pkg/apis/batch"
  73. api "k8s.io/kubernetes/pkg/apis/core"
  74. extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
  75. "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
  76. "k8s.io/kubernetes/pkg/client/conditions"
  77. "k8s.io/kubernetes/pkg/cloudprovider/providers/azure"
  78. gcecloud "k8s.io/kubernetes/pkg/cloudprovider/providers/gce"
  79. "k8s.io/kubernetes/pkg/controller"
  80. nodectlr "k8s.io/kubernetes/pkg/controller/nodelifecycle"
  81. "k8s.io/kubernetes/pkg/features"
  82. "k8s.io/kubernetes/pkg/kubectl"
  83. kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
  84. "k8s.io/kubernetes/pkg/kubelet/util/format"
  85. "k8s.io/kubernetes/pkg/master/ports"
  86. "k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
  87. schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
  88. sshutil "k8s.io/kubernetes/pkg/ssh"
  89. "k8s.io/kubernetes/pkg/util/system"
  90. taintutils "k8s.io/kubernetes/pkg/util/taints"
  91. utilversion "k8s.io/kubernetes/pkg/util/version"
  92. "k8s.io/kubernetes/test/e2e/framework/ginkgowrapper"
  93. testutils "k8s.io/kubernetes/test/utils"
  94. imageutils "k8s.io/kubernetes/test/utils/image"
  95. uexec "k8s.io/utils/exec"
  96. )
  97. const (
  98. // How long to wait for the pod to be listable
  99. PodListTimeout = time.Minute
  100. // Initial pod start can be delayed O(minutes) by slow docker pulls
  101. // TODO: Make this 30 seconds once #4566 is resolved.
  102. PodStartTimeout = 5 * time.Minute
  103. // Same as `PodStartTimeout` to wait for the pod to be started, but shorter.
  104. // Use it case by case when we are sure pod start will not be delayed
  105. // minutes by slow docker pulls or something else.
  106. PodStartShortTimeout = 1 * time.Minute
  107. // How long to wait for a pod to be deleted
  108. PodDeleteTimeout = 5 * time.Minute
  109. // If there are any orphaned namespaces to clean up, this test is running
  110. // on a long-lived cluster. A long wait here is preferable to spurious test
  111. // failures caused by leaked resources from a previous test run.
  112. NamespaceCleanupTimeout = 15 * time.Minute
  113. // Some pods can take much longer to get ready due to volume attach/detach latency.
  114. slowPodStartTimeout = 15 * time.Minute
  115. // How long to wait for a service endpoint to be resolvable.
  116. ServiceStartTimeout = 3 * time.Minute
  117. // How often to Poll pods, nodes and claims.
  118. Poll = 2 * time.Second
  119. pollShortTimeout = 1 * time.Minute
  120. pollLongTimeout = 5 * time.Minute
  121. // service accounts are provisioned after namespace creation
  122. // a service account is required to support pod creation in a namespace as part of admission control
  123. ServiceAccountProvisionTimeout = 2 * time.Minute
  124. // How long to try single API calls (like 'get' or 'list'). Used to prevent
  125. // transient failures from failing tests.
  126. // TODO: client should not apply this timeout to Watch calls. Increased from 30s until that is fixed.
  127. SingleCallTimeout = 5 * time.Minute
  128. // How long nodes have to be "ready" when a test begins. They should already
  129. // be "ready" before the test starts, so this is small.
  130. NodeReadyInitialTimeout = 20 * time.Second
  131. // How long pods have to be "ready" when a test begins.
  132. PodReadyBeforeTimeout = 5 * time.Minute
  133. // How long pods have to become scheduled onto nodes
  134. podScheduledBeforeTimeout = PodListTimeout + (20 * time.Second)
  135. podRespondingTimeout = 15 * time.Minute
  136. ServiceRespondingTimeout = 2 * time.Minute
  137. EndpointRegisterTimeout = time.Minute
  138. // How long claims have to become dynamically provisioned
  139. ClaimProvisionTimeout = 5 * time.Minute
  140. // Same as `ClaimProvisionTimeout` to wait for claim to be dynamically provisioned, but shorter.
  141. // Use it case by case when we are sure this timeout is enough.
  142. ClaimProvisionShortTimeout = 1 * time.Minute
  143. // How long claims have to become bound
  144. ClaimBindingTimeout = 3 * time.Minute
  145. // How long claims have to become deleted
  146. ClaimDeletingTimeout = 3 * time.Minute
  147. // How long PVs have to become reclaimed
  148. PVReclaimingTimeout = 3 * time.Minute
  149. // How long PVs have to become bound
  150. PVBindingTimeout = 3 * time.Minute
  151. // How long PVs have to become deleted
  152. PVDeletingTimeout = 3 * time.Minute
  153. // How long a node is allowed to become "Ready" after it is restarted before
  154. // the test is considered failed.
  155. RestartNodeReadyAgainTimeout = 5 * time.Minute
  156. // How long a pod is allowed to become "running" and "ready" after a node
  157. // restart before test is considered failed.
  158. RestartPodReadyAgainTimeout = 5 * time.Minute
  159. // Number of objects that gc can delete in a second.
  160. // GC issues 2 requests for a single delete.
  161. gcThroughput = 10
  162. // Minimal number of nodes for the cluster to be considered large.
  163. largeClusterThreshold = 100
  164. // TODO(justinsb): Avoid hardcoding this.
  165. awsMasterIP = "172.20.0.9"
  166. // ssh port
  167. sshPort = "22"
  168. // ImagePrePullingTimeout is the time we wait for the e2e-image-puller
  169. // static pods to pull the list of seeded images. If they don't pull
  170. // images within this time we simply log their output and carry on
  171. // with the tests.
  172. ImagePrePullingTimeout = 5 * time.Minute
  173. )
  174. var (
  175. BusyBoxImage = "busybox"
  176. // Label allocated to the image puller static pod that runs on each node
  177. // before e2es.
  178. ImagePullerLabels = map[string]string{"name": "e2e-image-puller"}
  179. // For parsing Kubectl version for version-skewed testing.
  180. gitVersionRegexp = regexp.MustCompile("GitVersion:\"(v.+?)\"")
  181. // Slice of regexps for names of pods that have to be running to consider a Node "healthy"
  182. requiredPerNodePods = []*regexp.Regexp{
  183. regexp.MustCompile(".*kube-proxy.*"),
  184. regexp.MustCompile(".*fluentd-elasticsearch.*"),
  185. regexp.MustCompile(".*node-problem-detector.*"),
  186. }
  187. // Serve hostname image name
  188. ServeHostnameImage = imageutils.GetE2EImage(imageutils.ServeHostname)
  189. )
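// Address groups an internal IP, an external IP, and a hostname for a single host.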
  190. type Address struct {
  191. internalIP string
  192. externalIP string
  193. hostname string
  194. }
  195. // GetServerArchitecture fetches the architecture of the cluster's apiserver.
  196. func GetServerArchitecture(c clientset.Interface) string {
  197. arch := ""
  198. sVer, err := c.Discovery().ServerVersion()
  199. if err != nil || sVer.Platform == "" {
  200. // If we failed to get the server version for some reason, default to amd64.
  201. arch = "amd64"
  202. } else {
  203. // Split the platform string into OS and Arch separately.
  204. // The platform string may for example be "linux/amd64", "linux/arm" or "windows/amd64".
  205. osArchArray := strings.Split(sVer.Platform, "/")
  206. arch = osArchArray[1]
  207. }
  208. return arch
  209. }
  210. // GetPauseImageName fetches the pause image name for the same architecture as the apiserver.
  211. func GetPauseImageName(c clientset.Interface) string {
  212. return imageutils.GetE2EImageWithArch(imageutils.Pause, GetServerArchitecture(c))
  213. }
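// GetServicesProxyRequest rewrites the given request to go through the services proxy subresource.
// The returned error is currently always nil.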
  214. func GetServicesProxyRequest(c clientset.Interface, request *restclient.Request) (*restclient.Request, error) {
  215. return request.Resource("services").SubResource("proxy"), nil
  216. }
  217. // unique identifier of the e2e run
  218. var RunId = uuid.NewUUID()
  219. type CreateTestingNSFn func(baseName string, c clientset.Interface, labels map[string]string) (*v1.Namespace, error)
  220. type ContainerFailures struct {
  221. status *v1.ContainerStateTerminated
  222. Restarts int
  223. }
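// GetMasterHost returns the host (and port, if present) portion of the API server URL in TestContext.Host.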
  224. func GetMasterHost() string {
  225. masterUrl, err := url.Parse(TestContext.Host)
  226. ExpectNoError(err)
  227. return masterUrl.Host
  228. }
  229. func nowStamp() string {
  230. return time.Now().Format(time.StampMilli)
  231. }
  232. func log(level string, format string, args ...interface{}) {
  233. fmt.Fprintf(GinkgoWriter, nowStamp()+": "+level+": "+format+"\n", args...)
  234. }
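// Logf writes a timestamped INFO-level message to the Ginkgo writer.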
  235. func Logf(format string, args ...interface{}) {
  236. log("INFO", format, args...)
  237. }
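// Failf logs the formatted message and fails the current test, attributing the failure to Failf's caller.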
  238. func Failf(format string, args ...interface{}) {
  239. FailfWithOffset(1, format, args...)
  240. }
  241. // FailfWithOffset calls "Fail" and logs the error at "offset" levels above its caller
  242. // (for example, for call chain f -> g -> FailfWithOffset(1, ...) error would be logged for "f").
  243. func FailfWithOffset(offset int, format string, args ...interface{}) {
  244. msg := fmt.Sprintf(format, args...)
  245. log("INFO", msg)
  246. ginkgowrapper.Fail(nowStamp()+": "+msg, 1+offset)
  247. }
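// Skipf logs the formatted message and skips the current test.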
  248. func Skipf(format string, args ...interface{}) {
  249. msg := fmt.Sprintf(format, args...)
  250. log("INFO", msg)
  251. ginkgowrapper.Skip(nowStamp() + ": " + msg)
  252. }
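// SkipUnlessNodeCountIsAtLeast skips the test if the cluster has fewer than minNodeCount nodes.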
  253. func SkipUnlessNodeCountIsAtLeast(minNodeCount int) {
  254. if TestContext.CloudConfig.NumNodes < minNodeCount {
  255. Skipf("Requires at least %d nodes (not %d)", minNodeCount, TestContext.CloudConfig.NumNodes)
  256. }
  257. }
  258. func SkipUnlessNodeCountIsAtMost(maxNodeCount int) {
  259. if TestContext.CloudConfig.NumNodes > maxNodeCount {
  260. Skipf("Requires at most %d nodes (not %d)", maxNodeCount, TestContext.CloudConfig.NumNodes)
  261. }
  262. }
  263. func SkipUnlessAtLeast(value int, minValue int, message string) {
  264. if value < minValue {
  265. Skipf(message)
  266. }
  267. }
  268. func SkipIfProviderIs(unsupportedProviders ...string) {
  269. if ProviderIs(unsupportedProviders...) {
  270. Skipf("Not supported for providers %v (found %s)", unsupportedProviders, TestContext.Provider)
  271. }
  272. }
  273. func SkipUnlessLocalEphemeralStorageEnabled() {
  274. if !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
  275. Skipf("Only supported when %v feature is enabled", features.LocalStorageCapacityIsolation)
  276. }
  277. }
  278. func SkipUnlessSSHKeyPresent() {
  279. if _, err := GetSigner(TestContext.Provider); err != nil {
  280. Skipf("No SSH Key for provider %s: '%v'", TestContext.Provider, err)
  281. }
  282. }
  283. func SkipUnlessProviderIs(supportedProviders ...string) {
  284. if !ProviderIs(supportedProviders...) {
  285. Skipf("Only supported for providers %v (not %s)", supportedProviders, TestContext.Provider)
  286. }
  287. }
  288. func SkipUnlessMultizone(c clientset.Interface) {
  289. zones, err := GetClusterZones(c)
  290. if err != nil {
  291. Skipf("Error listing cluster zones")
  292. }
  293. if zones.Len() <= 1 {
  294. Skipf("Requires more than one zone")
  295. }
  296. }
  297. func SkipIfMultizone(c clientset.Interface) {
  298. zones, err := GetClusterZones(c)
  299. if err != nil {
  300. Skipf("Error listing cluster zones")
  301. }
  302. if zones.Len() > 1 {
  303. Skipf("Requires more than one zone")
  304. }
  305. }
  306. func SkipUnlessClusterMonitoringModeIs(supportedMonitoring ...string) {
  307. if !ClusterMonitoringModeIs(supportedMonitoring...) {
  308. Skipf("Only next monitoring modes are supported %v (not %s)", supportedMonitoring, TestContext.ClusterMonitoringMode)
  309. }
  310. }
  311. func SkipUnlessPrometheusMonitoringIsEnabled(supportedMonitoring ...string) {
  312. if !TestContext.EnablePrometheusMonitoring {
  313. Skipf("Skipped because prometheus monitoring is not enabled")
  314. }
  315. }
  316. func SkipUnlessMasterOSDistroIs(supportedMasterOsDistros ...string) {
  317. if !MasterOSDistroIs(supportedMasterOsDistros...) {
  318. Skipf("Only supported for master OS distro %v (not %s)", supportedMasterOsDistros, TestContext.MasterOSDistro)
  319. }
  320. }
  321. func SkipUnlessNodeOSDistroIs(supportedNodeOsDistros ...string) {
  322. if !NodeOSDistroIs(supportedNodeOsDistros...) {
  323. Skipf("Only supported for node OS distro %v (not %s)", supportedNodeOsDistros, TestContext.NodeOSDistro)
  324. }
  325. }
  326. func SkipUnlessSecretExistsAfterWait(c clientset.Interface, name, namespace string, timeout time.Duration) {
  327. Logf("Waiting for secret %v in namespace %v to exist in duration %v", name, namespace, timeout)
  328. start := time.Now()
  329. if wait.PollImmediate(15*time.Second, timeout, func() (bool, error) {
  330. _, err := c.CoreV1().Secrets(namespace).Get(name, metav1.GetOptions{})
  331. if err != nil {
  332. Logf("Secret %v in namespace %v still does not exist after duration %v", name, namespace, time.Since(start))
  333. return false, nil
  334. }
  335. return true, nil
  336. }) != nil {
  337. Skipf("Secret %v in namespace %v did not exist after timeout of %v", name, namespace, timeout)
  338. }
  339. Logf("Secret %v in namespace %v found after duration %v", name, namespace, time.Since(start))
  340. }
  341. func SkipIfContainerRuntimeIs(runtimes ...string) {
  342. for _, runtime := range runtimes {
  343. if runtime == TestContext.ContainerRuntime {
  344. Skipf("Not supported under container runtime %s", runtime)
  345. }
  346. }
  347. }
  348. func RunIfContainerRuntimeIs(runtimes ...string) {
  349. for _, runtime := range runtimes {
  350. if runtime == TestContext.ContainerRuntime {
  351. return
  352. }
  353. }
  354. Skipf("Skipped because container runtime %q is not in %s", TestContext.ContainerRuntime, runtimes)
  355. }
  356. func RunIfSystemSpecNameIs(names ...string) {
  357. for _, name := range names {
  358. if name == TestContext.SystemSpecName {
  359. return
  360. }
  361. }
  362. Skipf("Skipped because system spec name %q is not in %v", TestContext.SystemSpecName, names)
  363. }
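// ProviderIs reports whether TestContext.Provider matches any of the given provider names (case-insensitive).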
  364. func ProviderIs(providers ...string) bool {
  365. for _, provider := range providers {
  366. if strings.ToLower(provider) == strings.ToLower(TestContext.Provider) {
  367. return true
  368. }
  369. }
  370. return false
  371. }
  372. func ClusterMonitoringModeIs(monitoringModes ...string) bool {
  373. for _, mode := range monitoringModes {
  374. if strings.ToLower(mode) == strings.ToLower(TestContext.ClusterMonitoringMode) {
  375. return true
  376. }
  377. }
  378. return false
  379. }
  380. func MasterOSDistroIs(supportedMasterOsDistros ...string) bool {
  381. for _, distro := range supportedMasterOsDistros {
  382. if strings.ToLower(distro) == strings.ToLower(TestContext.MasterOSDistro) {
  383. return true
  384. }
  385. }
  386. return false
  387. }
  388. func NodeOSDistroIs(supportedNodeOsDistros ...string) bool {
  389. for _, distro := range supportedNodeOsDistros {
  390. if strings.ToLower(distro) == strings.ToLower(TestContext.NodeOSDistro) {
  391. return true
  392. }
  393. }
  394. return false
  395. }
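// ProxyMode determines the kube-proxy mode of the cluster by starting a host-network pod in the
// test namespace and curling http://localhost:10249/proxyMode from it.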
  396. func ProxyMode(f *Framework) (string, error) {
  397. pod := &v1.Pod{
  398. ObjectMeta: metav1.ObjectMeta{
  399. Name: "kube-proxy-mode-detector",
  400. Namespace: f.Namespace.Name,
  401. },
  402. Spec: v1.PodSpec{
  403. HostNetwork: true,
  404. Containers: []v1.Container{
  405. {
  406. Name: "detector",
  407. Image: imageutils.GetE2EImage(imageutils.Net),
  408. Command: []string{"/bin/sleep", "3600"},
  409. },
  410. },
  411. },
  412. }
  413. f.PodClient().CreateSync(pod)
  414. defer f.PodClient().DeleteSync(pod.Name, &metav1.DeleteOptions{}, DefaultPodDeletionTimeout)
  415. cmd := "curl -q -s --connect-timeout 1 http://localhost:10249/proxyMode"
  416. stdout, err := RunHostCmd(pod.Namespace, pod.Name, cmd)
  417. if err != nil {
  418. return "", err
  419. }
  420. Logf("ProxyMode: %s", stdout)
  421. return stdout, nil
  422. }
  423. func SkipUnlessServerVersionGTE(v *utilversion.Version, c discovery.ServerVersionInterface) {
  424. gte, err := ServerVersionGTE(v, c)
  425. if err != nil {
  426. Failf("Failed to get server version: %v", err)
  427. }
  428. if !gte {
  429. Skipf("Not supported for server versions before %q", v)
  430. }
  431. }
  432. func SkipIfMissingResource(dynamicClient dynamic.Interface, gvr schema.GroupVersionResource, namespace string) {
  433. resourceClient := dynamicClient.Resource(gvr).Namespace(namespace)
  434. _, err := resourceClient.List(metav1.ListOptions{})
  435. if err != nil {
  436. // not all resources support list, so we ignore those
  437. if apierrs.IsMethodNotSupported(err) || apierrs.IsNotFound(err) || apierrs.IsForbidden(err) {
  438. Skipf("Could not find %s resource, skipping test: %#v", gvr, err)
  439. }
  440. Failf("Unexpected error getting %v: %v", gvr, err)
  441. }
  442. }
  443. // ProvidersWithSSH are those providers where each node is accessible with SSH
  444. var ProvidersWithSSH = []string{"gce", "gke", "aws", "local"}
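// podCondition is a predicate over a pod's current state; it reports whether the condition is met
// and may return an error to stop waiting early.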
  445. type podCondition func(pod *v1.Pod) (bool, error)
  446. // logPodStates logs basic info of provided pods for debugging.
  447. func logPodStates(pods []v1.Pod) {
  448. // Find maximum widths for pod, node, and phase strings for column printing.
  449. maxPodW, maxNodeW, maxPhaseW, maxGraceW := len("POD"), len("NODE"), len("PHASE"), len("GRACE")
  450. for i := range pods {
  451. pod := &pods[i]
  452. if len(pod.ObjectMeta.Name) > maxPodW {
  453. maxPodW = len(pod.ObjectMeta.Name)
  454. }
  455. if len(pod.Spec.NodeName) > maxNodeW {
  456. maxNodeW = len(pod.Spec.NodeName)
  457. }
  458. if len(pod.Status.Phase) > maxPhaseW {
  459. maxPhaseW = len(pod.Status.Phase)
  460. }
  461. }
  462. // Increase widths by one to separate by a single space.
  463. maxPodW++
  464. maxNodeW++
  465. maxPhaseW++
  466. maxGraceW++
  467. // Log pod info. * does space padding, - makes them left-aligned.
  468. Logf("%-[1]*[2]s %-[3]*[4]s %-[5]*[6]s %-[7]*[8]s %[9]s",
  469. maxPodW, "POD", maxNodeW, "NODE", maxPhaseW, "PHASE", maxGraceW, "GRACE", "CONDITIONS")
  470. for _, pod := range pods {
  471. grace := ""
  472. if pod.DeletionGracePeriodSeconds != nil {
  473. grace = fmt.Sprintf("%ds", *pod.DeletionGracePeriodSeconds)
  474. }
  475. Logf("%-[1]*[2]s %-[3]*[4]s %-[5]*[6]s %-[7]*[8]s %[9]s",
  476. maxPodW, pod.ObjectMeta.Name, maxNodeW, pod.Spec.NodeName, maxPhaseW, pod.Status.Phase, maxGraceW, grace, pod.Status.Conditions)
  477. }
  478. Logf("") // Final empty line helps for readability.
  479. }
  480. // errorBadPodsStates creates an error message with basic info about bad pods for debugging.
  481. func errorBadPodsStates(badPods []v1.Pod, desiredPods int, ns, desiredState string, timeout time.Duration) string {
  482. errStr := fmt.Sprintf("%d / %d pods in namespace %q are NOT in %s state in %v\n", len(badPods), desiredPods, ns, desiredState, timeout)
  483. // Print bad pods info only if there are at most 10 bad pods
  484. if len(badPods) > 10 {
  485. return errStr + "There are too many bad pods. Please check log for details."
  486. }
  487. buf := bytes.NewBuffer(nil)
  488. w := tabwriter.NewWriter(buf, 0, 0, 1, ' ', 0)
  489. fmt.Fprintln(w, "POD\tNODE\tPHASE\tGRACE\tCONDITIONS")
  490. for _, badPod := range badPods {
  491. grace := ""
  492. if badPod.DeletionGracePeriodSeconds != nil {
  493. grace = fmt.Sprintf("%ds", *badPod.DeletionGracePeriodSeconds)
  494. }
  495. podInfo := fmt.Sprintf("%s\t%s\t%s\t%s\t%+v",
  496. badPod.ObjectMeta.Name, badPod.Spec.NodeName, badPod.Status.Phase, grace, badPod.Status.Conditions)
  497. fmt.Fprintln(w, podInfo)
  498. }
  499. w.Flush()
  500. return errStr + buf.String()
  501. }
  502. // WaitForPodsSuccess waits until all pods matching the given label selector enter
  503. // the Succeeded state. The caller is expected to only invoke this method once the
  504. // pods have been created.
  505. func WaitForPodsSuccess(c clientset.Interface, ns string, successPodLabels map[string]string, timeout time.Duration) error {
  506. successPodSelector := labels.SelectorFromSet(successPodLabels)
  507. start, badPods, desiredPods := time.Now(), []v1.Pod{}, 0
  508. if wait.PollImmediate(30*time.Second, timeout, func() (bool, error) {
  509. podList, err := c.CoreV1().Pods(ns).List(metav1.ListOptions{LabelSelector: successPodSelector.String()})
  510. if err != nil {
  511. Logf("Error getting pods in namespace %q: %v", ns, err)
  512. if testutils.IsRetryableAPIError(err) {
  513. return false, nil
  514. }
  515. return false, err
  516. }
  517. if len(podList.Items) == 0 {
  518. Logf("Waiting for pods to enter Success, but no pods in %q match label %v", ns, successPodLabels)
  519. return true, nil
  520. }
  521. badPods = []v1.Pod{}
  522. desiredPods = len(podList.Items)
  523. for _, pod := range podList.Items {
  524. if pod.Status.Phase != v1.PodSucceeded {
  525. badPods = append(badPods, pod)
  526. }
  527. }
  528. successPods := len(podList.Items) - len(badPods)
  529. Logf("%d / %d pods in namespace %q are in Success state (%d seconds elapsed)",
  530. successPods, len(podList.Items), ns, int(time.Since(start).Seconds()))
  531. if len(badPods) == 0 {
  532. return true, nil
  533. }
  534. return false, nil
  535. }) != nil {
  536. logPodStates(badPods)
  537. LogPodsWithLabels(c, ns, successPodLabels, Logf)
  538. return errors.New(errorBadPodsStates(badPods, desiredPods, ns, "SUCCESS", timeout))
  539. }
  540. return nil
  541. }
  542. // WaitForPodsRunningReady waits up to timeout to ensure that all pods in
  543. // namespace ns are either running and ready, or failed but controlled by a
  544. // controller. Also, it ensures that at least minPods are running and
  545. // ready. It has separate behavior from other 'wait for' pods functions in
  546. // that it requests the list of pods on every iteration. This is useful, for
  547. // example, in cluster startup, because the number of pods increases while
  548. // waiting. Pods in the Succeeded state are not counted.
  549. //
  550. // If ignoreLabels is not empty, pods matching this selector are ignored.
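// Example (hypothetical caller, values for illustration only):
//
//	err := WaitForPodsRunningReady(c, metav1.NamespaceSystem, 8, 0, PodReadyBeforeTimeout, nil)
//	if err != nil {
//		Failf("not all pods are running and ready: %v", err)
//	}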
  551. func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods, allowedNotReadyPods int32, timeout time.Duration, ignoreLabels map[string]string) error {
  552. ignoreSelector := labels.SelectorFromSet(ignoreLabels)
  553. start := time.Now()
  554. Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
  555. timeout, minPods, ns)
  556. wg := sync.WaitGroup{}
  557. wg.Add(1)
  558. var ignoreNotReady bool
  559. badPods := []v1.Pod{}
  560. desiredPods := 0
  561. notReady := int32(0)
  562. if wait.PollImmediate(Poll, timeout, func() (bool, error) {
  563. // We get the new list of pods, replication controllers, and
  564. // replica sets in every iteration because more pods come
  565. // online during startup and we want to ensure they are also
  566. // checked.
  567. replicas, replicaOk := int32(0), int32(0)
  568. rcList, err := c.CoreV1().ReplicationControllers(ns).List(metav1.ListOptions{})
  569. if err != nil {
  570. Logf("Error getting replication controllers in namespace '%s': %v", ns, err)
  571. if testutils.IsRetryableAPIError(err) {
  572. return false, nil
  573. }
  574. return false, err
  575. }
  576. for _, rc := range rcList.Items {
  577. replicas += *rc.Spec.Replicas
  578. replicaOk += rc.Status.ReadyReplicas
  579. }
  580. rsList, err := c.ExtensionsV1beta1().ReplicaSets(ns).List(metav1.ListOptions{})
  581. if err != nil {
  582. Logf("Error getting replication sets in namespace %q: %v", ns, err)
  583. if testutils.IsRetryableAPIError(err) {
  584. return false, nil
  585. }
  586. return false, err
  587. }
  588. for _, rs := range rsList.Items {
  589. replicas += *rs.Spec.Replicas
  590. replicaOk += rs.Status.ReadyReplicas
  591. }
  592. podList, err := c.CoreV1().Pods(ns).List(metav1.ListOptions{})
  593. if err != nil {
  594. Logf("Error getting pods in namespace '%s': %v", ns, err)
  595. if testutils.IsRetryableAPIError(err) {
  596. return false, nil
  597. }
  598. return false, err
  599. }
  600. nOk := int32(0)
  601. notReady = int32(0)
  602. badPods = []v1.Pod{}
  603. desiredPods = len(podList.Items)
  604. for _, pod := range podList.Items {
  605. if len(ignoreLabels) != 0 && ignoreSelector.Matches(labels.Set(pod.Labels)) {
  606. continue
  607. }
  608. res, err := testutils.PodRunningReady(&pod)
  609. switch {
  610. case res && err == nil:
  611. nOk++
  612. case pod.Status.Phase == v1.PodSucceeded:
  613. Logf("The status of Pod %s is Succeeded, skipping waiting", pod.ObjectMeta.Name)
  614. // it doesn't make sense to wait for this pod
  615. continue
  616. case pod.Status.Phase != v1.PodFailed:
  617. Logf("The status of Pod %s is %s (Ready = false), waiting for it to be either Running (with Ready = true) or Failed", pod.ObjectMeta.Name, pod.Status.Phase)
  618. notReady++
  619. badPods = append(badPods, pod)
  620. default:
  621. if metav1.GetControllerOf(&pod) == nil {
  622. Logf("Pod %s is Failed, but it's not controlled by a controller", pod.ObjectMeta.Name)
  623. badPods = append(badPods, pod)
  624. }
  625. // ignore failed pods that are controlled by some controller
  626. }
  627. }
  628. Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)",
  629. nOk, len(podList.Items), ns, int(time.Since(start).Seconds()))
  630. Logf("expected %d pod replicas in namespace '%s', %d are Running and Ready.", replicas, ns, replicaOk)
  631. if replicaOk == replicas && nOk >= minPods && len(badPods) == 0 {
  632. return true, nil
  633. }
  634. ignoreNotReady = (notReady <= allowedNotReadyPods)
  635. logPodStates(badPods)
  636. return false, nil
  637. }) != nil {
  638. if !ignoreNotReady {
  639. return errors.New(errorBadPodsStates(badPods, desiredPods, ns, "RUNNING and READY", timeout))
  640. }
  641. Logf("Number of not-ready pods (%d) is below the allowed threshold (%d).", notReady, allowedNotReadyPods)
  642. }
  643. return nil
  644. }
  645. func kubectlLogPod(c clientset.Interface, pod v1.Pod, containerNameSubstr string, logFunc func(ftm string, args ...interface{})) {
  646. for _, container := range pod.Spec.Containers {
  647. if strings.Contains(container.Name, containerNameSubstr) {
  648. // Contains() matches all strings if substr is empty
  649. logs, err := GetPodLogs(c, pod.Namespace, pod.Name, container.Name)
  650. if err != nil {
  651. logs, err = getPreviousPodLogs(c, pod.Namespace, pod.Name, container.Name)
  652. if err != nil {
  653. logFunc("Failed to get logs of pod %v, container %v, err: %v", pod.Name, container.Name, err)
  654. }
  655. }
  656. logFunc("Logs of %v/%v:%v on node %v", pod.Namespace, pod.Name, container.Name, pod.Spec.NodeName)
  657. logFunc("%s : STARTLOG\n%s\nENDLOG for container %v:%v:%v", containerNameSubstr, logs, pod.Namespace, pod.Name, container.Name)
  658. }
  659. }
  660. }
  661. func LogFailedContainers(c clientset.Interface, ns string, logFunc func(ftm string, args ...interface{})) {
  662. podList, err := c.CoreV1().Pods(ns).List(metav1.ListOptions{})
  663. if err != nil {
  664. logFunc("Error getting pods in namespace '%s': %v", ns, err)
  665. return
  666. }
  667. logFunc("Running kubectl logs on non-ready containers in %v", ns)
  668. for _, pod := range podList.Items {
  669. if res, err := testutils.PodRunningReady(&pod); !res || err != nil {
  670. kubectlLogPod(c, pod, "", Logf)
  671. }
  672. }
  673. }
  674. func LogPodsWithLabels(c clientset.Interface, ns string, match map[string]string, logFunc func(ftm string, args ...interface{})) {
  675. podList, err := c.CoreV1().Pods(ns).List(metav1.ListOptions{LabelSelector: labels.SelectorFromSet(match).String()})
  676. if err != nil {
  677. logFunc("Error getting pods in namespace %q: %v", ns, err)
  678. return
  679. }
  680. logFunc("Running kubectl logs on pods with labels %v in %v", match, ns)
  681. for _, pod := range podList.Items {
  682. kubectlLogPod(c, pod, "", logFunc)
  683. }
  684. }
  685. func LogContainersInPodsWithLabels(c clientset.Interface, ns string, match map[string]string, containerSubstr string, logFunc func(ftm string, args ...interface{})) {
  686. podList, err := c.CoreV1().Pods(ns).List(metav1.ListOptions{LabelSelector: labels.SelectorFromSet(match).String()})
  687. if err != nil {
  688. Logf("Error getting pods in namespace %q: %v", ns, err)
  689. return
  690. }
  691. for _, pod := range podList.Items {
  692. kubectlLogPod(c, pod, containerSubstr, logFunc)
  693. }
  694. }
  695. // DeleteNamespaces deletes all namespaces that match the given delete and skip filters.
  696. // Filter is by simple strings.Contains; first skip filter, then delete filter.
  697. // Returns the list of deleted namespaces or an error.
  698. func DeleteNamespaces(c clientset.Interface, deleteFilter, skipFilter []string) ([]string, error) {
  699. By("Deleting namespaces")
  700. nsList, err := c.CoreV1().Namespaces().List(metav1.ListOptions{})
  701. Expect(err).NotTo(HaveOccurred())
  702. var deleted []string
  703. var wg sync.WaitGroup
  704. OUTER:
  705. for _, item := range nsList.Items {
  706. if skipFilter != nil {
  707. for _, pattern := range skipFilter {
  708. if strings.Contains(item.Name, pattern) {
  709. continue OUTER
  710. }
  711. }
  712. }
  713. if deleteFilter != nil {
  714. var shouldDelete bool
  715. for _, pattern := range deleteFilter {
  716. if strings.Contains(item.Name, pattern) {
  717. shouldDelete = true
  718. break
  719. }
  720. }
  721. if !shouldDelete {
  722. continue OUTER
  723. }
  724. }
  725. wg.Add(1)
  726. deleted = append(deleted, item.Name)
  727. go func(nsName string) {
  728. defer wg.Done()
  729. defer GinkgoRecover()
  730. Expect(c.CoreV1().Namespaces().Delete(nsName, nil)).To(Succeed())
  731. Logf("namespace : %v api call to delete is complete ", nsName)
  732. }(item.Name)
  733. }
  734. wg.Wait()
  735. return deleted, nil
  736. }
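// WaitForNamespacesDeleted polls the API server until none of the given namespaces remain, or the timeout expires.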
  737. func WaitForNamespacesDeleted(c clientset.Interface, namespaces []string, timeout time.Duration) error {
  738. By("Waiting for namespaces to vanish")
  739. nsMap := map[string]bool{}
  740. for _, ns := range namespaces {
  741. nsMap[ns] = true
  742. }
  743. // Now poll until all namespaces have been eradicated.
  744. return wait.Poll(2*time.Second, timeout,
  745. func() (bool, error) {
  746. nsList, err := c.CoreV1().Namespaces().List(metav1.ListOptions{})
  747. if err != nil {
  748. return false, err
  749. }
  750. for _, item := range nsList.Items {
  751. if _, ok := nsMap[item.Name]; ok {
  752. return false, nil
  753. }
  754. }
  755. return true, nil
  756. })
  757. }
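// waitForServiceAccountInNamespace watches the named service account in ns until it has been
// provisioned with secrets (conditions.ServiceAccountHasSecrets) or the timeout expires.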
  758. func waitForServiceAccountInNamespace(c clientset.Interface, ns, serviceAccountName string, timeout time.Duration) error {
  759. w, err := c.CoreV1().ServiceAccounts(ns).Watch(metav1.SingleObject(metav1.ObjectMeta{Name: serviceAccountName}))
  760. if err != nil {
  761. return err
  762. }
  763. _, err = watch.Until(timeout, w, conditions.ServiceAccountHasSecrets)
  764. return err
  765. }
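// WaitForPodCondition polls the named pod every Poll interval until condition reports done,
// the pod can no longer be found, or timeout elapses.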
  766. func WaitForPodCondition(c clientset.Interface, ns, podName, desc string, timeout time.Duration, condition podCondition) error {
  767. Logf("Waiting up to %v for pod %q in namespace %q to be %q", timeout, podName, ns, desc)
  768. for start := time.Now(); time.Since(start) < timeout; time.Sleep(Poll) {
  769. pod, err := c.CoreV1().Pods(ns).Get(podName, metav1.GetOptions{})
  770. if err != nil {
  771. if apierrs.IsNotFound(err) {
  772. Logf("Pod %q in namespace %q not found. Error: %v", podName, ns, err)
  773. return err
  774. }
  775. Logf("Get pod %q in namespace %q failed, ignoring for %v. Error: %v", podName, ns, Poll, err)
  776. continue
  777. }
  778. // log now so that current pod info is reported before calling `condition()`
  779. Logf("Pod %q: Phase=%q, Reason=%q, readiness=%t. Elapsed: %v",
  780. podName, pod.Status.Phase, pod.Status.Reason, podutil.IsPodReady(pod), time.Since(start))
  781. if done, err := condition(pod); done {
  782. if err == nil {
  783. Logf("Pod %q satisfied condition %q", podName, desc)
  784. }
  785. return err
  786. }
  787. }
  788. return fmt.Errorf("Gave up after waiting %v for pod %q to be %q", timeout, podName, desc)
  789. }
  790. // WaitForMatchPodsCondition finds pods matching the input ListOptions, then
  791. // waits and checks that all matching pods satisfy the given podCondition.
  792. func WaitForMatchPodsCondition(c clientset.Interface, opts metav1.ListOptions, desc string, timeout time.Duration, condition podCondition) error {
  793. Logf("Waiting up to %v for matching pods' status to be %s", timeout, desc)
  794. for start := time.Now(); time.Since(start) < timeout; time.Sleep(Poll) {
  795. pods, err := c.CoreV1().Pods(metav1.NamespaceAll).List(opts)
  796. if err != nil {
  797. return err
  798. }
  799. conditionNotMatch := []string{}
  800. for _, pod := range pods.Items {
  801. done, err := condition(&pod)
  802. if done && err != nil {
  803. return fmt.Errorf("Unexpected error: %v", err)
  804. }
  805. if !done {
  806. conditionNotMatch = append(conditionNotMatch, format.Pod(&pod))
  807. }
  808. }
  809. if len(conditionNotMatch) <= 0 {
  810. return err
  811. }
  812. Logf("%d pods are not %s: %v", len(conditionNotMatch), desc, conditionNotMatch)
  813. }
  814. return fmt.Errorf("gave up waiting for matching pods to be '%s' after %v", desc, timeout)
  815. }
  816. // WaitForDefaultServiceAccountInNamespace waits for the default service account to be provisioned.
  817. // The default service account is the one associated with pods that do not specify a service account;
  818. // as a result, pods cannot be created in a namespace until that service account is provisioned.
  819. func WaitForDefaultServiceAccountInNamespace(c clientset.Interface, namespace string) error {
  820. return waitForServiceAccountInNamespace(c, namespace, "default", ServiceAccountProvisionTimeout)
  821. }
  822. // WaitForPersistentVolumePhase waits for a PersistentVolume to be in a specific phase or until timeout occurs, whichever comes first.
  823. func WaitForPersistentVolumePhase(phase v1.PersistentVolumePhase, c clientset.Interface, pvName string, Poll, timeout time.Duration) error {
  824. Logf("Waiting up to %v for PersistentVolume %s to have phase %s", timeout, pvName, phase)
  825. for start := time.Now(); time.Since(start) < timeout; time.Sleep(Poll) {
  826. pv, err := c.CoreV1().PersistentVolumes().Get(pvName, metav1.GetOptions{})
  827. if err != nil {
  828. Logf("Get persistent volume %s in failed, ignoring for %v: %v", pvName, Poll, err)
  829. continue
  830. } else {
  831. if pv.Status.Phase == phase {
  832. Logf("PersistentVolume %s found and phase=%s (%v)", pvName, phase, time.Since(start))
  833. return nil
  834. } else {
  835. Logf("PersistentVolume %s found but phase is %s instead of %s.", pvName, pv.Status.Phase, phase)
  836. }
  837. }
  838. }
  839. return fmt.Errorf("PersistentVolume %s not in phase %s within %v", pvName, phase, timeout)
  840. }
  841. // WaitForStatefulSetReplicasReady waits for all replicas of a StatefulSet to become ready or until timeout occurs, whichever comes first.
  842. func WaitForStatefulSetReplicasReady(statefulSetName, ns string, c clientset.Interface, Poll, timeout time.Duration) error {
  843. Logf("Waiting up to %v for StatefulSet %s to have all replicas ready", timeout, statefulSetName)
  844. for start := time.Now(); time.Since(start) < timeout; time.Sleep(Poll) {
  845. sts, err := c.AppsV1().StatefulSets(ns).Get(statefulSetName, metav1.GetOptions{})
  846. if err != nil {
  847. Logf("Get StatefulSet %s failed, ignoring for %v: %v", statefulSetName, Poll, err)
  848. continue
  849. } else {
  850. if sts.Status.ReadyReplicas == *sts.Spec.Replicas {
  851. Logf("All %d replicas of StatefulSet %s are ready. (%v)", sts.Status.ReadyReplicas, statefulSetName, time.Since(start))
  852. return nil
  853. } else {
  854. Logf("StatefulSet %s found but there are %d ready replicas and %d total replicas.", statefulSetName, sts.Status.ReadyReplicas, *sts.Spec.Replicas)
  855. }
  856. }
  857. }
  858. return fmt.Errorf("StatefulSet %s still has unready pods within %v", statefulSetName, timeout)
  859. }
  860. // WaitForPersistentVolumeDeleted waits for a PersistentVolume to get deleted or until timeout occurs, whichever comes first.
  861. func WaitForPersistentVolumeDeleted(c clientset.Interface, pvName string, Poll, timeout time.Duration) error {
  862. Logf("Waiting up to %v for PersistentVolume %s to get deleted", timeout, pvName)
  863. for start := time.Now(); time.Since(start) < timeout; time.Sleep(Poll) {
  864. pv, err := c.CoreV1().PersistentVolumes().Get(pvName, metav1.GetOptions{})
  865. if err == nil {
  866. Logf("PersistentVolume %s found and phase=%s (%v)", pvName, pv.Status.Phase, time.Since(start))
  867. continue
  868. } else {
  869. if apierrs.IsNotFound(err) {
  870. Logf("PersistentVolume %s was removed", pvName)
  871. return nil
  872. } else {
  873. Logf("Get persistent volume %s in failed, ignoring for %v: %v", pvName, Poll, err)
  874. }
  875. }
  876. }
  877. return fmt.Errorf("PersistentVolume %s still exists within %v", pvName, timeout)
  878. }
  879. // WaitForPersistentVolumeClaimPhase waits for a PersistentVolumeClaim to be in a specific phase or until timeout occurs, whichever comes first.
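// Example (hypothetical caller, values for illustration only):
//
//	err := WaitForPersistentVolumeClaimPhase(v1.ClaimBound, c, ns, "my-claim", Poll, ClaimBindingTimeout)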
  880. func WaitForPersistentVolumeClaimPhase(phase v1.PersistentVolumeClaimPhase, c clientset.Interface, ns string, pvcName string, Poll, timeout time.Duration) error {
  881. Logf("Waiting up to %v for PersistentVolumeClaim %s to have phase %s", timeout, pvcName, phase)
  882. for start := time.Now(); time.Since(start) < timeout; time.Sleep(Poll) {
  883. pvc, err := c.CoreV1().PersistentVolumeClaims(ns).Get(pvcName, metav1.GetOptions{})
  884. if err != nil {
  885. Logf("Failed to get claim %q, retrying in %v. Error: %v", pvcName, Poll, err)
  886. continue
  887. } else {
  888. if pvc.Status.Phase == phase {
  889. Logf("PersistentVolumeClaim %s found and phase=%s (%v)", pvcName, phase, time.Since(start))
  890. return nil
  891. } else {
  892. Logf("PersistentVolumeClaim %s found but phase is %s instead of %s.", pvcName, pvc.Status.Phase, phase)
  893. }
  894. }
  895. }
  896. return fmt.Errorf("PersistentVolumeClaim %s not in phase %s within %v", pvcName, phase, timeout)
  897. }
  898. // CreateTestingNS should be used by every test, note that we append a common prefix to the provided test name.
  899. // Please see NewFramework instead of using this directly.
  900. func CreateTestingNS(baseName string, c clientset.Interface, labels map[string]string) (*v1.Namespace, error) {
  901. if labels == nil {
  902. labels = map[string]string{}
  903. }
  904. labels["e2e-run"] = string(RunId)
  905. namespaceObj := &v1.Namespace{
  906. ObjectMeta: metav1.ObjectMeta{
  907. GenerateName: fmt.Sprintf("e2e-tests-%v-", baseName),
  908. Namespace: "",
  909. Labels: labels,
  910. },
  911. Status: v1.NamespaceStatus{},
  912. }
  913. // Be robust about making the namespace creation call.
  914. var got *v1.Namespace
  915. if err := wait.PollImmediate(Poll, 30*time.Second, func() (bool, error) {
  916. var err error
  917. got, err = c.CoreV1().Namespaces().Create(namespaceObj)
  918. if err != nil {
  919. Logf("Unexpected error while creating namespace: %v", err)
  920. return false, nil
  921. }
  922. return true, nil
  923. }); err != nil {
  924. return nil, err
  925. }
  926. if TestContext.VerifyServiceAccount {
  927. if err := WaitForDefaultServiceAccountInNamespace(c, got.Name); err != nil {
  928. // Even if we fail to create serviceAccount in the namespace,
  929. // we have successfully created a namespace.
  930. // So, return the created namespace.
  931. return got, err
  932. }
  933. }
  934. return got, nil
  935. }
  936. // CheckTestingNSDeletedExcept checks whether all e2e based existing namespaces are in the Terminating state
  937. // and waits until they are finally deleted. The namespace given by skip is ignored.
  938. func CheckTestingNSDeletedExcept(c clientset.Interface, skip string) error {
  939. // TODO: Since we don't have support for bulk resource deletion in the API,
  940. // while deleting a namespace we are deleting all objects from that namespace
  941. // one by one (one deletion == one API call). This basically exposes us to
  942. // throttling - currently controller-manager has a limit of max 20 QPS.
  943. // Once #10217 is implemented and used in namespace-controller, deleting all
  944. // object from a given namespace should be much faster and we will be able
  945. // to lower this timeout.
  946. // However, now Density test is producing ~26000 events and Load capacity test
  947. // is producing ~35000 events, thus assuming there are no other requests it will
  948. // take ~30 minutes to fully delete the namespace. Thus I'm setting it to 60
  949. // minutes to avoid any timeouts here.
  950. timeout := 60 * time.Minute
  951. Logf("Waiting for terminating namespaces to be deleted...")
  952. for start := time.Now(); time.Since(start) < timeout; time.Sleep(15 * time.Second) {
  953. namespaces, err := c.CoreV1().Namespaces().List(metav1.ListOptions{})
  954. if err != nil {
  955. Logf("Listing namespaces failed: %v", err)
  956. continue
  957. }
  958. terminating := 0
  959. for _, ns := range namespaces.Items {
  960. if strings.HasPrefix(ns.ObjectMeta.Name, "e2e-tests-") && ns.ObjectMeta.Name != skip {
  961. if ns.Status.Phase == v1.NamespaceActive {
  962. return fmt.Errorf("Namespace %s is active", ns.ObjectMeta.Name)
  963. }
  964. terminating++
  965. }
  966. }
  967. if terminating == 0 {
  968. return nil
  969. }
  970. }
  971. return fmt.Errorf("Waiting for terminating namespaces to be deleted timed out")
  972. }
  973. // deleteNS deletes the provided namespace, waits for it to be completely deleted, and then checks
  974. // whether there are any pods remaining in a non-terminating state.
  975. func deleteNS(c clientset.Interface, dynamicClient dynamic.Interface, namespace string, timeout time.Duration) error {
  976. startTime := time.Now()
  977. if err := c.CoreV1().Namespaces().Delete(namespace, nil); err != nil {
  978. return err
  979. }
  980. // wait for namespace to delete or timeout.
  981. err := wait.PollImmediate(2*time.Second, timeout, func() (bool, error) {
  982. if _, err := c.CoreV1().Namespaces().Get(namespace, metav1.GetOptions{}); err != nil {
  983. if apierrs.IsNotFound(err) {
  984. return true, nil
  985. }
  986. Logf("Error while waiting for namespace to be terminated: %v", err)
  987. return false, nil
  988. }
  989. return false, nil
  990. })
  991. // verify there is no more remaining content in the namespace
  992. remainingContent, cerr := hasRemainingContent(c, dynamicClient, namespace)
  993. if cerr != nil {
  994. return cerr
  995. }
  996. // if content remains, let's dump information about the namespace, and system for flake debugging.
  997. remainingPods := 0
  998. missingTimestamp := 0
  999. if remainingContent {
  1000. // log information about namespace, and set of namespaces in api server to help flake detection
  1001. logNamespace(c, namespace)
  1002. logNamespaces(c, namespace)
  1003. // if we can, check if there were pods remaining with no timestamp.
  1004. remainingPods, missingTimestamp, _ = countRemainingPods(c, namespace)
  1005. }
  1006. // a timeout waiting for namespace deletion happened!
  1007. if err != nil {
  1008. // some content remains in the namespace
  1009. if remainingContent {
  1010. // pods remain
  1011. if remainingPods > 0 {
  1012. if missingTimestamp != 0 {
  1013. // pods remained, but were not undergoing deletion (namespace controller is probably culprit)
  1014. return fmt.Errorf("namespace %v was not deleted with limit: %v, pods remaining: %v, pods missing deletion timestamp: %v", namespace, err, remainingPods, missingTimestamp)
  1015. }
  1016. // but they were all undergoing deletion (kubelet is probably culprit, check NodeLost)
  1017. return fmt.Errorf("namespace %v was not deleted with limit: %v, pods remaining: %v", namespace, err, remainingPods)
  1018. }
  1019. // other content remains (namespace controller is probably screwed up)
  1020. return fmt.Errorf("namespace %v was not deleted with limit: %v, namespaced content other than pods remain", namespace, err)
  1021. }
  1022. // no remaining content, but namespace was not deleted (namespace controller is probably wedged)
  1023. return fmt.Errorf("namespace %v was not deleted with limit: %v, namespace is empty but is not yet removed", namespace, err)
  1024. }
  1025. Logf("namespace %v deletion completed in %s", namespace, time.Since(startTime))
  1026. return nil
  1027. }
  1028. // logNamespaces logs the number of namespaces by phase
  1029. // namespace is the namespace the test was operating against (and that failed to delete), so it can be grepped in logs
  1030. func logNamespaces(c clientset.Interface, namespace string) {
  1031. namespaceList, err := c.CoreV1().Namespaces().List(metav1.ListOptions{})
  1032. if err != nil {
  1033. Logf("namespace: %v, unable to list namespaces: %v", namespace, err)
  1034. return
  1035. }
  1036. numActive := 0
  1037. numTerminating := 0
  1038. for _, namespace := range namespaceList.Items {
  1039. if namespace.Status.Phase == v1.NamespaceActive {
  1040. numActive++
  1041. } else {
  1042. numTerminating++
  1043. }
  1044. }
  1045. Logf("namespace: %v, total namespaces: %v, active: %v, terminating: %v", namespace, len(namespaceList.Items), numActive, numTerminating)
  1046. }
  1047. // logNamespace logs detail about a namespace
  1048. func logNamespace(c clientset.Interface, namespace string) {
  1049. ns, err := c.CoreV1().Namespaces().Get(namespace, metav1.GetOptions{})
  1050. if err != nil {
  1051. if apierrs.IsNotFound(err) {
  1052. Logf("namespace: %v no longer exists", namespace)
  1053. return
  1054. }
  1055. Logf("namespace: %v, unable to get namespace due to error: %v", namespace, err)
  1056. return
  1057. }
  1058. Logf("namespace: %v, DeletionTimetamp: %v, Finalizers: %v, Phase: %v", ns.Name, ns.DeletionTimestamp, ns.Spec.Finalizers, ns.Status.Phase)
  1059. }
  1060. // countRemainingPods queries the server to count number of remaining pods, and number of pods that had a missing deletion timestamp.
  1061. func countRemainingPods(c clientset.Interface, namespace string) (int, int, error) {
  1062. // check for remaining pods
  1063. pods, err := c.CoreV1().Pods(namespace).List(metav1.ListOptions{})
  1064. if err != nil {
  1065. return 0, 0, err
  1066. }
  1067. // nothing remains!
  1068. if len(pods.Items) == 0 {
  1069. return 0, 0, nil
  1070. }
  1071. // stuff remains, log about it
  1072. logPodStates(pods.Items)
  1073. // check if there were any pods with missing deletion timestamp
  1074. numPods := len(pods.Items)
  1075. missingTimestamp := 0
  1076. for _, pod := range pods.Items {
  1077. if pod.DeletionTimestamp == nil {
  1078. missingTimestamp++
  1079. }
  1080. }
  1081. return numPods, missingTimestamp, nil
  1082. }
  1083. // isDynamicDiscoveryError returns true if the error is a group discovery error
  1084. // only for groups expected to be created/deleted dynamically during e2e tests
  1085. func isDynamicDiscoveryError(err error) bool {
  1086. if !discovery.IsGroupDiscoveryFailedError(err) {
  1087. return false
  1088. }
  1089. discoveryErr := err.(*discovery.ErrGroupDiscoveryFailed)
  1090. for gv := range discoveryErr.Groups {
  1091. switch gv.Group {
  1092. case "mygroup.example.com":
  1093. // custom_resource_definition
  1094. // garbage_collector
  1095. case "wardle.k8s.io":
  1096. // aggregator
  1097. case "metrics.k8s.io":
  1098. // aggregated metrics server add-on, no persisted resources
  1099. default:
  1100. Logf("discovery error for unexpected group: %#v", gv)
  1101. return false
  1102. }
  1103. }
  1104. return true
  1105. }
  1106. // hasRemainingContent checks if there is remaining content in the namespace via API discovery
  1107. func hasRemainingContent(c clientset.Interface, dynamicClient dynamic.Interface, namespace string) (bool, error) {
  1108. // some tests generate their own framework.Client rather than the default
  1109. // TODO: ensure every test call has a configured dynamicClient
  1110. if dynamicClient == nil {
  1111. return false, nil
  1112. }
  1113. // find out what content is supported on the server
  1114. // Since extension apiserver is not always available, e.g. metrics server sometimes goes down,
  1115. // add retry here.
  1116. resources, err := waitForServerPreferredNamespacedResources(c.Discovery(), 30*time.Second)
  1117. if err != nil {
  1118. return false, err
  1119. }
  1120. groupVersionResources, err := discovery.GroupVersionResources(resources)
  1121. if err != nil {
  1122. return false, err
  1123. }
  1124. // TODO: temporary hack for https://github.com/kubernetes/kubernetes/issues/31798
  1125. ignoredResources := sets.NewString("bindings")
  1126. contentRemaining := false
  1127. // log how many items of each resource type remain on the server.
  1128. for gvr := range groupVersionResources {
  1129. // get a client for this group version...
  1130. dynamicClient := dynamicClient.Resource(gvr).Namespace(namespace)
  1131. if err != nil {
  1132. // not all resource types support list, so some errors here are normal depending on the resource type.
  1133. Logf("namespace: %s, unable to get client - gvr: %v, error: %v", namespace, gvr, err)
  1134. continue
  1135. }
  1136. // get the api resource
  1137. apiResource := metav1.APIResource{Name: gvr.Resource, Namespaced: true}
  1138. if ignoredResources.Has(gvr.Resource) {
  1139. Logf("namespace: %s, resource: %s, ignored listing per whitelist", namespace, apiResource.Name)
  1140. continue
  1141. }
  1142. unstructuredList, err := dynamicClient.List(metav1.ListOptions{})
  1143. if err != nil {
  1144. // not all resources support list, so we ignore those
  1145. if apierrs.IsMethodNotSupported(err) || apierrs.IsNotFound(err) || apierrs.IsForbidden(err) {
  1146. continue
  1147. }
  1148. // skip unavailable servers
  1149. if apierrs.IsServiceUnavailable(err) {
  1150. continue
  1151. }
  1152. return false, err
  1153. }
  1154. if len(unstructuredList.Items) > 0 {
  1155. Logf("namespace: %s, resource: %s, items remaining: %v", namespace, apiResource.Name, len(unstructuredList.Items))
  1156. contentRemaining = true
  1157. }
  1158. }
  1159. return contentRemaining, nil
  1160. }
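// ContainerInitInvariant is an InvariantFunc verifying that init-container guarantees hold between
// two consecutive versions of the same pod, in particular that a pod never regresses from
// initialized back to uninitialized.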
  1161. func ContainerInitInvariant(older, newer runtime.Object) error {
  1162. oldPod := older.(*v1.Pod)
  1163. newPod := newer.(*v1.Pod)
  1164. if len(oldPod.Spec.InitContainers) == 0 {
  1165. return nil
  1166. }
  1167. if len(oldPod.Spec.InitContainers) != len(newPod.Spec.InitContainers) {
  1168. return fmt.Errorf("init container list changed")
  1169. }
  1170. if oldPod.UID != newPod.UID {
  1171. return fmt.Errorf("two different pods exist in the condition: %s vs %s", oldPod.UID, newPod.UID)
  1172. }
  1173. if err := initContainersInvariants(oldPod); err != nil {
  1174. return err
  1175. }
  1176. if err := initContainersInvariants(newPod); err != nil {
  1177. return err
  1178. }
  1179. oldInit, _, _ := podInitialized(oldPod)
  1180. newInit, _, _ := podInitialized(newPod)
  1181. if oldInit && !newInit {
  1182. // TODO: we may in the future enable resetting PodInitialized = false if the kubelet needs to restart it
  1183. // from scratch
  1184. return fmt.Errorf("pod cannot be initialized and then regress to not being initialized")
  1185. }
  1186. return nil
  1187. }
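// podInitialized inspects a pod's init container statuses and reports whether all init containers
// have completed (ok) and whether any of them failed.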
  1188. func podInitialized(pod *v1.Pod) (ok bool, failed bool, err error) {
  1189. allInit := true
  1190. initFailed := false
  1191. for _, s := range pod.Status.InitContainerStatuses {
  1192. switch {
  1193. case initFailed && s.State.Waiting == nil:
  1194. return allInit, initFailed, fmt.Errorf("container %s is after a failed container but isn't waiting", s.Name)
  1195. case allInit && s.State.Waiting == nil:
  1196. return allInit, initFailed, fmt.Errorf("container %s is after an initializing container but isn't waiting", s.Name)
  1197. case s.State.Terminated == nil:
  1198. allInit = false
  1199. case s.State.Terminated.ExitCode != 0:
  1200. allInit = false
  1201. initFailed = true
  1202. case !s.Ready:
  1203. return allInit, initFailed, fmt.Errorf("container %s initialized but isn't marked as ready", s.Name)
  1204. }
  1205. }
  1206. return allInit, initFailed, nil
  1207. }
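// initContainersInvariants checks that a pod's regular container statuses and its PodInitialized
// condition are consistent with the state of its init containers.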
  1208. func initContainersInvariants(pod *v1.Pod) error {
  1209. allInit, initFailed, err := podInitialized(pod)
  1210. if err != nil {
  1211. return err
  1212. }
  1213. if !allInit || initFailed {
  1214. for _, s := range pod.Status.ContainerStatuses {
  1215. if s.State.Waiting == nil || s.RestartCount != 0 {
  1216. return fmt.Errorf("container %s is not waiting but initialization not complete", s.Name)
  1217. }
  1218. if s.State.Waiting.Reason != "PodInitializing" {
  1219. return fmt.Errorf("container %s should have reason PodInitializing: %s", s.Name, s.State.Waiting.Reason)
  1220. }
  1221. }
  1222. }
  1223. _, c := podutil.GetPodCondition(&pod.Status, v1.PodInitialized)
  1224. if c == nil {
  1225. return fmt.Errorf("pod does not have initialized condition")
  1226. }
  1227. if c.LastTransitionTime.IsZero() {
  1228. return fmt.Errorf("PodInitialized condition should always have a transition time")
  1229. }
  1230. switch {
  1231. case c.Status == v1.ConditionUnknown:
  1232. return fmt.Errorf("PodInitialized condition should never be Unknown")
  1233. case c.Status == v1.ConditionTrue && (initFailed || !allInit):
  1234. return fmt.Errorf("PodInitialized condition was True but all not all containers initialized")
  1235. case c.Status == v1.ConditionFalse && (!initFailed && allInit):
  1236. return fmt.Errorf("PodInitialized condition was False but all containers initialized")
  1237. }
  1238. return nil
  1239. }
  1240. type InvariantFunc func(older, newer runtime.Object) error
  1241. func CheckInvariants(events []watch.Event, fns ...InvariantFunc) error {
  1242. errs := sets.NewString()
  1243. for i := range events {
  1244. j := i + 1
  1245. if j >= len(events) {
  1246. continue
  1247. }
  1248. for _, fn := range fns {
  1249. if err := fn(events[i].Object, events[j].Object); err != nil {
  1250. errs.Insert(err.Error())
  1251. }
  1252. }
  1253. }
  1254. if errs.Len() > 0 {
  1255. return fmt.Errorf("invariants violated:\n* %s", strings.Join(errs.List(), "\n* "))
  1256. }
  1257. return nil
  1258. }
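// Hypothetical usage sketch (not part of the original file): check the init-container
// invariants defined above against a sequence of watch events collected by the caller;
// "events" is an assumed []watch.Event.
//
//	if err := CheckInvariants(events, ContainerInitInvariant); err != nil {
//		Failf("pod invariants violated: %v", err)
//	}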
1259. // Waits the default amount of time (PodStartTimeout) for the specified pod to become running.
1260. // Returns an error if the timeout occurs first, or if the pod goes into a failed state.
  1261. func WaitForPodRunningInNamespace(c clientset.Interface, pod *v1.Pod) error {
  1262. if pod.Status.Phase == v1.PodRunning {
  1263. return nil
  1264. }
  1265. return WaitTimeoutForPodRunningInNamespace(c, pod.Name, pod.Namespace, PodStartTimeout)
  1266. }
1267. // Waits the default amount of time (PodStartTimeout) for the specified pod to become running.
1268. // Returns an error if the timeout occurs first, or if the pod goes into a failed state.
  1269. func WaitForPodNameRunningInNamespace(c clientset.Interface, podName, namespace string) error {
  1270. return WaitTimeoutForPodRunningInNamespace(c, podName, namespace, PodStartTimeout)
  1271. }
1272. // Waits an extended amount of time (slowPodStartTimeout) for the specified pod to become running.
1273. // Returns an error if the timeout occurs first, or if the pod goes into a failed state.
  1275. func waitForPodRunningInNamespaceSlow(c clientset.Interface, podName, namespace string) error {
  1276. return WaitTimeoutForPodRunningInNamespace(c, podName, namespace, slowPodStartTimeout)
  1277. }
  1278. func WaitTimeoutForPodRunningInNamespace(c clientset.Interface, podName, namespace string, timeout time.Duration) error {
  1279. return wait.PollImmediate(Poll, timeout, podRunning(c, podName, namespace))
  1280. }
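// Hypothetical usage sketch (clientset "c", namespace "ns" and the pod name are
// illustrative): wait up to two minutes for a pod to reach the Running phase.
//
//	if err := WaitTimeoutForPodRunningInNamespace(c, "test-pod", ns, 2*time.Minute); err != nil {
//		Failf("pod never reached Running: %v", err)
//	}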
  1281. func podRunning(c clientset.Interface, podName, namespace string) wait.ConditionFunc {
  1282. return func() (bool, error) {
  1283. pod, err := c.CoreV1().Pods(namespace).Get(podName, metav1.GetOptions{})
  1284. if err != nil {
  1285. return false, err
  1286. }
  1287. switch pod.Status.Phase {
  1288. case v1.PodRunning:
  1289. return true, nil
  1290. case v1.PodFailed, v1.PodSucceeded:
  1291. return false, conditions.ErrPodCompleted
  1292. }
  1293. return false, nil
  1294. }
  1295. }
  1296. // Waits default amount of time (DefaultPodDeletionTimeout) for the specified pod to stop running.
  1297. // Returns an error if timeout occurs first.
  1298. func WaitForPodNoLongerRunningInNamespace(c clientset.Interface, podName, namespace string) error {
  1299. return WaitTimeoutForPodNoLongerRunningInNamespace(c, podName, namespace, DefaultPodDeletionTimeout)
  1300. }
  1301. func WaitTimeoutForPodNoLongerRunningInNamespace(c clientset.Interface, podName, namespace string, timeout time.Duration) error {
  1302. return wait.PollImmediate(Poll, timeout, podCompleted(c, podName, namespace))
  1303. }
  1304. func podCompleted(c clientset.Interface, podName, namespace string) wait.ConditionFunc {
  1305. return func() (bool, error) {
  1306. pod, err := c.CoreV1().Pods(namespace).Get(podName, metav1.GetOptions{})
  1307. if err != nil {
  1308. return false, err
  1309. }
  1310. switch pod.Status.Phase {
  1311. case v1.PodFailed, v1.PodSucceeded:
  1312. return true, nil
  1313. }
  1314. return false, nil
  1315. }
  1316. }
  1317. func waitTimeoutForPodReadyInNamespace(c clientset.Interface, podName, namespace string, timeout time.Duration) error {
  1318. return wait.PollImmediate(Poll, timeout, podRunningAndReady(c, podName, namespace))
  1319. }
  1320. func podRunningAndReady(c clientset.Interface, podName, namespace string) wait.ConditionFunc {
  1321. return func() (bool, error) {
  1322. pod, err := c.CoreV1().Pods(namespace).Get(podName, metav1.GetOptions{})
  1323. if err != nil {
  1324. return false, err
  1325. }
  1326. switch pod.Status.Phase {
  1327. case v1.PodFailed, v1.PodSucceeded:
  1328. return false, conditions.ErrPodCompleted
  1329. case v1.PodRunning:
  1330. return podutil.IsPodReady(pod), nil
  1331. }
  1332. return false, nil
  1333. }
  1334. }
1335. // WaitForPodNotPending returns an error if it took too long for the pod to go out of the pending state.
  1338. func WaitForPodNotPending(c clientset.Interface, ns, podName string) error {
  1339. return wait.PollImmediate(Poll, PodStartTimeout, podNotPending(c, podName, ns))
  1340. }
  1341. func podNotPending(c clientset.Interface, podName, namespace string) wait.ConditionFunc {
  1342. return func() (bool, error) {
  1343. pod, err := c.CoreV1().Pods(namespace).Get(podName, metav1.GetOptions{})
  1344. if err != nil {
  1345. return false, err
  1346. }
  1347. switch pod.Status.Phase {
  1348. case v1.PodPending:
  1349. return false, nil
  1350. default:
  1351. return true, nil
  1352. }
  1353. }
  1354. }
  1355. // waitForPodTerminatedInNamespace returns an error if it takes too long for the pod to terminate,
  1356. // if the pod Get api returns an error (IsNotFound or other), or if the pod failed (and thus did not
  1357. // terminate) with an unexpected reason. Typically called to test that the passed-in pod is fully
  1358. // terminated (reason==""), but may be called to detect if a pod did *not* terminate according to
  1359. // the supplied reason.
  1360. func waitForPodTerminatedInNamespace(c clientset.Interface, podName, reason, namespace string) error {
  1361. return WaitForPodCondition(c, namespace, podName, "terminated due to deadline exceeded", PodStartTimeout, func(pod *v1.Pod) (bool, error) {
  1362. // Only consider Failed pods. Successful pods will be deleted and detected in
  1363. // waitForPodCondition's Get call returning `IsNotFound`
  1364. if pod.Status.Phase == v1.PodFailed {
  1365. if pod.Status.Reason == reason { // short-circuit waitForPodCondition's loop
  1366. return true, nil
  1367. } else {
  1368. return true, fmt.Errorf("Expected pod %q in namespace %q to be terminated with reason %q, got reason: %q", podName, namespace, reason, pod.Status.Reason)
  1369. }
  1370. }
  1371. return false, nil
  1372. })
  1373. }
  1374. // waitForPodNotFoundInNamespace returns an error if it takes too long for the pod to fully terminate.
  1375. // Unlike `waitForPodTerminatedInNamespace`, the pod's Phase and Reason are ignored. If the pod Get
  1376. // api returns IsNotFound then the wait stops and nil is returned. If the Get api returns an error other
  1377. // than "not found" then that error is returned and the wait stops.
  1378. func waitForPodNotFoundInNamespace(c clientset.Interface, podName, ns string, timeout time.Duration) error {
  1379. return wait.PollImmediate(Poll, timeout, func() (bool, error) {
  1380. _, err := c.CoreV1().Pods(ns).Get(podName, metav1.GetOptions{})
  1381. if apierrs.IsNotFound(err) {
  1382. return true, nil // done
  1383. }
  1384. if err != nil {
  1385. return true, err // stop wait with error
  1386. }
  1387. return false, nil
  1388. })
  1389. }
  1390. // waitForPodSuccessInNamespaceTimeout returns nil if the pod reached state success, or an error if it reached failure or ran too long.
  1391. func waitForPodSuccessInNamespaceTimeout(c clientset.Interface, podName string, namespace string, timeout time.Duration) error {
  1392. return WaitForPodCondition(c, namespace, podName, "success or failure", timeout, func(pod *v1.Pod) (bool, error) {
  1393. if pod.Spec.RestartPolicy == v1.RestartPolicyAlways {
  1394. return true, fmt.Errorf("pod %q will never terminate with a succeeded state since its restart policy is Always", podName)
  1395. }
  1396. switch pod.Status.Phase {
  1397. case v1.PodSucceeded:
  1398. By("Saw pod success")
  1399. return true, nil
  1400. case v1.PodFailed:
  1401. return true, fmt.Errorf("pod %q failed with status: %+v", podName, pod.Status)
  1402. default:
  1403. return false, nil
  1404. }
  1405. })
  1406. }
1407. // WaitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or did not succeed within PodStartTimeout.
  1408. func WaitForPodSuccessInNamespace(c clientset.Interface, podName string, namespace string) error {
  1409. return waitForPodSuccessInNamespaceTimeout(c, podName, namespace, PodStartTimeout)
  1410. }
1411. // WaitForPodSuccessInNamespaceSlow returns nil if the pod reached state success, or an error if it reached failure or did not succeed within slowPodStartTimeout.
  1412. func WaitForPodSuccessInNamespaceSlow(c clientset.Interface, podName string, namespace string) error {
  1413. return waitForPodSuccessInNamespaceTimeout(c, podName, namespace, slowPodStartTimeout)
  1414. }
  1415. // WaitForRCToStabilize waits till the RC has a matching generation/replica count between spec and status.
  1416. func WaitForRCToStabilize(c clientset.Interface, ns, name string, timeout time.Duration) error {
  1417. options := metav1.ListOptions{FieldSelector: fields.Set{
  1418. "metadata.name": name,
  1419. "metadata.namespace": ns,
  1420. }.AsSelector().String()}
  1421. w, err := c.CoreV1().ReplicationControllers(ns).Watch(options)
  1422. if err != nil {
  1423. return err
  1424. }
  1425. _, err = watch.Until(timeout, w, func(event watch.Event) (bool, error) {
  1426. switch event.Type {
  1427. case watch.Deleted:
  1428. return false, apierrs.NewNotFound(schema.GroupResource{Resource: "replicationcontrollers"}, "")
  1429. }
  1430. switch rc := event.Object.(type) {
  1431. case *v1.ReplicationController:
  1432. if rc.Name == name && rc.Namespace == ns &&
  1433. rc.Generation <= rc.Status.ObservedGeneration &&
  1434. *(rc.Spec.Replicas) == rc.Status.Replicas {
  1435. return true, nil
  1436. }
  1437. Logf("Waiting for rc %s to stabilize, generation %v observed generation %v spec.replicas %d status.replicas %d",
  1438. name, rc.Generation, rc.Status.ObservedGeneration, *(rc.Spec.Replicas), rc.Status.Replicas)
  1439. }
  1440. return false, nil
  1441. })
  1442. return err
  1443. }
  1444. func WaitForPodToDisappear(c clientset.Interface, ns, podName string, label labels.Selector, interval, timeout time.Duration) error {
  1445. return wait.PollImmediate(interval, timeout, func() (bool, error) {
  1446. Logf("Waiting for pod %s to disappear", podName)
  1447. options := metav1.ListOptions{LabelSelector: label.String()}
  1448. pods, err := c.CoreV1().Pods(ns).List(options)
  1449. if err != nil {
  1450. if testutils.IsRetryableAPIError(err) {
  1451. return false, nil
  1452. }
  1453. return false, err
  1454. }
  1455. found := false
  1456. for _, pod := range pods.Items {
  1457. if pod.Name == podName {
  1458. Logf("Pod %s still exists", podName)
  1459. found = true
  1460. break
  1461. }
  1462. }
  1463. if !found {
  1464. Logf("Pod %s no longer exists", podName)
  1465. return true, nil
  1466. }
  1467. return false, nil
  1468. })
  1469. }
  1470. // WaitForPodNameUnschedulableInNamespace returns an error if it takes too long for the pod to become Pending
  1471. // and have condition Status equal to Unschedulable,
  1472. // if the pod Get api returns an error (IsNotFound or other), or if the pod failed with an unexpected reason.
  1473. // Typically called to test that the passed-in pod is Pending and Unschedulable.
  1474. func WaitForPodNameUnschedulableInNamespace(c clientset.Interface, podName, namespace string) error {
  1475. return WaitForPodCondition(c, namespace, podName, "Unschedulable", PodStartTimeout, func(pod *v1.Pod) (bool, error) {
1476. // Only consider pods that are still Pending; a pod that reached Running,
1477. // Succeeded or Failed was scheduled (or completed) and is reported as an error below.
  1478. if pod.Status.Phase == v1.PodPending {
  1479. for _, cond := range pod.Status.Conditions {
  1480. if cond.Type == v1.PodScheduled && cond.Status == v1.ConditionFalse && cond.Reason == "Unschedulable" {
  1481. return true, nil
  1482. }
  1483. }
  1484. }
  1485. if pod.Status.Phase == v1.PodRunning || pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
  1486. return true, fmt.Errorf("Expected pod %q in namespace %q to be in phase Pending, but got phase: %v", podName, namespace, pod.Status.Phase)
  1487. }
  1488. return false, nil
  1489. })
  1490. }
  1491. // WaitForService waits until the service appears (exist == true), or disappears (exist == false)
  1492. func WaitForService(c clientset.Interface, namespace, name string, exist bool, interval, timeout time.Duration) error {
  1493. err := wait.PollImmediate(interval, timeout, func() (bool, error) {
  1494. _, err := c.CoreV1().Services(namespace).Get(name, metav1.GetOptions{})
  1495. switch {
  1496. case err == nil:
  1497. Logf("Service %s in namespace %s found.", name, namespace)
  1498. return exist, nil
  1499. case apierrs.IsNotFound(err):
  1500. Logf("Service %s in namespace %s disappeared.", name, namespace)
  1501. return !exist, nil
  1502. case !testutils.IsRetryableAPIError(err):
  1503. Logf("Non-retryable failure while getting service.")
  1504. return false, err
  1505. default:
  1506. Logf("Get service %s in namespace %s failed: %v", name, namespace, err)
  1507. return false, nil
  1508. }
  1509. })
  1510. if err != nil {
  1511. stateMsg := map[bool]string{true: "to appear", false: "to disappear"}
  1512. return fmt.Errorf("error waiting for service %s/%s %s: %v", namespace, name, stateMsg[exist], err)
  1513. }
  1514. return nil
  1515. }
  1516. // WaitForServiceWithSelector waits until any service with given selector appears (exist == true), or disappears (exist == false)
  1517. func WaitForServiceWithSelector(c clientset.Interface, namespace string, selector labels.Selector, exist bool, interval,
  1518. timeout time.Duration) error {
  1519. err := wait.PollImmediate(interval, timeout, func() (bool, error) {
  1520. services, err := c.CoreV1().Services(namespace).List(metav1.ListOptions{LabelSelector: selector.String()})
  1521. switch {
1522. case err == nil && len(services.Items) != 0:
1523. Logf("Service with %s in namespace %s found.", selector.String(), namespace)
1524. return exist, nil
1525. case err == nil && len(services.Items) == 0:
  1526. Logf("Service with %s in namespace %s disappeared.", selector.String(), namespace)
  1527. return !exist, nil
  1528. case !testutils.IsRetryableAPIError(err):
  1529. Logf("Non-retryable failure while listing service.")
  1530. return false, err
  1531. default:
  1532. Logf("List service with %s in namespace %s failed: %v", selector.String(), namespace, err)
  1533. return false, nil
  1534. }
  1535. })
  1536. if err != nil {
  1537. stateMsg := map[bool]string{true: "to appear", false: "to disappear"}
  1538. return fmt.Errorf("error waiting for service with %s in namespace %s %s: %v", selector.String(), namespace, stateMsg[exist], err)
  1539. }
  1540. return nil
  1541. }
1542. // WaitForServiceEndpointsNum waits until the number of endpoints that implement the service equals expectNum.
  1543. func WaitForServiceEndpointsNum(c clientset.Interface, namespace, serviceName string, expectNum int, interval, timeout time.Duration) error {
  1544. return wait.Poll(interval, timeout, func() (bool, error) {
  1545. Logf("Waiting for amount of service:%s endpoints to be %d", serviceName, expectNum)
  1546. list, err := c.CoreV1().Endpoints(namespace).List(metav1.ListOptions{})
  1547. if err != nil {
  1548. return false, err
  1549. }
  1550. for _, e := range list.Items {
  1551. if e.Name == serviceName && countEndpointsNum(&e) == expectNum {
  1552. return true, nil
  1553. }
  1554. }
  1555. return false, nil
  1556. })
  1557. }
  1558. func countEndpointsNum(e *v1.Endpoints) int {
  1559. num := 0
  1560. for _, sub := range e.Subsets {
  1561. num += len(sub.Addresses)
  1562. }
  1563. return num
  1564. }
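// Hypothetical usage sketch (service name, endpoint count and timeouts are illustrative):
// block until the service's Endpoints object reports the expected number of addresses.
//
//	ExpectNoError(WaitForServiceEndpointsNum(c, ns, "my-service", 3, 2*time.Second, 2*time.Minute))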
  1565. func WaitForEndpoint(c clientset.Interface, ns, name string) error {
  1566. for t := time.Now(); time.Since(t) < EndpointRegisterTimeout; time.Sleep(Poll) {
  1567. endpoint, err := c.CoreV1().Endpoints(ns).Get(name, metav1.GetOptions{})
  1568. if apierrs.IsNotFound(err) {
  1569. Logf("Endpoint %s/%s is not ready yet", ns, name)
  1570. continue
  1571. }
  1572. Expect(err).NotTo(HaveOccurred())
  1573. if len(endpoint.Subsets) == 0 || len(endpoint.Subsets[0].Addresses) == 0 {
  1574. Logf("Endpoint %s/%s is not ready yet", ns, name)
  1575. continue
  1576. } else {
  1577. return nil
  1578. }
  1579. }
  1580. return fmt.Errorf("Failed to get endpoints for %s/%s", ns, name)
  1581. }
1582. // podProxyResponseChecker is a context for checking pod responses by issuing GETs to them
1583. // (via the API proxy) and verifying that they answer with their own pod name.
  1584. type podProxyResponseChecker struct {
  1585. c clientset.Interface
  1586. ns string
  1587. label labels.Selector
  1588. controllerName string
  1589. respondName bool // Whether the pod should respond with its own name.
  1590. pods *v1.PodList
  1591. }
  1592. func PodProxyResponseChecker(c clientset.Interface, ns string, label labels.Selector, controllerName string, respondName bool, pods *v1.PodList) podProxyResponseChecker {
  1593. return podProxyResponseChecker{c, ns, label, controllerName, respondName, pods}
  1594. }
1595. // CheckAllResponses issues GETs to all pods in the context and verifies that they
1596. // reply with their own pod name.
  1597. func (r podProxyResponseChecker) CheckAllResponses() (done bool, err error) {
  1598. successes := 0
  1599. options := metav1.ListOptions{LabelSelector: r.label.String()}
  1600. currentPods, err := r.c.CoreV1().Pods(r.ns).List(options)
  1601. Expect(err).NotTo(HaveOccurred())
  1602. for i, pod := range r.pods.Items {
  1603. // Check that the replica list remains unchanged, otherwise we have problems.
  1604. if !isElementOf(pod.UID, currentPods) {
  1605. return false, fmt.Errorf("pod with UID %s is no longer a member of the replica set. Must have been restarted for some reason. Current replica set: %v", pod.UID, currentPods)
  1606. }
  1607. ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
  1608. defer cancel()
  1609. body, err := r.c.CoreV1().RESTClient().Get().
  1610. Context(ctx).
  1611. Namespace(r.ns).
  1612. Resource("pods").
  1613. SubResource("proxy").
  1614. Name(string(pod.Name)).
  1615. Do().
  1616. Raw()
  1617. if err != nil {
  1618. if ctx.Err() != nil {
  1619. // We may encounter errors here because of a race between the pod readiness and apiserver
  1620. // proxy. So, we log the error and retry if this occurs.
  1621. Logf("Controller %s: Failed to Get from replica %d [%s]: %v\n pod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
  1622. return false, nil
  1623. }
  1624. Logf("Controller %s: Failed to GET from replica %d [%s]: %v\npod status: %#v", r.controllerName, i+1, pod.Name, err, pod.Status)
  1625. continue
  1626. }
  1627. // The response checker expects the pod's name unless !respondName, in
  1628. // which case it just checks for a non-empty response.
  1629. got := string(body)
  1630. what := ""
  1631. if r.respondName {
  1632. what = "expected"
  1633. want := pod.Name
  1634. if got != want {
  1635. Logf("Controller %s: Replica %d [%s] expected response %q but got %q",
  1636. r.controllerName, i+1, pod.Name, want, got)
  1637. continue
  1638. }
  1639. } else {
  1640. what = "non-empty"
  1641. if len(got) == 0 {
  1642. Logf("Controller %s: Replica %d [%s] expected non-empty response",
  1643. r.controllerName, i+1, pod.Name)
  1644. continue
  1645. }
  1646. }
  1647. successes++
  1648. Logf("Controller %s: Got %s result from replica %d [%s]: %q, %d of %d required successes so far",
  1649. r.controllerName, what, i+1, pod.Name, got, successes, len(r.pods.Items))
  1650. }
  1651. if successes < len(r.pods.Items) {
  1652. return false, nil
  1653. }
  1654. return true, nil
  1655. }
1656. // ServerVersionGTE returns true if the server version is greater than or
1657. // equal to v.
  1658. //
  1659. // TODO(18726): This should be incorporated into client.VersionInterface.
  1660. func ServerVersionGTE(v *utilversion.Version, c discovery.ServerVersionInterface) (bool, error) {
  1661. serverVersion, err := c.ServerVersion()
  1662. if err != nil {
  1663. return false, fmt.Errorf("Unable to get server version: %v", err)
  1664. }
  1665. sv, err := utilversion.ParseSemantic(serverVersion.GitVersion)
  1666. if err != nil {
  1667. return false, fmt.Errorf("Unable to parse server version %q: %v", serverVersion.GitVersion, err)
  1668. }
  1669. return sv.AtLeast(v), nil
  1670. }
  1671. func SkipUnlessKubectlVersionGTE(v *utilversion.Version) {
  1672. gte, err := KubectlVersionGTE(v)
  1673. if err != nil {
  1674. Failf("Failed to get kubectl version: %v", err)
  1675. }
  1676. if !gte {
  1677. Skipf("Not supported for kubectl versions before %q", v)
  1678. }
  1679. }
  1680. // KubectlVersionGTE returns true if the kubectl version is greater than or
  1681. // equal to v.
  1682. func KubectlVersionGTE(v *utilversion.Version) (bool, error) {
  1683. kv, err := KubectlVersion()
  1684. if err != nil {
  1685. return false, err
  1686. }
  1687. return kv.AtLeast(v), nil
  1688. }
  1689. // KubectlVersion gets the version of kubectl that's currently being used (see
  1690. // --kubectl-path in e2e.go to use an alternate kubectl).
  1691. func KubectlVersion() (*utilversion.Version, error) {
  1692. output := RunKubectlOrDie("version", "--client")
  1693. matches := gitVersionRegexp.FindStringSubmatch(output)
  1694. if len(matches) != 2 {
  1695. return nil, fmt.Errorf("Could not find kubectl version in output %v", output)
  1696. }
  1697. // Don't use the full match, as it contains "GitVersion:\"" and a
  1698. // trailing "\"". Just use the submatch.
  1699. return utilversion.ParseSemantic(matches[1])
  1700. }
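// Hypothetical usage sketch (the version string is illustrative, and assumes
// utilversion.MustParseSemantic is available): skip a test when the kubectl
// under test is too old.
//
//	SkipUnlessKubectlVersionGTE(utilversion.MustParseSemantic("v1.10.0"))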
  1701. func PodsResponding(c clientset.Interface, ns, name string, wantName bool, pods *v1.PodList) error {
  1702. By("trying to dial each unique pod")
  1703. label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
  1704. return wait.PollImmediate(Poll, podRespondingTimeout, PodProxyResponseChecker(c, ns, label, name, wantName, pods).CheckAllResponses)
  1705. }
  1706. func PodsCreated(c clientset.Interface, ns, name string, replicas int32) (*v1.PodList, error) {
  1707. label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
  1708. return PodsCreatedByLabel(c, ns, name, replicas, label)
  1709. }
  1710. func PodsCreatedByLabel(c clientset.Interface, ns, name string, replicas int32, label labels.Selector) (*v1.PodList, error) {
  1711. timeout := 2 * time.Minute
  1712. for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
  1713. options := metav1.ListOptions{LabelSelector: label.String()}
  1714. // List the pods, making sure we observe all the replicas.
  1715. pods, err := c.CoreV1().Pods(ns).List(options)
  1716. if err != nil {
  1717. return nil, err
  1718. }
  1719. created := []v1.Pod{}
  1720. for _, pod := range pods.Items {
  1721. if pod.DeletionTimestamp != nil {
  1722. continue
  1723. }
  1724. created = append(created, pod)
  1725. }
  1726. Logf("Pod name %s: Found %d pods out of %d", name, len(created), replicas)
  1727. if int32(len(created)) == replicas {
  1728. pods.Items = created
  1729. return pods, nil
  1730. }
  1731. }
  1732. return nil, fmt.Errorf("Pod name %s: Gave up waiting %v for %d pods to come up", name, timeout, replicas)
  1733. }
  1734. func podsRunning(c clientset.Interface, pods *v1.PodList) []error {
  1735. // Wait for the pods to enter the running state. Waiting loops until the pods
  1736. // are running so non-running pods cause a timeout for this test.
  1737. By("ensuring each pod is running")
  1738. e := []error{}
1739. errorChan := make(chan error)
1740. for _, pod := range pods.Items {
1741. go func(p v1.Pod) {
1742. errorChan <- WaitForPodRunningInNamespace(c, &p)
1743. }(pod)
1744. }
1745. for range pods.Items {
1746. err := <-errorChan
  1747. if err != nil {
  1748. e = append(e, err)
  1749. }
  1750. }
  1751. return e
  1752. }
  1753. func VerifyPods(c clientset.Interface, ns, name string, wantName bool, replicas int32) error {
  1754. return podRunningMaybeResponding(c, ns, name, wantName, replicas, true)
  1755. }
  1756. func VerifyPodsRunning(c clientset.Interface, ns, name string, wantName bool, replicas int32) error {
  1757. return podRunningMaybeResponding(c, ns, name, wantName, replicas, false)
  1758. }
  1759. func podRunningMaybeResponding(c clientset.Interface, ns, name string, wantName bool, replicas int32, checkResponding bool) error {
  1760. pods, err := PodsCreated(c, ns, name, replicas)
  1761. if err != nil {
  1762. return err
  1763. }
  1764. e := podsRunning(c, pods)
  1765. if len(e) > 0 {
  1766. return fmt.Errorf("failed to wait for pods running: %v", e)
  1767. }
  1768. if checkResponding {
  1769. err = PodsResponding(c, ns, name, wantName, pods)
  1770. if err != nil {
  1771. return fmt.Errorf("failed to wait for pods responding: %v", err)
  1772. }
  1773. }
  1774. return nil
  1775. }
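// Hypothetical usage sketch (controller name and replica count are illustrative):
// after creating an RC named "webserver" with 3 replicas, confirm the pods are
// running and respond via the API proxy with their own names.
//
//	ExpectNoError(VerifyPods(c, ns, "webserver", true, 3))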
  1776. func ServiceResponding(c clientset.Interface, ns, name string) error {
  1777. By(fmt.Sprintf("trying to dial the service %s.%s via the proxy", ns, name))
  1778. return wait.PollImmediate(Poll, ServiceRespondingTimeout, func() (done bool, err error) {
  1779. proxyRequest, errProxy := GetServicesProxyRequest(c, c.CoreV1().RESTClient().Get())
  1780. if errProxy != nil {
  1781. Logf("Failed to get services proxy request: %v:", errProxy)
  1782. return false, nil
  1783. }
  1784. ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
  1785. defer cancel()
  1786. body, err := proxyRequest.Namespace(ns).
  1787. Context(ctx).
  1788. Name(name).
  1789. Do().
  1790. Raw()
  1791. if err != nil {
  1792. if ctx.Err() != nil {
  1793. Failf("Failed to GET from service %s: %v", name, err)
  1794. return true, err
  1795. }
  1796. Logf("Failed to GET from service %s: %v:", name, err)
  1797. return false, nil
  1798. }
  1799. got := string(body)
  1800. if len(got) == 0 {
  1801. Logf("Service %s: expected non-empty response", name)
1802. return false, nil // err is nil here; keep polling for a non-empty response
  1803. }
  1804. Logf("Service %s: found nonempty answer: %s", name, got)
  1805. return true, nil
  1806. })
  1807. }
  1808. func RestclientConfig(kubeContext string) (*clientcmdapi.Config, error) {
  1809. Logf(">>> kubeConfig: %s", TestContext.KubeConfig)
  1810. if TestContext.KubeConfig == "" {
  1811. return nil, fmt.Errorf("KubeConfig must be specified to load client config")
  1812. }
  1813. c, err := clientcmd.LoadFromFile(TestContext.KubeConfig)
  1814. if err != nil {
  1815. return nil, fmt.Errorf("error loading KubeConfig: %v", err.Error())
  1816. }
  1817. if kubeContext != "" {
  1818. Logf(">>> kubeContext: %s", kubeContext)
  1819. c.CurrentContext = kubeContext
  1820. }
  1821. return c, nil
  1822. }
  1823. type ClientConfigGetter func() (*restclient.Config, error)
  1824. func LoadConfig() (*restclient.Config, error) {
  1825. if TestContext.NodeE2E {
  1826. // This is a node e2e test, apply the node e2e configuration
  1827. return &restclient.Config{Host: TestContext.Host}, nil
  1828. }
  1829. c, err := RestclientConfig(TestContext.KubeContext)
  1830. if err != nil {
  1831. if TestContext.KubeConfig == "" {
  1832. return restclient.InClusterConfig()
  1833. } else {
  1834. return nil, err
  1835. }
  1836. }
  1837. return clientcmd.NewDefaultClientConfig(*c, &clientcmd.ConfigOverrides{ClusterInfo: clientcmdapi.Cluster{Server: TestContext.Host}}).ClientConfig()
  1838. }
  1839. func LoadInternalClientset() (*internalclientset.Clientset, error) {
  1840. config, err := LoadConfig()
  1841. if err != nil {
  1842. return nil, fmt.Errorf("error creating client: %v", err.Error())
  1843. }
  1844. return internalclientset.NewForConfig(config)
  1845. }
  1846. func LoadClientset() (*clientset.Clientset, error) {
  1847. config, err := LoadConfig()
  1848. if err != nil {
  1849. return nil, fmt.Errorf("error creating client: %v", err.Error())
  1850. }
  1851. return clientset.NewForConfig(config)
  1852. }
1853. // randomSuffix provides a random string to append to pods, services, and rcs.
  1854. // TODO: Allow service names to have the same form as names
  1855. // for pods and replication controllers so we don't
  1856. // need to use such a function and can instead
  1857. // use the UUID utility function.
  1858. func randomSuffix() string {
  1859. r := rand.New(rand.NewSource(time.Now().UnixNano()))
  1860. return strconv.Itoa(r.Int() % 10000)
  1861. }
  1862. func ExpectNoError(err error, explain ...interface{}) {
  1863. ExpectNoErrorWithOffset(1, err, explain...)
  1864. }
  1865. // ExpectNoErrorWithOffset checks if "err" is set, and if so, fails assertion while logging the error at "offset" levels above its caller
  1866. // (for example, for call chain f -> g -> ExpectNoErrorWithOffset(1, ...) error would be logged for "f").
  1867. func ExpectNoErrorWithOffset(offset int, err error, explain ...interface{}) {
  1868. if err != nil {
  1869. Logf("Unexpected error occurred: %v", err)
  1870. }
  1871. ExpectWithOffset(1+offset, err).NotTo(HaveOccurred(), explain...)
  1872. }
  1873. func ExpectNoErrorWithRetries(fn func() error, maxRetries int, explain ...interface{}) {
  1874. var err error
  1875. for i := 0; i < maxRetries; i++ {
  1876. err = fn()
  1877. if err == nil {
  1878. return
  1879. }
  1880. Logf("(Attempt %d of %d) Unexpected error occurred: %v", i+1, maxRetries, err)
  1881. }
  1882. ExpectWithOffset(1, err).NotTo(HaveOccurred(), explain...)
  1883. }
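// Hypothetical usage sketch (the closure body and "cm" are illustrative): retry a
// flaky but idempotent API call a few times before failing the test.
//
//	ExpectNoErrorWithRetries(func() error {
//		_, err := c.CoreV1().ConfigMaps(ns).Update(cm)
//		return err
//	}, 3, "updating configmap")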
1884. // Cleanup deletes everything defined in filePath from namespace ns and checks that everything matching the given selectors in that namespace has been cleaned up.
  1885. func Cleanup(filePath, ns string, selectors ...string) {
  1886. By("using delete to clean up resources")
  1887. var nsArg string
  1888. if ns != "" {
  1889. nsArg = fmt.Sprintf("--namespace=%s", ns)
  1890. }
  1891. RunKubectlOrDie("delete", "--grace-period=0", "-f", filePath, nsArg)
  1892. AssertCleanup(ns, selectors...)
  1893. }
1894. // AssertCleanup asserts that cleanup of the namespace with respect to the given selectors has occurred.
  1895. func AssertCleanup(ns string, selectors ...string) {
  1896. var nsArg string
  1897. if ns != "" {
  1898. nsArg = fmt.Sprintf("--namespace=%s", ns)
  1899. }
  1900. var e error
  1901. verifyCleanupFunc := func() (bool, error) {
  1902. e = nil
  1903. for _, selector := range selectors {
  1904. resources := RunKubectlOrDie("get", "rc,svc", "-l", selector, "--no-headers", nsArg)
  1905. if resources != "" {
  1906. e = fmt.Errorf("Resources left running after stop:\n%s", resources)
  1907. return false, nil
  1908. }
  1909. pods := RunKubectlOrDie("get", "pods", "-l", selector, nsArg, "-o", "go-template={{ range .items }}{{ if not .metadata.deletionTimestamp }}{{ .metadata.name }}{{ \"\\n\" }}{{ end }}{{ end }}")
  1910. if pods != "" {
  1911. e = fmt.Errorf("Pods left unterminated after stop:\n%s", pods)
  1912. return false, nil
  1913. }
  1914. }
  1915. return true, nil
  1916. }
  1917. err := wait.PollImmediate(500*time.Millisecond, 1*time.Minute, verifyCleanupFunc)
  1918. if err != nil {
  1919. Failf(e.Error())
  1920. }
  1921. }
  1922. // KubectlCmd runs the kubectl executable through the wrapper script.
  1923. func KubectlCmd(args ...string) *exec.Cmd {
  1924. defaultArgs := []string{}
  1925. // Reference a --server option so tests can run anywhere.
  1926. if TestContext.Host != "" {
  1927. defaultArgs = append(defaultArgs, "--"+clientcmd.FlagAPIServer+"="+TestContext.Host)
  1928. }
  1929. if TestContext.KubeConfig != "" {
  1930. defaultArgs = append(defaultArgs, "--"+clientcmd.RecommendedConfigPathFlag+"="+TestContext.KubeConfig)
  1931. // Reference the KubeContext
  1932. if TestContext.KubeContext != "" {
  1933. defaultArgs = append(defaultArgs, "--"+clientcmd.FlagContext+"="+TestContext.KubeContext)
  1934. }
  1935. } else {
  1936. if TestContext.CertDir != "" {
  1937. defaultArgs = append(defaultArgs,
  1938. fmt.Sprintf("--certificate-authority=%s", filepath.Join(TestContext.CertDir, "ca.crt")),
  1939. fmt.Sprintf("--client-certificate=%s", filepath.Join(TestContext.CertDir, "kubecfg.crt")),
  1940. fmt.Sprintf("--client-key=%s", filepath.Join(TestContext.CertDir, "kubecfg.key")))
  1941. }
  1942. }
  1943. kubectlArgs := append(defaultArgs, args...)
  1944. //We allow users to specify path to kubectl, so you can test either "kubectl" or "cluster/kubectl.sh"
  1945. //and so on.
  1946. cmd := exec.Command(TestContext.KubectlPath, kubectlArgs...)
  1947. //caller will invoke this and wait on it.
  1948. return cmd
  1949. }
  1950. // kubectlBuilder is used to build, customize and execute a kubectl Command.
  1951. // Add more functions to customize the builder as needed.
  1952. type kubectlBuilder struct {
  1953. cmd *exec.Cmd
  1954. timeout <-chan time.Time
  1955. }
  1956. func NewKubectlCommand(args ...string) *kubectlBuilder {
  1957. b := new(kubectlBuilder)
  1958. b.cmd = KubectlCmd(args...)
  1959. return b
  1960. }
  1961. func (b *kubectlBuilder) WithEnv(env []string) *kubectlBuilder {
  1962. b.cmd.Env = env
  1963. return b
  1964. }
  1965. func (b *kubectlBuilder) WithTimeout(t <-chan time.Time) *kubectlBuilder {
  1966. b.timeout = t
  1967. return b
  1968. }
  1969. func (b kubectlBuilder) WithStdinData(data string) *kubectlBuilder {
  1970. b.cmd.Stdin = strings.NewReader(data)
  1971. return &b
  1972. }
  1973. func (b kubectlBuilder) WithStdinReader(reader io.Reader) *kubectlBuilder {
  1974. b.cmd.Stdin = reader
  1975. return &b
  1976. }
  1977. func (b kubectlBuilder) ExecOrDie() string {
  1978. str, err := b.Exec()
  1979. // In case of i/o timeout error, try talking to the apiserver again after 2s before dying.
  1980. // Note that we're still dying after retrying so that we can get visibility to triage it further.
  1981. if isTimeout(err) {
  1982. Logf("Hit i/o timeout error, talking to the server 2s later to see if it's temporary.")
  1983. time.Sleep(2 * time.Second)
  1984. retryStr, retryErr := RunKubectl("version")
  1985. Logf("stdout: %q", retryStr)
  1986. Logf("err: %v", retryErr)
  1987. }
  1988. Expect(err).NotTo(HaveOccurred())
  1989. return str
  1990. }
  1991. func isTimeout(err error) bool {
  1992. switch err := err.(type) {
  1993. case net.Error:
  1994. if err.Timeout() {
  1995. return true
  1996. }
  1997. case *url.Error:
  1998. if err, ok := err.Err.(net.Error); ok && err.Timeout() {
  1999. return true
  2000. }
  2001. }
  2002. return false
  2003. }
  2004. func (b kubectlBuilder) Exec() (string, error) {
  2005. var stdout, stderr bytes.Buffer
  2006. cmd := b.cmd
  2007. cmd.Stdout, cmd.Stderr = &stdout, &stderr
  2008. Logf("Running '%s %s'", cmd.Path, strings.Join(cmd.Args[1:], " ")) // skip arg[0] as it is printed separately
  2009. if err := cmd.Start(); err != nil {
  2010. return "", fmt.Errorf("error starting %v:\nCommand stdout:\n%v\nstderr:\n%v\nerror:\n%v\n", cmd, cmd.Stdout, cmd.Stderr, err)
  2011. }
  2012. errCh := make(chan error, 1)
  2013. go func() {
  2014. errCh <- cmd.Wait()
  2015. }()
  2016. select {
  2017. case err := <-errCh:
  2018. if err != nil {
  2019. var rc int = 127
  2020. if ee, ok := err.(*exec.ExitError); ok {
  2021. rc = int(ee.Sys().(syscall.WaitStatus).ExitStatus())
  2022. Logf("rc: %d", rc)
  2023. }
  2024. return "", uexec.CodeExitError{
  2025. Err: fmt.Errorf("error running %v:\nCommand stdout:\n%v\nstderr:\n%v\nerror:\n%v\n", cmd, cmd.Stdout, cmd.Stderr, err),
  2026. Code: rc,
  2027. }
  2028. }
  2029. case <-b.timeout:
  2030. b.cmd.Process.Kill()
  2031. return "", fmt.Errorf("timed out waiting for command %v:\nCommand stdout:\n%v\nstderr:\n%v\n", cmd, cmd.Stdout, cmd.Stderr)
  2032. }
  2033. Logf("stderr: %q", stderr.String())
  2034. Logf("stdout: %q", stdout.String())
  2035. return stdout.String(), nil
  2036. }
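// Hypothetical usage sketch (arguments and "podYAML" are illustrative): run kubectl
// with stdin data and an explicit timeout instead of the package-level helpers.
//
//	out, err := NewKubectlCommand("apply", "-f", "-", fmt.Sprintf("--namespace=%s", ns)).
//		WithStdinData(podYAML).
//		WithTimeout(time.After(30 * time.Second)).
//		Exec()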
  2037. // RunKubectlOrDie is a convenience wrapper over kubectlBuilder
  2038. func RunKubectlOrDie(args ...string) string {
  2039. return NewKubectlCommand(args...).ExecOrDie()
  2040. }
  2041. // RunKubectl is a convenience wrapper over kubectlBuilder
  2042. func RunKubectl(args ...string) (string, error) {
  2043. return NewKubectlCommand(args...).Exec()
  2044. }
  2045. // RunKubectlOrDieInput is a convenience wrapper over kubectlBuilder that takes input to stdin
  2046. func RunKubectlOrDieInput(data string, args ...string) string {
  2047. return NewKubectlCommand(args...).WithStdinData(data).ExecOrDie()
  2048. }
  2049. // RunKubemciWithKubeconfig is a convenience wrapper over RunKubemciCmd
  2050. func RunKubemciWithKubeconfig(args ...string) (string, error) {
  2051. if TestContext.KubeConfig != "" {
  2052. args = append(args, "--"+clientcmd.RecommendedConfigPathFlag+"="+TestContext.KubeConfig)
  2053. }
  2054. return RunKubemciCmd(args...)
  2055. }
  2056. // RunKubemciCmd is a convenience wrapper over kubectlBuilder to run kubemci.
  2057. // It assumes that kubemci exists in PATH.
  2058. func RunKubemciCmd(args ...string) (string, error) {
  2059. // kubemci is assumed to be in PATH.
  2060. kubemci := "kubemci"
  2061. b := new(kubectlBuilder)
  2062. args = append(args, "--gcp-project="+TestContext.CloudConfig.ProjectID)
  2063. b.cmd = exec.Command(kubemci, args...)
  2064. return b.Exec()
  2065. }
  2066. func StartCmdAndStreamOutput(cmd *exec.Cmd) (stdout, stderr io.ReadCloser, err error) {
  2067. stdout, err = cmd.StdoutPipe()
  2068. if err != nil {
  2069. return
  2070. }
  2071. stderr, err = cmd.StderrPipe()
  2072. if err != nil {
  2073. return
  2074. }
  2075. Logf("Asynchronously running '%s %s'", cmd.Path, strings.Join(cmd.Args, " "))
  2076. err = cmd.Start()
  2077. return
  2078. }
  2079. // Rough equivalent of ctrl+c for cleaning up processes. Intended to be run in defer.
  2080. func TryKill(cmd *exec.Cmd) {
  2081. if err := cmd.Process.Kill(); err != nil {
  2082. Logf("ERROR failed to kill command %v! The process may leak", cmd)
  2083. }
  2084. }
  2085. // testContainerOutputMatcher runs the given pod in the given namespace and waits
  2086. // for all of the containers in the podSpec to move into the 'Success' status, and tests
  2087. // the specified container log against the given expected output using the given matcher.
  2088. func (f *Framework) testContainerOutputMatcher(scenarioName string,
  2089. pod *v1.Pod,
  2090. containerIndex int,
  2091. expectedOutput []string,
  2092. matcher func(string, ...interface{}) gomegatypes.GomegaMatcher) {
  2093. By(fmt.Sprintf("Creating a pod to test %v", scenarioName))
  2094. if containerIndex < 0 || containerIndex >= len(pod.Spec.Containers) {
  2095. Failf("Invalid container index: %d", containerIndex)
  2096. }
  2097. ExpectNoError(f.MatchContainerOutput(pod, pod.Spec.Containers[containerIndex].Name, expectedOutput, matcher))
  2098. }
2099. // MatchContainerOutput creates a pod and waits for all of its containers to exit with success.
2100. // It then tests that each string in expectedOutput matches (via the given matcher) the output of the specified container.
  2101. func (f *Framework) MatchContainerOutput(
  2102. pod *v1.Pod,
  2103. containerName string,
  2104. expectedOutput []string,
  2105. matcher func(string, ...interface{}) gomegatypes.GomegaMatcher) error {
  2106. ns := pod.ObjectMeta.Namespace
  2107. if ns == "" {
  2108. ns = f.Namespace.Name
  2109. }
  2110. podClient := f.PodClientNS(ns)
  2111. createdPod := podClient.Create(pod)
  2112. defer func() {
  2113. By("delete the pod")
  2114. podClient.DeleteSync(createdPod.Name, &metav1.DeleteOptions{}, DefaultPodDeletionTimeout)
  2115. }()
  2116. // Wait for client pod to complete.
  2117. podErr := WaitForPodSuccessInNamespace(f.ClientSet, createdPod.Name, ns)
  2118. // Grab its logs. Get host first.
  2119. podStatus, err := podClient.Get(createdPod.Name, metav1.GetOptions{})
  2120. if err != nil {
  2121. return fmt.Errorf("failed to get pod status: %v", err)
  2122. }
  2123. if podErr != nil {
  2124. // Pod failed. Dump all logs from all containers to see what's wrong
  2125. for _, container := range podStatus.Spec.Containers {
  2126. logs, err := GetPodLogs(f.ClientSet, ns, podStatus.Name, container.Name)
  2127. if err != nil {
  2128. Logf("Failed to get logs from node %q pod %q container %q: %v",
  2129. podStatus.Spec.NodeName, podStatus.Name, container.Name, err)
  2130. continue
  2131. }
  2132. Logf("Output of node %q pod %q container %q: %s", podStatus.Spec.NodeName, podStatus.Name, container.Name, logs)
  2133. }
  2134. return fmt.Errorf("expected pod %q success: %v", createdPod.Name, podErr)
  2135. }
  2136. Logf("Trying to get logs from node %s pod %s container %s: %v",
  2137. podStatus.Spec.NodeName, podStatus.Name, containerName, err)
  2138. // Sometimes the actual containers take a second to get started, try to get logs for 60s
  2139. logs, err := GetPodLogs(f.ClientSet, ns, podStatus.Name, containerName)
  2140. if err != nil {
  2141. Logf("Failed to get logs from node %q pod %q container %q. %v",
  2142. podStatus.Spec.NodeName, podStatus.Name, containerName, err)
  2143. return fmt.Errorf("failed to get logs from %s for %s: %v", podStatus.Name, containerName, err)
  2144. }
  2145. for _, expected := range expectedOutput {
  2146. m := matcher(expected)
  2147. matches, err := m.Match(logs)
  2148. if err != nil {
  2149. return fmt.Errorf("expected %q in container output: %v", expected, err)
  2150. } else if !matches {
  2151. return fmt.Errorf("expected %q in container output: %s", expected, m.FailureMessage(logs))
  2152. }
  2153. }
  2154. return nil
  2155. }
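// Hypothetical usage sketch (the pod spec, container name and expected strings are
// illustrative): create a pod, wait for it to succeed, and assert that its "main"
// container logged the expected substrings using gomega's ContainSubstring matcher.
//
//	ExpectNoError(f.MatchContainerOutput(pod, "main", []string{"hello", "world"}, ContainSubstring))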
  2156. type EventsLister func(opts metav1.ListOptions, ns string) (*v1.EventList, error)
  2157. func DumpEventsInNamespace(eventsLister EventsLister, namespace string) {
  2158. By(fmt.Sprintf("Collecting events from namespace %q.", namespace))
  2159. events, err := eventsLister(metav1.ListOptions{}, namespace)
  2160. Expect(err).NotTo(HaveOccurred())
  2161. By(fmt.Sprintf("Found %d events.", len(events.Items)))
  2162. // Sort events by their first timestamp
  2163. sortedEvents := events.Items
  2164. if len(sortedEvents) > 1 {
  2165. sort.Sort(byFirstTimestamp(sortedEvents))
  2166. }
  2167. for _, e := range sortedEvents {
  2168. Logf("At %v - event for %v: %v %v: %v", e.FirstTimestamp, e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
  2169. }
  2170. // Note that we don't wait for any Cleanup to propagate, which means
  2171. // that if you delete a bunch of pods right before ending your test,
  2172. // you may or may not see the killing/deletion/Cleanup events.
  2173. }
  2174. func DumpAllNamespaceInfo(c clientset.Interface, namespace string) {
  2175. DumpEventsInNamespace(func(opts metav1.ListOptions, ns string) (*v1.EventList, error) {
  2176. return c.CoreV1().Events(ns).List(opts)
  2177. }, namespace)
  2178. // If cluster is large, then the following logs are basically useless, because:
  2179. // 1. it takes tens of minutes or hours to grab all of them
2180. // 2. there are so many of them that working with them is mostly impossible
  2181. // So we dump them only if the cluster is relatively small.
  2182. maxNodesForDump := 20
  2183. if nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{}); err == nil {
  2184. if len(nodes.Items) <= maxNodesForDump {
  2185. dumpAllPodInfo(c)
  2186. dumpAllNodeInfo(c)
  2187. } else {
  2188. Logf("skipping dumping cluster info - cluster too large")
  2189. }
  2190. } else {
  2191. Logf("unable to fetch node list: %v", err)
  2192. }
  2193. }
  2194. // byFirstTimestamp sorts a slice of events by first timestamp, using their involvedObject's name as a tie breaker.
  2195. type byFirstTimestamp []v1.Event
  2196. func (o byFirstTimestamp) Len() int { return len(o) }
  2197. func (o byFirstTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] }
  2198. func (o byFirstTimestamp) Less(i, j int) bool {
  2199. if o[i].FirstTimestamp.Equal(&o[j].FirstTimestamp) {
  2200. return o[i].InvolvedObject.Name < o[j].InvolvedObject.Name
  2201. }
  2202. return o[i].FirstTimestamp.Before(&o[j].FirstTimestamp)
  2203. }
  2204. func dumpAllPodInfo(c clientset.Interface) {
  2205. pods, err := c.CoreV1().Pods("").List(metav1.ListOptions{})
  2206. if err != nil {
  2207. Logf("unable to fetch pod debug info: %v", err)
  2208. }
  2209. logPodStates(pods.Items)
  2210. }
  2211. func dumpAllNodeInfo(c clientset.Interface) {
  2212. // It should be OK to list unschedulable Nodes here.
  2213. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
  2214. if err != nil {
  2215. Logf("unable to fetch node list: %v", err)
  2216. return
  2217. }
  2218. names := make([]string, len(nodes.Items))
  2219. for ix := range nodes.Items {
  2220. names[ix] = nodes.Items[ix].Name
  2221. }
  2222. DumpNodeDebugInfo(c, names, Logf)
  2223. }
  2224. func DumpNodeDebugInfo(c clientset.Interface, nodeNames []string, logFunc func(fmt string, args ...interface{})) {
  2225. for _, n := range nodeNames {
  2226. logFunc("\nLogging node info for node %v", n)
  2227. node, err := c.CoreV1().Nodes().Get(n, metav1.GetOptions{})
  2228. if err != nil {
  2229. logFunc("Error getting node info %v", err)
  2230. }
  2231. logFunc("Node Info: %v", node)
  2232. logFunc("\nLogging kubelet events for node %v", n)
  2233. for _, e := range getNodeEvents(c, n) {
  2234. logFunc("source %v type %v message %v reason %v first ts %v last ts %v, involved obj %+v",
  2235. e.Source, e.Type, e.Message, e.Reason, e.FirstTimestamp, e.LastTimestamp, e.InvolvedObject)
  2236. }
  2237. logFunc("\nLogging pods the kubelet thinks is on node %v", n)
  2238. podList, err := GetKubeletPods(c, n)
  2239. if err != nil {
  2240. logFunc("Unable to retrieve kubelet pods for node %v: %v", n, err)
  2241. continue
  2242. }
  2243. for _, p := range podList.Items {
  2244. logFunc("%v started at %v (%d+%d container statuses recorded)", p.Name, p.Status.StartTime, len(p.Status.InitContainerStatuses), len(p.Status.ContainerStatuses))
  2245. for _, c := range p.Status.InitContainerStatuses {
  2246. logFunc("\tInit container %v ready: %v, restart count %v",
  2247. c.Name, c.Ready, c.RestartCount)
  2248. }
  2249. for _, c := range p.Status.ContainerStatuses {
  2250. logFunc("\tContainer %v ready: %v, restart count %v",
  2251. c.Name, c.Ready, c.RestartCount)
  2252. }
  2253. }
  2254. HighLatencyKubeletOperations(c, 10*time.Second, n, logFunc)
  2255. // TODO: Log node resource info
  2256. }
  2257. }
2258. // getNodeEvents returns the kubelet events from the given node. This includes
2259. // kubelet restart and node unhealthy events. Note that listing events like this
2260. // will mess with latency metrics, so beware of calling it during a test.
  2261. func getNodeEvents(c clientset.Interface, nodeName string) []v1.Event {
  2262. selector := fields.Set{
  2263. "involvedObject.kind": "Node",
  2264. "involvedObject.name": nodeName,
  2265. "involvedObject.namespace": metav1.NamespaceAll,
  2266. "source": "kubelet",
  2267. }.AsSelector().String()
  2268. options := metav1.ListOptions{FieldSelector: selector}
  2269. events, err := c.CoreV1().Events(metav1.NamespaceSystem).List(options)
  2270. if err != nil {
  2271. Logf("Unexpected error retrieving node events %v", err)
  2272. return []v1.Event{}
  2273. }
  2274. return events.Items
  2275. }
  2276. // waitListSchedulableNodes is a wrapper around listing nodes supporting retries.
  2277. func waitListSchedulableNodes(c clientset.Interface) (*v1.NodeList, error) {
  2278. var nodes *v1.NodeList
  2279. var err error
  2280. if wait.PollImmediate(Poll, SingleCallTimeout, func() (bool, error) {
  2281. nodes, err = c.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  2282. "spec.unschedulable": "false",
  2283. }.AsSelector().String()})
  2284. if err != nil {
  2285. if testutils.IsRetryableAPIError(err) {
  2286. return false, nil
  2287. }
  2288. return false, err
  2289. }
  2290. return true, nil
  2291. }) != nil {
  2292. return nodes, err
  2293. }
  2294. return nodes, nil
  2295. }
  2296. // waitListSchedulableNodesOrDie is a wrapper around listing nodes supporting retries.
  2297. func waitListSchedulableNodesOrDie(c clientset.Interface) *v1.NodeList {
  2298. nodes, err := waitListSchedulableNodes(c)
  2299. if err != nil {
  2300. ExpectNoError(err, "Non-retryable failure or timed out while listing nodes for e2e cluster.")
  2301. }
  2302. return nodes
  2303. }
  2304. // Node is schedulable if:
  2305. // 1) doesn't have "unschedulable" field set
2306. // 2) its Ready condition is set to true
  2307. // 3) doesn't have NetworkUnavailable condition set to true
  2308. func isNodeSchedulable(node *v1.Node) bool {
  2309. nodeReady := IsNodeConditionSetAsExpected(node, v1.NodeReady, true)
  2310. networkReady := IsNodeConditionUnset(node, v1.NodeNetworkUnavailable) ||
  2311. IsNodeConditionSetAsExpectedSilent(node, v1.NodeNetworkUnavailable, false)
  2312. return !node.Spec.Unschedulable && nodeReady && networkReady
  2313. }
  2314. // Test whether a fake pod can be scheduled on "node", given its current taints.
  2315. func isNodeUntainted(node *v1.Node) bool {
  2316. fakePod := &v1.Pod{
  2317. TypeMeta: metav1.TypeMeta{
  2318. Kind: "Pod",
  2319. APIVersion: "v1",
  2320. },
  2321. ObjectMeta: metav1.ObjectMeta{
  2322. Name: "fake-not-scheduled",
  2323. Namespace: "fake-not-scheduled",
  2324. },
  2325. Spec: v1.PodSpec{
  2326. Containers: []v1.Container{
  2327. {
  2328. Name: "fake-not-scheduled",
  2329. Image: "fake-not-scheduled",
  2330. },
  2331. },
  2332. },
  2333. }
  2334. nodeInfo := schedulercache.NewNodeInfo()
  2335. nodeInfo.SetNode(node)
  2336. fit, _, err := predicates.PodToleratesNodeTaints(fakePod, nil, nodeInfo)
  2337. if err != nil {
  2338. Failf("Can't test predicates for node %s: %v", node.Name, err)
  2339. return false
  2340. }
  2341. return fit
  2342. }
  2343. // GetReadySchedulableNodesOrDie addresses the common use case of getting nodes you can do work on.
  2344. // 1) Needs to be schedulable.
  2345. // 2) Needs to be ready.
  2346. // If EITHER 1 or 2 is not true, most tests will want to ignore the node entirely.
  2347. func GetReadySchedulableNodesOrDie(c clientset.Interface) (nodes *v1.NodeList) {
  2348. nodes = waitListSchedulableNodesOrDie(c)
2349. // Previous tests may have caused failures of some nodes. Let's skip
  2350. // 'Not Ready' nodes, just in case (there is no need to fail the test).
  2351. FilterNodes(nodes, func(node v1.Node) bool {
  2352. return isNodeSchedulable(&node) && isNodeUntainted(&node)
  2353. })
  2354. return nodes
  2355. }
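// Hypothetical usage sketch: size or skip a test based on how many nodes can
// actually accept pods.
//
//	nodes := GetReadySchedulableNodesOrDie(c)
//	if len(nodes.Items) < 2 {
//		Skipf("test requires at least 2 schedulable nodes")
//	}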
  2356. // GetReadyNodesIncludingTaintedOrDie returns all ready nodes, even those which are tainted.
2357. // There are cases when we care about tainted nodes, e.g. in tests related to
2358. // nodes with GPUs we care about them despite the presence of the
2359. // nvidia.com/gpu=present:NoSchedule taint.
  2360. func GetReadyNodesIncludingTaintedOrDie(c clientset.Interface) (nodes *v1.NodeList) {
  2361. nodes = waitListSchedulableNodesOrDie(c)
  2362. FilterNodes(nodes, func(node v1.Node) bool {
  2363. return isNodeSchedulable(&node)
  2364. })
  2365. return nodes
  2366. }
  2367. func WaitForAllNodesSchedulable(c clientset.Interface, timeout time.Duration) error {
  2368. Logf("Waiting up to %v for all (but %d) nodes to be schedulable", timeout, TestContext.AllowedNotReadyNodes)
  2369. var notSchedulable []*v1.Node
  2370. attempt := 0
  2371. return wait.PollImmediate(30*time.Second, timeout, func() (bool, error) {
  2372. attempt++
  2373. notSchedulable = nil
  2374. opts := metav1.ListOptions{
  2375. ResourceVersion: "0",
  2376. FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector().String(),
  2377. }
  2378. nodes, err := c.CoreV1().Nodes().List(opts)
  2379. if err != nil {
  2380. Logf("Unexpected error listing nodes: %v", err)
  2381. if testutils.IsRetryableAPIError(err) {
  2382. return false, nil
  2383. }
  2384. return false, err
  2385. }
  2386. for i := range nodes.Items {
  2387. node := &nodes.Items[i]
  2388. if !isNodeSchedulable(node) {
  2389. notSchedulable = append(notSchedulable, node)
  2390. }
  2391. }
  2392. // Framework allows for <TestContext.AllowedNotReadyNodes> nodes to be non-ready,
  2393. // to make it possible e.g. for incorrect deployment of some small percentage
  2394. // of nodes (which we allow in cluster validation). Some nodes that are not
  2395. // provisioned correctly at startup will never become ready (e.g. when something
  2396. // won't install correctly), so we can't expect them to be ready at any point.
  2397. //
  2398. // However, we only allow non-ready nodes with some specific reasons.
  2399. if len(notSchedulable) > 0 {
  2400. // In large clusters, log them only every 10th pass.
  2401. if len(nodes.Items) >= largeClusterThreshold && attempt%10 == 0 {
  2402. Logf("Unschedulable nodes:")
  2403. for i := range notSchedulable {
  2404. Logf("-> %s Ready=%t Network=%t",
  2405. notSchedulable[i].Name,
  2406. IsNodeConditionSetAsExpectedSilent(notSchedulable[i], v1.NodeReady, true),
  2407. IsNodeConditionSetAsExpectedSilent(notSchedulable[i], v1.NodeNetworkUnavailable, false))
  2408. }
  2409. Logf("================================")
  2410. }
  2411. }
  2412. return len(notSchedulable) <= TestContext.AllowedNotReadyNodes, nil
  2413. })
  2414. }
  2415. func GetPodSecretUpdateTimeout(c clientset.Interface) time.Duration {
  2416. // With SecretManager(ConfigMapManager), we may have to wait up to full sync period +
  2417. // TTL of secret(configmap) to elapse before the Kubelet projects the update into the
  2418. // volume and the container picks it up.
  2419. // So this timeout is based on default Kubelet sync period (1 minute) + maximum TTL for
  2420. // secret(configmap) that's based on cluster size + additional time as a fudge factor.
  2421. secretTTL, err := GetNodeTTLAnnotationValue(c)
  2422. if err != nil {
  2423. Logf("Couldn't get node TTL annotation (using default value of 0): %v", err)
  2424. }
  2425. podLogTimeout := 240*time.Second + secretTTL
  2426. return podLogTimeout
  2427. }
  2428. func GetNodeTTLAnnotationValue(c clientset.Interface) (time.Duration, error) {
  2429. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
  2430. if err != nil || len(nodes.Items) == 0 {
  2431. return time.Duration(0), fmt.Errorf("Couldn't list any nodes to get TTL annotation: %v", err)
  2432. }
2433. // Since the TTL the kubelet uses is stored in the node object, for timeout
2434. // purposes we take it from the first node (all of them should be the same).
  2435. node := &nodes.Items[0]
  2436. if node.Annotations == nil {
  2437. return time.Duration(0), fmt.Errorf("No annotations found on the node")
  2438. }
  2439. value, ok := node.Annotations[v1.ObjectTTLAnnotationKey]
  2440. if !ok {
  2441. return time.Duration(0), fmt.Errorf("No TTL annotation found on the node")
  2442. }
  2443. intValue, err := strconv.Atoi(value)
  2444. if err != nil {
  2445. return time.Duration(0), fmt.Errorf("Cannot convert TTL annotation from %#v to int", *node)
  2446. }
  2447. return time.Duration(intValue) * time.Second, nil
  2448. }
  2449. func AddOrUpdateLabelOnNode(c clientset.Interface, nodeName string, labelKey, labelValue string) {
  2450. ExpectNoError(testutils.AddLabelsToNode(c, nodeName, map[string]string{labelKey: labelValue}))
  2451. }
  2452. func AddOrUpdateLabelOnNodeAndReturnOldValue(c clientset.Interface, nodeName string, labelKey, labelValue string) string {
  2453. var oldValue string
  2454. node, err := c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
  2455. ExpectNoError(err)
  2456. oldValue = node.Labels[labelKey]
  2457. ExpectNoError(testutils.AddLabelsToNode(c, nodeName, map[string]string{labelKey: labelValue}))
  2458. return oldValue
  2459. }
  2460. func ExpectNodeHasLabel(c clientset.Interface, nodeName string, labelKey string, labelValue string) {
  2461. By("verifying the node has the label " + labelKey + " " + labelValue)
  2462. node, err := c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
  2463. ExpectNoError(err)
  2464. Expect(node.Labels[labelKey]).To(Equal(labelValue))
  2465. }
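// RemoveTaintOffNode removes the given taint from the node and verifies that it is gone.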
  2466. func RemoveTaintOffNode(c clientset.Interface, nodeName string, taint v1.Taint) {
  2467. ExpectNoError(controller.RemoveTaintOffNode(c, nodeName, nil, &taint))
  2468. VerifyThatTaintIsGone(c, nodeName, &taint)
  2469. }
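// AddOrUpdateTaintOnNode applies the given taint to the node, updating an equivalent taint if one already exists.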
  2470. func AddOrUpdateTaintOnNode(c clientset.Interface, nodeName string, taint v1.Taint) {
  2471. ExpectNoError(controller.AddOrUpdateTaintOnNode(c, nodeName, &taint))
  2472. }
  2473. // RemoveLabelOffNode is for cleaning up labels temporarily added to node,
  2474. // won't fail if target label doesn't exist or has been removed.
  2475. func RemoveLabelOffNode(c clientset.Interface, nodeName string, labelKey string) {
  2476. By("removing the label " + labelKey + " off the node " + nodeName)
  2477. ExpectNoError(testutils.RemoveLabelOffNode(c, nodeName, []string{labelKey}))
  2478. By("verifying the node doesn't have the label " + labelKey)
  2479. ExpectNoError(testutils.VerifyLabelsRemoved(c, nodeName, []string{labelKey}))
  2480. }
  2481. func VerifyThatTaintIsGone(c clientset.Interface, nodeName string, taint *v1.Taint) {
  2482. By("verifying the node doesn't have the taint " + taint.ToString())
  2483. nodeUpdated, err := c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
  2484. ExpectNoError(err)
  2485. if taintutils.TaintExists(nodeUpdated.Spec.Taints, taint) {
  2486. Failf("Failed removing taint " + taint.ToString() + " of the node " + nodeName)
  2487. }
  2488. }
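// ExpectNodeHasTaint fails the test unless the node carries the given taint.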
  2489. func ExpectNodeHasTaint(c clientset.Interface, nodeName string, taint *v1.Taint) {
  2490. By("verifying the node has the taint " + taint.ToString())
  2491. if has, err := NodeHasTaint(c, nodeName, taint); !has {
  2492. ExpectNoError(err)
  2493. Failf("Failed to find taint %s on node %s", taint.ToString(), nodeName)
  2494. }
  2495. }
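// NodeHasTaint returns whether the node currently carries the given taint.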
  2496. func NodeHasTaint(c clientset.Interface, nodeName string, taint *v1.Taint) (bool, error) {
  2497. node, err := c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
  2498. if err != nil {
  2499. return false, err
  2500. }
  2501. nodeTaints := node.Spec.Taints
  2502. if len(nodeTaints) == 0 || !taintutils.TaintExists(nodeTaints, taint) {
  2503. return false, nil
  2504. }
  2505. return true, nil
  2506. }
  2507. // AddOrUpdateAvoidPodOnNode adds the AvoidPods annotation to the node, overriding any existing value.
  2508. func AddOrUpdateAvoidPodOnNode(c clientset.Interface, nodeName string, avoidPods v1.AvoidPods) {
  2509. err := wait.PollImmediate(Poll, SingleCallTimeout, func() (bool, error) {
  2510. node, err := c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
  2511. if err != nil {
  2512. if testutils.IsRetryableAPIError(err) {
  2513. return false, nil
  2514. }
  2515. return false, err
  2516. }
  2517. avoidPodsData, err := json.Marshal(avoidPods)
  2518. ExpectNoError(err)
  2519. if node.Annotations == nil {
  2520. node.Annotations = make(map[string]string)
  2521. }
  2522. node.Annotations[v1.PreferAvoidPodsAnnotationKey] = string(avoidPodsData)
  2523. _, err = c.CoreV1().Nodes().Update(node)
  2524. if err != nil {
  2525. if !apierrs.IsConflict(err) {
  2526. ExpectNoError(err)
  2527. } else {
  2528. Logf("Conflict when trying to add/update avoidPonds %v to %v", avoidPods, nodeName)
  2529. }
  2530. }
  2531. return true, nil
  2532. })
  2533. ExpectNoError(err)
  2534. }
  2535. // RemoveAvoidPodsOffNode removes the AvoidPods annotation from the node. It does not fail if no such annotation exists.
  2536. func RemoveAvoidPodsOffNode(c clientset.Interface, nodeName string) {
  2537. err := wait.PollImmediate(Poll, SingleCallTimeout, func() (bool, error) {
  2538. node, err := c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
  2539. if err != nil {
  2540. if testutils.IsRetryableAPIError(err) {
  2541. return false, nil
  2542. }
  2543. return false, err
  2544. }
  2545. if node.Annotations == nil {
  2546. return true, nil
  2547. }
  2548. delete(node.Annotations, v1.PreferAvoidPodsAnnotationKey)
  2549. _, err = c.CoreV1().Nodes().Update(node)
  2550. if err != nil {
  2551. if !apierrs.IsConflict(err) {
  2552. ExpectNoError(err)
  2553. } else {
  2554. Logf("Conflict when trying to remove avoidPods to %v", nodeName)
  2555. }
  2556. }
  2557. return true, nil
  2558. })
  2559. ExpectNoError(err)
  2560. }
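// ScaleResource scales the named resource of the given kind to size replicas via the scale
// subresource and, if wait is true, blocks until all controlled pods are running.
// A minimal usage sketch (c, scales and ns are assumed to exist in the calling test):
//   err := ScaleResource(c, scales, ns, "my-rc", 3, true,
//           api.Kind("ReplicationController"), schema.GroupResource{Resource: "replicationcontrollers"})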
  2561. func ScaleResource(
  2562. clientset clientset.Interface,
  2563. scalesGetter scaleclient.ScalesGetter,
  2564. ns, name string,
  2565. size uint,
  2566. wait bool,
  2567. kind schema.GroupKind,
  2568. gr schema.GroupResource,
  2569. ) error {
  2570. By(fmt.Sprintf("Scaling %v %s in namespace %s to %d", kind, name, ns, size))
  2571. scaler := kubectl.NewScaler(scalesGetter)
  2572. if err := testutils.ScaleResourceWithRetries(scaler, ns, name, size, gr); err != nil {
  2573. return fmt.Errorf("error while scaling RC %s to %d replicas: %v", name, size, err)
  2574. }
  2575. if !wait {
  2576. return nil
  2577. }
  2578. return WaitForControlledPodsRunning(clientset, ns, name, kind)
  2579. }
  2580. // Wait up to 10 minutes for pods to become Running.
  2581. func WaitForControlledPodsRunning(c clientset.Interface, ns, name string, kind schema.GroupKind) error {
  2582. rtObject, err := getRuntimeObjectForKind(c, kind, ns, name)
  2583. if err != nil {
  2584. return err
  2585. }
  2586. selector, err := getSelectorFromRuntimeObject(rtObject)
  2587. if err != nil {
  2588. return err
  2589. }
  2590. replicas, err := getReplicasFromRuntimeObject(rtObject)
  2591. if err != nil {
  2592. return err
  2593. }
  2594. err = testutils.WaitForEnoughPodsWithLabelRunning(c, ns, selector, int(replicas))
  2595. if err != nil {
  2596. return fmt.Errorf("Error while waiting for replication controller %s pods to be running: %v", name, err)
  2597. }
  2598. return nil
  2599. }
  2600. // Wait up to PodListTimeout for getting pods of the specified controller name and return them.
  2601. func WaitForControlledPods(c clientset.Interface, ns, name string, kind schema.GroupKind) (pods *v1.PodList, err error) {
  2602. rtObject, err := getRuntimeObjectForKind(c, kind, ns, name)
  2603. if err != nil {
  2604. return nil, err
  2605. }
  2606. selector, err := getSelectorFromRuntimeObject(rtObject)
  2607. if err != nil {
  2608. return nil, err
  2609. }
  2610. return WaitForPodsWithLabel(c, ns, selector)
  2611. }
  2612. // Returns true if all the specified pods are scheduled, else returns false.
  2613. func podsWithLabelScheduled(c clientset.Interface, ns string, label labels.Selector) (bool, error) {
  2614. ps, err := testutils.NewPodStore(c, ns, label, fields.Everything())
  2615. if err != nil {
  2616. return false, err
  2617. }
  2618. defer ps.Stop()
  2619. pods := ps.List()
  2620. if len(pods) == 0 {
  2621. return false, nil
  2622. }
  2623. for _, pod := range pods {
  2624. if pod.Spec.NodeName == "" {
  2625. return false, nil
  2626. }
  2627. }
  2628. return true, nil
  2629. }
  2630. // Wait for all matching pods to become scheduled and at least one
  2631. // matching pod exists. Return the list of matching pods.
  2632. func WaitForPodsWithLabelScheduled(c clientset.Interface, ns string, label labels.Selector) (pods *v1.PodList, err error) {
  2633. err = wait.PollImmediate(Poll, podScheduledBeforeTimeout,
  2634. func() (bool, error) {
  2635. pods, err = WaitForPodsWithLabel(c, ns, label)
  2636. if err != nil {
  2637. return false, err
  2638. }
  2639. for _, pod := range pods.Items {
  2640. if pod.Spec.NodeName == "" {
  2641. return false, nil
  2642. }
  2643. }
  2644. return true, nil
  2645. })
  2646. return pods, err
  2647. }
  2648. // Wait up to PodListTimeout for getting pods with a certain label.
  2649. func WaitForPodsWithLabel(c clientset.Interface, ns string, label labels.Selector) (pods *v1.PodList, err error) {
  2650. for t := time.Now(); time.Since(t) < PodListTimeout; time.Sleep(Poll) {
  2651. options := metav1.ListOptions{LabelSelector: label.String()}
  2652. pods, err = c.CoreV1().Pods(ns).List(options)
  2653. if err != nil {
  2654. if testutils.IsRetryableAPIError(err) {
  2655. continue
  2656. }
  2657. return
  2658. }
  2659. if len(pods.Items) > 0 {
  2660. break
  2661. }
  2662. }
  2663. if pods == nil || len(pods.Items) == 0 {
  2664. err = fmt.Errorf("Timeout while waiting for pods with label %v", label)
  2665. }
  2666. return
  2667. }
  2668. // Wait for the exact number of matching pods to become running and ready.
  2669. // Return the list of matching pods.
  2670. func WaitForPodsWithLabelRunningReady(c clientset.Interface, ns string, label labels.Selector, num int, timeout time.Duration) (pods *v1.PodList, err error) {
  2671. var current int
  2672. err = wait.Poll(Poll, timeout,
  2673. func() (bool, error) {
  2674. pods, err = WaitForPodsWithLabel(c, ns, label) // assign to the named returns so the pods are actually returned
  2675. if err != nil {
  2676. Logf("Failed to list pods: %v", err)
  2677. if testutils.IsRetryableAPIError(err) {
  2678. return false, nil
  2679. }
  2680. return false, err
  2681. }
  2682. current = 0
  2683. for _, pod := range pods.Items {
  2684. if flag, err := testutils.PodRunningReady(&pod); err == nil && flag {
  2685. current++
  2686. }
  2687. }
  2688. if current != num {
  2689. Logf("Got %v pods running and ready, expect: %v", current, num)
  2690. return false, nil
  2691. }
  2692. return true, nil
  2693. })
  2694. return pods, err
  2695. }
  2696. func getRuntimeObjectForKind(c clientset.Interface, kind schema.GroupKind, ns, name string) (runtime.Object, error) {
  2697. switch kind {
  2698. case api.Kind("ReplicationController"):
  2699. return c.CoreV1().ReplicationControllers(ns).Get(name, metav1.GetOptions{})
  2700. case extensionsinternal.Kind("ReplicaSet"), appsinternal.Kind("ReplicaSet"):
  2701. return c.ExtensionsV1beta1().ReplicaSets(ns).Get(name, metav1.GetOptions{})
  2702. case extensionsinternal.Kind("Deployment"), appsinternal.Kind("Deployment"):
  2703. return c.ExtensionsV1beta1().Deployments(ns).Get(name, metav1.GetOptions{})
  2704. case extensionsinternal.Kind("DaemonSet"):
  2705. return c.ExtensionsV1beta1().DaemonSets(ns).Get(name, metav1.GetOptions{})
  2706. case batchinternal.Kind("Job"):
  2707. return c.BatchV1().Jobs(ns).Get(name, metav1.GetOptions{})
  2708. default:
  2709. return nil, fmt.Errorf("Unsupported kind when getting runtime object: %v", kind)
  2710. }
  2711. }
  2712. func getSelectorFromRuntimeObject(obj runtime.Object) (labels.Selector, error) {
  2713. switch typed := obj.(type) {
  2714. case *v1.ReplicationController:
  2715. return labels.SelectorFromSet(typed.Spec.Selector), nil
  2716. case *extensions.ReplicaSet:
  2717. return metav1.LabelSelectorAsSelector(typed.Spec.Selector)
  2718. case *extensions.Deployment:
  2719. return metav1.LabelSelectorAsSelector(typed.Spec.Selector)
  2720. case *extensions.DaemonSet:
  2721. return metav1.LabelSelectorAsSelector(typed.Spec.Selector)
  2722. case *batch.Job:
  2723. return metav1.LabelSelectorAsSelector(typed.Spec.Selector)
  2724. default:
  2725. return nil, fmt.Errorf("Unsupported kind when getting selector: %v", obj)
  2726. }
  2727. }
  2728. func getReplicasFromRuntimeObject(obj runtime.Object) (int32, error) {
  2729. switch typed := obj.(type) {
  2730. case *v1.ReplicationController:
  2731. if typed.Spec.Replicas != nil {
  2732. return *typed.Spec.Replicas, nil
  2733. }
  2734. return 0, nil
  2735. case *extensions.ReplicaSet:
  2736. if typed.Spec.Replicas != nil {
  2737. return *typed.Spec.Replicas, nil
  2738. }
  2739. return 0, nil
  2740. case *extensions.Deployment:
  2741. if typed.Spec.Replicas != nil {
  2742. return *typed.Spec.Replicas, nil
  2743. }
  2744. return 0, nil
  2745. case *extensions.DaemonSet:
  2746. return 0, nil
  2747. case *batch.Job:
  2748. // TODO: currently we use pause pods so that's OK. When we'll want to switch to Pods
  2749. // that actually finish we need a better way to do this.
  2750. if typed.Spec.Parallelism != nil {
  2751. return *typed.Spec.Parallelism, nil
  2752. }
  2753. return 0, nil
  2754. default:
  2755. return -1, fmt.Errorf("Unsupported kind when getting number of replicas: %v", obj)
  2756. }
  2757. }
  2758. // DeleteResourceAndWaitForGC deletes only the given resource and waits for GC to delete the pods.
  2759. func DeleteResourceAndWaitForGC(c clientset.Interface, kind schema.GroupKind, ns, name string) error {
  2760. By(fmt.Sprintf("deleting %v %s in namespace %s, will wait for the garbage collector to delete the pods", kind, name, ns))
  2761. rtObject, err := getRuntimeObjectForKind(c, kind, ns, name)
  2762. if err != nil {
  2763. if apierrs.IsNotFound(err) {
  2764. Logf("%v %s not found: %v", kind, name, err)
  2765. return nil
  2766. }
  2767. return err
  2768. }
  2769. selector, err := getSelectorFromRuntimeObject(rtObject)
  2770. if err != nil {
  2771. return err
  2772. }
  2773. replicas, err := getReplicasFromRuntimeObject(rtObject)
  2774. if err != nil {
  2775. return err
  2776. }
  2777. ps, err := testutils.NewPodStore(c, ns, selector, fields.Everything())
  2778. if err != nil {
  2779. return err
  2780. }
  2781. defer ps.Stop()
  2782. falseVar := false
  2783. deleteOption := &metav1.DeleteOptions{OrphanDependents: &falseVar}
  2784. startTime := time.Now()
  2785. if err := testutils.DeleteResourceWithRetries(c, kind, ns, name, deleteOption); err != nil {
  2786. return err
  2787. }
  2788. deleteTime := time.Since(startTime)
  2789. Logf("Deleting %v %s took: %v", kind, name, deleteTime)
  2790. var interval, timeout time.Duration
  2791. switch {
  2792. case replicas < 100:
  2793. interval = 100 * time.Millisecond
  2794. case replicas < 1000:
  2795. interval = 1 * time.Second
  2796. default:
  2797. interval = 10 * time.Second
  2798. }
  2799. if replicas < 5000 {
  2800. timeout = 10 * time.Minute
  2801. } else {
  2802. timeout = time.Duration(replicas/gcThroughput) * time.Second
  2803. // gcThroughput is pretty strict now, add a bit more to it
  2804. timeout = timeout + 3*time.Minute
  2805. }
  2806. err = waitForPodsInactive(ps, interval, timeout)
  2807. if err != nil {
  2808. return fmt.Errorf("error while waiting for pods to become inactive %s: %v", name, err)
  2809. }
  2810. terminatePodTime := time.Since(startTime) - deleteTime
  2811. Logf("Terminating %v %s pods took: %v", kind, name, terminatePodTime)
  2812. err = waitForPodsGone(ps, interval, 10*time.Minute)
  2813. if err != nil {
  2814. return fmt.Errorf("error while waiting for pods gone %s: %v", name, err)
  2815. }
  2816. return nil
  2817. }
  2818. // waitForPodsInactive waits until there are no active pods left in the PodStore.
  2819. // This is to make a fair comparison of deletion time between DeleteRCAndPods
  2820. // and DeleteRCAndWaitForGC, because the RC controller decreases status.replicas
  2821. // when the pod is inactive.
  2822. func waitForPodsInactive(ps *testutils.PodStore, interval, timeout time.Duration) error {
  2823. return wait.PollImmediate(interval, timeout, func() (bool, error) {
  2824. pods := ps.List()
  2825. for _, pod := range pods {
  2826. if controller.IsPodActive(pod) {
  2827. return false, nil
  2828. }
  2829. }
  2830. return true, nil
  2831. })
  2832. }
  2833. // waitForPodsGone waits until there are no pods left in the PodStore.
  2834. func waitForPodsGone(ps *testutils.PodStore, interval, timeout time.Duration) error {
  2835. return wait.PollImmediate(interval, timeout, func() (bool, error) {
  2836. if pods := ps.List(); len(pods) == 0 {
  2837. return true, nil
  2838. }
  2839. return false, nil
  2840. })
  2841. }
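// WaitForPodsReady waits up to five minutes for all pods labeled name=<name> in namespace ns
// to be available, i.e. running and ready for at least minReadySeconds.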
  2842. func WaitForPodsReady(c clientset.Interface, ns, name string, minReadySeconds int) error {
  2843. label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
  2844. options := metav1.ListOptions{LabelSelector: label.String()}
  2845. return wait.Poll(Poll, 5*time.Minute, func() (bool, error) {
  2846. pods, err := c.CoreV1().Pods(ns).List(options)
  2847. if err != nil {
  2848. return false, nil
  2849. }
  2850. for _, pod := range pods.Items {
  2851. if !podutil.IsPodAvailable(&pod, int32(minReadySeconds), metav1.Now()) {
  2852. return false, nil
  2853. }
  2854. }
  2855. return true, nil
  2856. })
  2857. }
  2858. // Waits for the number of events on the given object to reach a desired count.
  2859. func WaitForEvents(c clientset.Interface, ns string, objOrRef runtime.Object, desiredEventsCount int) error {
  2860. return wait.Poll(Poll, 5*time.Minute, func() (bool, error) {
  2861. events, err := c.CoreV1().Events(ns).Search(legacyscheme.Scheme, objOrRef)
  2862. if err != nil {
  2863. return false, fmt.Errorf("error in listing events: %s", err)
  2864. }
  2865. eventsCount := len(events.Items)
  2866. if eventsCount == desiredEventsCount {
  2867. return true, nil
  2868. }
  2869. if eventsCount < desiredEventsCount {
  2870. return false, nil
  2871. }
  2872. // Number of events has exceeded the desired count.
  2873. return false, fmt.Errorf("number of events has exceeded the desired count, eventsCount: %d, desiredCount: %d", eventsCount, desiredEventsCount)
  2874. })
  2875. }
  2876. // Waits for the number of events on the given object to be at least a desired count.
  2877. func WaitForPartialEvents(c clientset.Interface, ns string, objOrRef runtime.Object, atLeastEventsCount int) error {
  2878. return wait.Poll(Poll, 5*time.Minute, func() (bool, error) {
  2879. events, err := c.CoreV1().Events(ns).Search(legacyscheme.Scheme, objOrRef)
  2880. if err != nil {
  2881. return false, fmt.Errorf("error in listing events: %s", err)
  2882. }
  2883. eventsCount := len(events.Items)
  2884. if eventsCount >= atLeastEventsCount {
  2885. return true, nil
  2886. }
  2887. return false, nil
  2888. })
  2889. }
  2890. type updateDSFunc func(*apps.DaemonSet)
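// UpdateDaemonSetWithRetries fetches the named DaemonSet, applies applyUpdate to it and pushes
// the result back, retrying transient get errors and failed updates for up to a minute.
// A minimal usage sketch (c and ns are assumed; the image is illustrative):
//   ds, err := UpdateDaemonSetWithRetries(c, ns, "my-ds", func(ds *apps.DaemonSet) {
//           ds.Spec.Template.Spec.Containers[0].Image = "k8s.gcr.io/pause:3.1"
//   })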
  2891. func UpdateDaemonSetWithRetries(c clientset.Interface, namespace, name string, applyUpdate updateDSFunc) (ds *apps.DaemonSet, err error) {
  2892. daemonsets := c.AppsV1().DaemonSets(namespace)
  2893. var updateErr error
  2894. pollErr := wait.PollImmediate(10*time.Millisecond, 1*time.Minute, func() (bool, error) {
  2895. if ds, err = daemonsets.Get(name, metav1.GetOptions{}); err != nil {
  2896. if testutils.IsRetryableAPIError(err) {
  2897. return false, nil
  2898. }
  2899. return false, err
  2900. }
  2901. // Apply the update, then attempt to push it to the apiserver.
  2902. applyUpdate(ds)
  2903. if ds, err = daemonsets.Update(ds); err == nil {
  2904. Logf("Updating DaemonSet %s", name)
  2905. return true, nil
  2906. }
  2907. updateErr = err
  2908. return false, nil
  2909. })
  2910. if pollErr == wait.ErrWaitTimeout {
  2911. pollErr = fmt.Errorf("couldn't apply the provided update to DaemonSet %q: %v", name, updateErr)
  2912. }
  2913. return ds, pollErr
  2914. }
  2915. // NodeAddresses returns the first address of the given type of each node.
  2916. func NodeAddresses(nodelist *v1.NodeList, addrType v1.NodeAddressType) []string {
  2917. hosts := []string{}
  2918. for _, n := range nodelist.Items {
  2919. for _, addr := range n.Status.Addresses {
  2920. // Use the first address of the requested type we find on the node, and
  2921. // use at most one per node.
  2922. // TODO(roberthbailey): Use the "preferred" address for the node, once
  2923. // such a thing is defined (#2462).
  2924. if addr.Type == addrType {
  2925. hosts = append(hosts, addr.Address)
  2926. break
  2927. }
  2928. }
  2929. }
  2930. return hosts
  2931. }
  2932. // NodeSSHHosts returns SSH-able host names for all schedulable nodes - this excludes master node.
  2933. // It returns an error if it can't find an external IP for every node, though it still returns all
  2934. // hosts that it found in that case.
  2935. func NodeSSHHosts(c clientset.Interface) ([]string, error) {
  2936. nodelist := waitListSchedulableNodesOrDie(c)
  2937. // TODO(roberthbailey): Use the "preferred" address for the node, once such a thing is defined (#2462).
  2938. hosts := NodeAddresses(nodelist, v1.NodeExternalIP)
  2939. // Error if any node didn't have an external IP.
  2940. if len(hosts) != len(nodelist.Items) {
  2941. return hosts, fmt.Errorf(
  2942. "only found %d external IPs on nodes, but found %d nodes. Nodelist: %v",
  2943. len(hosts), len(nodelist.Items), nodelist)
  2944. }
  2945. sshHosts := make([]string, 0, len(hosts))
  2946. for _, h := range hosts {
  2947. sshHosts = append(sshHosts, net.JoinHostPort(h, sshPort))
  2948. }
  2949. return sshHosts, nil
  2950. }
  2951. type SSHResult struct {
  2952. User string
  2953. Host string
  2954. Cmd string
  2955. Stdout string
  2956. Stderr string
  2957. Code int
  2958. }
  2959. // NodeExec execs the given cmd on node via SSH. Note that the nodeName is an sshable name,
  2960. // eg: the name returned by framework.GetMasterHost(). This is also not guaranteed to work across
  2961. // cloud providers since it involves ssh.
  2962. func NodeExec(nodeName, cmd string) (SSHResult, error) {
  2963. return SSH(cmd, net.JoinHostPort(nodeName, sshPort), TestContext.Provider)
  2964. }
  2965. // SSH synchronously SSHs to a node running on provider and runs cmd. If there
  2966. // is no error performing the SSH, the stdout, stderr, and exit code are
  2967. // returned.
  2968. func SSH(cmd, host, provider string) (SSHResult, error) {
  2969. result := SSHResult{Host: host, Cmd: cmd}
  2970. // Get a signer for the provider.
  2971. signer, err := GetSigner(provider)
  2972. if err != nil {
  2973. return result, fmt.Errorf("error getting signer for provider %s: '%v'", provider, err)
  2974. }
  2975. // RunSSHCommand will default to Getenv("USER") if user == "", but we're
  2976. // defaulting here as well for logging clarity.
  2977. result.User = os.Getenv("KUBE_SSH_USER")
  2978. if result.User == "" {
  2979. result.User = os.Getenv("USER")
  2980. }
  2981. stdout, stderr, code, err := sshutil.RunSSHCommand(cmd, result.User, host, signer)
  2982. result.Stdout = stdout
  2983. result.Stderr = stderr
  2984. result.Code = code
  2985. return result, err
  2986. }
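// LogSSHResult logs the user, host, command, stdout, stderr and exit code of an SSHResult.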
  2987. func LogSSHResult(result SSHResult) {
  2988. remote := fmt.Sprintf("%s@%s", result.User, result.Host)
  2989. Logf("ssh %s: command: %s", remote, result.Cmd)
  2990. Logf("ssh %s: stdout: %q", remote, result.Stdout)
  2991. Logf("ssh %s: stderr: %q", remote, result.Stderr)
  2992. Logf("ssh %s: exit code: %d", remote, result.Code)
  2993. }
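// IssueSSHCommandWithResult runs cmd on the node over SSH, preferring the node's external IP and
// falling back to its internal IP. It returns an error on SSH failure or a non-zero exit code.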
  2994. func IssueSSHCommandWithResult(cmd, provider string, node *v1.Node) (*SSHResult, error) {
  2995. Logf("Getting external IP address for %s", node.Name)
  2996. host := ""
  2997. for _, a := range node.Status.Addresses {
  2998. if a.Type == v1.NodeExternalIP {
  2999. host = net.JoinHostPort(a.Address, sshPort)
  3000. break
  3001. }
  3002. }
  3003. if host == "" {
  3004. // No external IPs were found, let's try to use internal as plan B
  3005. for _, a := range node.Status.Addresses {
  3006. if a.Type == v1.NodeInternalIP {
  3007. host = net.JoinHostPort(a.Address, sshPort)
  3008. break
  3009. }
  3010. }
  3011. }
  3012. if host == "" {
  3013. return nil, fmt.Errorf("couldn't find any IP address for node %s", node.Name)
  3014. }
  3015. Logf("SSH %q on %s(%s)", cmd, node.Name, host)
  3016. result, err := SSH(cmd, host, provider)
  3017. LogSSHResult(result)
  3018. if result.Code != 0 || err != nil {
  3019. return nil, fmt.Errorf("failed running %q: %v (exit code %d)",
  3020. cmd, err, result.Code)
  3021. }
  3022. return &result, nil
  3023. }
  3024. func IssueSSHCommand(cmd, provider string, node *v1.Node) error {
  3025. _, err := IssueSSHCommandWithResult(cmd, provider, node)
  3026. if err != nil {
  3027. return err
  3028. }
  3029. return nil
  3030. }
  3031. // NewHostExecPodSpec returns the pod spec of hostexec pod
  3032. func NewHostExecPodSpec(ns, name string) *v1.Pod {
  3033. immediate := int64(0)
  3034. pod := &v1.Pod{
  3035. ObjectMeta: metav1.ObjectMeta{
  3036. Name: name,
  3037. Namespace: ns,
  3038. },
  3039. Spec: v1.PodSpec{
  3040. Containers: []v1.Container{
  3041. {
  3042. Name: "hostexec",
  3043. Image: imageutils.GetE2EImage(imageutils.Hostexec),
  3044. ImagePullPolicy: v1.PullIfNotPresent,
  3045. },
  3046. },
  3047. HostNetwork: true,
  3048. SecurityContext: &v1.PodSecurityContext{},
  3049. TerminationGracePeriodSeconds: &immediate,
  3050. },
  3051. }
  3052. return pod
  3053. }
  3054. // RunHostCmd runs the given cmd in the context of the given pod using `kubectl exec`
  3055. // inside of a shell.
  3056. func RunHostCmd(ns, name, cmd string) (string, error) {
  3057. return RunKubectl("exec", fmt.Sprintf("--namespace=%v", ns), name, "--", "/bin/sh", "-c", cmd)
  3058. }
  3059. // RunHostCmdOrDie calls RunHostCmd and dies on error.
  3060. func RunHostCmdOrDie(ns, name, cmd string) string {
  3061. stdout, err := RunHostCmd(ns, name, cmd)
  3062. Logf("stdout: %v", stdout)
  3063. ExpectNoError(err)
  3064. return stdout
  3065. }
  3066. // RunHostCmdWithRetries calls RunHostCmd and retries all errors
  3067. // until it succeeds or the specified timeout expires.
  3068. // This can be used with idempotent commands to deflake transient Node issues.
  3069. func RunHostCmdWithRetries(ns, name, cmd string, interval, timeout time.Duration) (string, error) {
  3070. start := time.Now()
  3071. for {
  3072. out, err := RunHostCmd(ns, name, cmd)
  3073. if err == nil {
  3074. return out, nil
  3075. }
  3076. if elapsed := time.Since(start); elapsed > timeout {
  3077. return out, fmt.Errorf("RunHostCmd still failed after %v: %v", elapsed, err)
  3078. }
  3079. Logf("Waiting %v to retry failed RunHostCmd: %v", interval, err)
  3080. time.Sleep(interval)
  3081. }
  3082. }
  3083. // LaunchHostExecPod launches a hostexec pod in the given namespace and waits
  3084. // until it's Running
  3085. func LaunchHostExecPod(client clientset.Interface, ns, name string) *v1.Pod {
  3086. hostExecPod := NewHostExecPodSpec(ns, name)
  3087. pod, err := client.CoreV1().Pods(ns).Create(hostExecPod)
  3088. ExpectNoError(err)
  3089. err = WaitForPodRunningInNamespace(client, pod)
  3090. ExpectNoError(err)
  3091. return pod
  3092. }
  3093. // newExecPodSpec returns the pod spec of exec pod
  3094. func newExecPodSpec(ns, generateName string) *v1.Pod {
  3095. immediate := int64(0)
  3096. pod := &v1.Pod{
  3097. ObjectMeta: metav1.ObjectMeta{
  3098. GenerateName: generateName,
  3099. Namespace: ns,
  3100. },
  3101. Spec: v1.PodSpec{
  3102. TerminationGracePeriodSeconds: &immediate,
  3103. Containers: []v1.Container{
  3104. {
  3105. Name: "exec",
  3106. Image: BusyBoxImage,
  3107. Command: []string{"sh", "-c", "trap exit TERM; while true; do sleep 5; done"},
  3108. },
  3109. },
  3110. },
  3111. }
  3112. return pod
  3113. }
  3114. // CreateExecPodOrFail creates a simple busybox pod in a sleep loop used as a
  3115. // vessel for kubectl exec commands.
  3116. // Returns the name of the created pod.
  3117. func CreateExecPodOrFail(client clientset.Interface, ns, generateName string, tweak func(*v1.Pod)) string {
  3118. Logf("Creating new exec pod")
  3119. execPod := newExecPodSpec(ns, generateName)
  3120. if tweak != nil {
  3121. tweak(execPod)
  3122. }
  3123. created, err := client.CoreV1().Pods(ns).Create(execPod)
  3124. Expect(err).NotTo(HaveOccurred())
  3125. err = wait.PollImmediate(Poll, 5*time.Minute, func() (bool, error) {
  3126. retrievedPod, err := client.CoreV1().Pods(execPod.Namespace).Get(created.Name, metav1.GetOptions{})
  3127. if err != nil {
  3128. if testutils.IsRetryableAPIError(err) {
  3129. return false, nil
  3130. }
  3131. return false, err
  3132. }
  3133. return retrievedPod.Status.Phase == v1.PodRunning, nil
  3134. })
  3135. Expect(err).NotTo(HaveOccurred())
  3136. return created.Name
  3137. }
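// CreatePodOrFail creates a single pause-container pod with the given name, labels and container
// ports in namespace ns, failing the test on error.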
  3138. func CreatePodOrFail(c clientset.Interface, ns, name string, labels map[string]string, containerPorts []v1.ContainerPort) {
  3139. By(fmt.Sprintf("Creating pod %s in namespace %s", name, ns))
  3140. pod := &v1.Pod{
  3141. ObjectMeta: metav1.ObjectMeta{
  3142. Name: name,
  3143. Labels: labels,
  3144. },
  3145. Spec: v1.PodSpec{
  3146. Containers: []v1.Container{
  3147. {
  3148. Name: "pause",
  3149. Image: imageutils.GetPauseImageName(),
  3150. Ports: containerPorts,
  3151. // Add a dummy environment variable to work around a docker issue.
  3152. // https://github.com/docker/docker/issues/14203
  3153. Env: []v1.EnvVar{{Name: "FOO", Value: " "}},
  3154. },
  3155. },
  3156. },
  3157. }
  3158. _, err := c.CoreV1().Pods(ns).Create(pod)
  3159. Expect(err).NotTo(HaveOccurred())
  3160. }
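// DeletePodOrFail deletes the named pod in namespace ns, failing the test on error.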
  3161. func DeletePodOrFail(c clientset.Interface, ns, name string) {
  3162. By(fmt.Sprintf("Deleting pod %s in namespace %s", name, ns))
  3163. err := c.CoreV1().Pods(ns).Delete(name, nil)
  3164. Expect(err).NotTo(HaveOccurred())
  3165. }
  3166. // GetSigner returns an ssh.Signer for the provider ("gce", etc.) that can be
  3167. // used to SSH to their nodes.
  3168. func GetSigner(provider string) (ssh.Signer, error) {
  3169. // Get the directory in which SSH keys are located.
  3170. keydir := filepath.Join(os.Getenv("HOME"), ".ssh")
  3171. // Select the key itself to use. When implementing more providers here,
  3172. // please also add them to any SSH tests that are disabled because of signer
  3173. // support.
  3174. keyfile := ""
  3175. key := ""
  3176. switch provider {
  3177. case "gce", "gke", "kubemark":
  3178. keyfile = "google_compute_engine"
  3179. case "aws":
  3180. // If there is an env. variable override, use that.
  3181. awsKeyfile := os.Getenv("AWS_SSH_KEY")
  3182. if len(awsKeyfile) != 0 {
  3183. return sshutil.MakePrivateKeySignerFromFile(awsKeyfile)
  3184. }
  3185. // Otherwise revert to home dir
  3186. keyfile = "kube_aws_rsa"
  3187. case "local", "vsphere":
  3188. keyfile = os.Getenv("LOCAL_SSH_KEY") // maybe?
  3189. if len(keyfile) == 0 {
  3190. keyfile = "id_rsa"
  3191. }
  3192. case "skeleton":
  3193. keyfile = os.Getenv("KUBE_SSH_KEY")
  3194. if len(keyfile) == 0 {
  3195. keyfile = "id_rsa"
  3196. }
  3197. default:
  3198. return nil, fmt.Errorf("GetSigner(...) not implemented for %s", provider)
  3199. }
  3200. if len(key) == 0 {
  3201. key = filepath.Join(keydir, keyfile)
  3202. }
  3203. return sshutil.MakePrivateKeySignerFromFile(key)
  3204. }
  3205. // CheckPodsRunningReady returns whether all pods whose names are listed in
  3206. // podNames in namespace ns are running and ready, using c and waiting at most
  3207. // timeout.
  3208. func CheckPodsRunningReady(c clientset.Interface, ns string, podNames []string, timeout time.Duration) bool {
  3209. return CheckPodsCondition(c, ns, podNames, timeout, testutils.PodRunningReady, "running and ready")
  3210. }
  3211. // CheckPodsRunningReadyOrSucceeded returns whether all pods whose names are
  3212. // listed in podNames in namespace ns are running and ready, or succeeded,
  3213. // using c and waiting at most timeout.
  3214. func CheckPodsRunningReadyOrSucceeded(c clientset.Interface, ns string, podNames []string, timeout time.Duration) bool {
  3215. return CheckPodsCondition(c, ns, podNames, timeout, testutils.PodRunningReadyOrSucceeded, "running and ready, or succeeded")
  3216. }
  3217. // CheckPodsCondition returns whether all pods whose names are listed in podNames
  3218. // in namespace ns are in the condition, using c and waiting at most timeout.
  3219. func CheckPodsCondition(c clientset.Interface, ns string, podNames []string, timeout time.Duration, condition podCondition, desc string) bool {
  3220. np := len(podNames)
  3221. Logf("Waiting up to %v for %d pods to be %s: %s", timeout, np, desc, podNames)
  3222. type waitPodResult struct {
  3223. success bool
  3224. podName string
  3225. }
  3226. result := make(chan waitPodResult, len(podNames))
  3227. for _, podName := range podNames {
  3228. // Launch off pod readiness checkers.
  3229. go func(name string) {
  3230. err := WaitForPodCondition(c, ns, name, desc, timeout, condition)
  3231. result <- waitPodResult{err == nil, name}
  3232. }(podName)
  3233. }
  3234. // Wait for them all to finish.
  3235. success := true
  3236. for range podNames {
  3237. res := <-result
  3238. if !res.success {
  3239. Logf("Pod %[1]s failed to be %[2]s.", res.podName, desc)
  3240. success = false
  3241. }
  3242. }
  3243. Logf("Wanted all %d pods to be %s. Result: %t. Pods: %v", np, desc, success, podNames)
  3244. return success
  3245. }
  3246. // WaitForNodeToBeReady returns whether node name is ready within timeout.
  3247. func WaitForNodeToBeReady(c clientset.Interface, name string, timeout time.Duration) bool {
  3248. return WaitForNodeToBe(c, name, v1.NodeReady, true, timeout)
  3249. }
  3250. // WaitForNodeToBeNotReady returns whether node name is not ready (i.e. the
  3251. // readiness condition is anything but ready, e.g false or unknown) within
  3252. // timeout.
  3253. func WaitForNodeToBeNotReady(c clientset.Interface, name string, timeout time.Duration) bool {
  3254. return WaitForNodeToBe(c, name, v1.NodeReady, false, timeout)
  3255. }
  3256. func isNodeConditionSetAsExpected(node *v1.Node, conditionType v1.NodeConditionType, wantTrue, silent bool) bool {
  3257. // Check the node readiness condition (logging all).
  3258. for _, cond := range node.Status.Conditions {
  3259. // Ensure that the condition type and the status matches as desired.
  3260. if cond.Type == conditionType {
  3261. // For NodeReady condition we need to check Taints as well
  3262. if cond.Type == v1.NodeReady {
  3263. hasNodeControllerTaints := false
  3264. // For NodeReady we need to check if Taints are gone as well
  3265. taints := node.Spec.Taints
  3266. for _, taint := range taints {
  3267. if taint.MatchTaint(nodectlr.UnreachableTaintTemplate) || taint.MatchTaint(nodectlr.NotReadyTaintTemplate) {
  3268. hasNodeControllerTaints = true
  3269. break
  3270. }
  3271. }
  3272. if wantTrue {
  3273. if (cond.Status == v1.ConditionTrue) && !hasNodeControllerTaints {
  3274. return true
  3275. } else {
  3276. msg := ""
  3277. if !hasNodeControllerTaints {
  3278. msg = fmt.Sprintf("Condition %s of node %s is %v instead of %t. Reason: %v, message: %v",
  3279. conditionType, node.Name, cond.Status == v1.ConditionTrue, wantTrue, cond.Reason, cond.Message)
  3280. } else {
  3281. msg = fmt.Sprintf("Condition %s of node %s is %v, but Node is tainted by NodeController with %v. Failure",
  3282. conditionType, node.Name, cond.Status == v1.ConditionTrue, taints)
  3283. }
  3284. if !silent {
  3285. Logf(msg)
  3286. }
  3287. return false
  3288. }
  3289. } else {
  3290. // TODO: check if the Node is tainted once we enable NC notReady/unreachable taints by default
  3291. if cond.Status != v1.ConditionTrue {
  3292. return true
  3293. }
  3294. if !silent {
  3295. Logf("Condition %s of node %s is %v instead of %t. Reason: %v, message: %v",
  3296. conditionType, node.Name, cond.Status == v1.ConditionTrue, wantTrue, cond.Reason, cond.Message)
  3297. }
  3298. return false
  3299. }
  3300. }
  3301. if (wantTrue && (cond.Status == v1.ConditionTrue)) || (!wantTrue && (cond.Status != v1.ConditionTrue)) {
  3302. return true
  3303. } else {
  3304. if !silent {
  3305. Logf("Condition %s of node %s is %v instead of %t. Reason: %v, message: %v",
  3306. conditionType, node.Name, cond.Status == v1.ConditionTrue, wantTrue, cond.Reason, cond.Message)
  3307. }
  3308. return false
  3309. }
  3310. }
  3311. }
  3312. if !silent {
  3313. Logf("Couldn't find condition %v on node %v", conditionType, node.Name)
  3314. }
  3315. return false
  3316. }
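// IsNodeConditionSetAsExpected returns whether the node's condition of the given type matches
// wantTrue, logging the mismatch when it does not.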
  3317. func IsNodeConditionSetAsExpected(node *v1.Node, conditionType v1.NodeConditionType, wantTrue bool) bool {
  3318. return isNodeConditionSetAsExpected(node, conditionType, wantTrue, false)
  3319. }
  3320. func IsNodeConditionSetAsExpectedSilent(node *v1.Node, conditionType v1.NodeConditionType, wantTrue bool) bool {
  3321. return isNodeConditionSetAsExpected(node, conditionType, wantTrue, true)
  3322. }
  3323. func IsNodeConditionUnset(node *v1.Node, conditionType v1.NodeConditionType) bool {
  3324. for _, cond := range node.Status.Conditions {
  3325. if cond.Type == conditionType {
  3326. return false
  3327. }
  3328. }
  3329. return true
  3330. }
  3331. // WaitForNodeToBe returns whether the named node's condition state matches wantTrue
  3332. // within timeout. If wantTrue is true, it will ensure the node condition status
  3333. // is ConditionTrue; if it's false, it ensures the node condition is in any state
  3334. // other than ConditionTrue (e.g. not true or unknown).
  3335. func WaitForNodeToBe(c clientset.Interface, name string, conditionType v1.NodeConditionType, wantTrue bool, timeout time.Duration) bool {
  3336. Logf("Waiting up to %v for node %s condition %s to be %t", timeout, name, conditionType, wantTrue)
  3337. for start := time.Now(); time.Since(start) < timeout; time.Sleep(Poll) {
  3338. node, err := c.CoreV1().Nodes().Get(name, metav1.GetOptions{})
  3339. if err != nil {
  3340. Logf("Couldn't get node %s", name)
  3341. continue
  3342. }
  3343. if IsNodeConditionSetAsExpected(node, conditionType, wantTrue) {
  3344. return true
  3345. }
  3346. }
  3347. Logf("Node %s didn't reach desired %s condition status (%t) within %v", name, conditionType, wantTrue, timeout)
  3348. return false
  3349. }
  3350. // Checks whether all registered nodes are ready.
  3351. // TODO: we should change the AllNodesReady call in AfterEach to WaitForAllNodesHealthy,
  3352. // and figure out how to do it in a configurable way, as we can't expect all setups to run
  3353. // default test add-ons.
  3354. func AllNodesReady(c clientset.Interface, timeout time.Duration) error {
  3355. Logf("Waiting up to %v for all (but %d) nodes to be ready", timeout, TestContext.AllowedNotReadyNodes)
  3356. var notReady []*v1.Node
  3357. err := wait.PollImmediate(Poll, timeout, func() (bool, error) {
  3358. notReady = nil
  3359. // It should be OK to list unschedulable Nodes here.
  3360. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
  3361. if err != nil {
  3362. if testutils.IsRetryableAPIError(err) {
  3363. return false, nil
  3364. }
  3365. return false, err
  3366. }
  3367. for i := range nodes.Items {
  3368. node := &nodes.Items[i]
  3369. if !IsNodeConditionSetAsExpected(node, v1.NodeReady, true) {
  3370. notReady = append(notReady, node)
  3371. }
  3372. }
  3373. // Framework allows for <TestContext.AllowedNotReadyNodes> nodes to be non-ready,
  3374. // to make it possible e.g. for incorrect deployment of some small percentage
  3375. // of nodes (which we allow in cluster validation). Some nodes that are not
  3376. // provisioned correctly at startup will never become ready (e.g. when something
  3377. // won't install correctly), so we can't expect them to be ready at any point.
  3378. return len(notReady) <= TestContext.AllowedNotReadyNodes, nil
  3379. })
  3380. if err != nil && err != wait.ErrWaitTimeout {
  3381. return err
  3382. }
  3383. if len(notReady) > TestContext.AllowedNotReadyNodes {
  3384. msg := ""
  3385. for _, node := range notReady {
  3386. msg = fmt.Sprintf("%s, %s", msg, node.Name)
  3387. }
  3388. return fmt.Errorf("Not ready nodes: %#v", msg)
  3389. }
  3390. return nil
  3391. }
  3392. // WaitForAllNodesHealthy checks whether all registered nodes are ready and all required Pods are running on them.
  3393. func WaitForAllNodesHealthy(c clientset.Interface, timeout time.Duration) error {
  3394. Logf("Waiting up to %v for all nodes to be ready", timeout)
  3395. var notReady []v1.Node
  3396. var missingPodsPerNode map[string][]string
  3397. err := wait.PollImmediate(Poll, timeout, func() (bool, error) {
  3398. notReady = nil
  3399. // It should be OK to list unschedulable Nodes here.
  3400. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{ResourceVersion: "0"})
  3401. if err != nil {
  3402. if testutils.IsRetryableAPIError(err) {
  3403. return false, nil
  3404. }
  3405. return false, err
  3406. }
  3407. for _, node := range nodes.Items {
  3408. if !IsNodeConditionSetAsExpected(&node, v1.NodeReady, true) {
  3409. notReady = append(notReady, node)
  3410. }
  3411. }
  3412. pods, err := c.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{ResourceVersion: "0"})
  3413. if err != nil {
  3414. return false, err
  3415. }
  3416. systemPodsPerNode := make(map[string][]string)
  3417. for _, pod := range pods.Items {
  3418. if pod.Namespace == metav1.NamespaceSystem && pod.Status.Phase == v1.PodRunning {
  3419. if pod.Spec.NodeName != "" {
  3420. systemPodsPerNode[pod.Spec.NodeName] = append(systemPodsPerNode[pod.Spec.NodeName], pod.Name)
  3421. }
  3422. }
  3423. }
  3424. missingPodsPerNode = make(map[string][]string)
  3425. for _, node := range nodes.Items {
  3426. if !system.IsMasterNode(node.Name) {
  3427. for _, requiredPod := range requiredPerNodePods {
  3428. foundRequired := false
  3429. for _, presentPod := range systemPodsPerNode[node.Name] {
  3430. if requiredPod.MatchString(presentPod) {
  3431. foundRequired = true
  3432. break
  3433. }
  3434. }
  3435. if !foundRequired {
  3436. missingPodsPerNode[node.Name] = append(missingPodsPerNode[node.Name], requiredPod.String())
  3437. }
  3438. }
  3439. }
  3440. }
  3441. return len(notReady) == 0 && len(missingPodsPerNode) == 0, nil
  3442. })
  3443. if err != nil && err != wait.ErrWaitTimeout {
  3444. return err
  3445. }
  3446. if len(notReady) > 0 {
  3447. return fmt.Errorf("Not ready nodes: %v", notReady)
  3448. }
  3449. if len(missingPodsPerNode) > 0 {
  3450. return fmt.Errorf("Not running system Pods: %v", missingPodsPerNode)
  3451. }
  3452. return nil
  3453. }
  3454. // Filters nodes in NodeList in place, removing nodes that do not
  3455. // satisfy the given condition
  3456. // TODO: consider merging with pkg/client/cache.NodeLister
  3457. func FilterNodes(nodeList *v1.NodeList, fn func(node v1.Node) bool) {
  3458. var l []v1.Node
  3459. for _, node := range nodeList.Items {
  3460. if fn(node) {
  3461. l = append(l, node)
  3462. }
  3463. }
  3464. nodeList.Items = l
  3465. }
  3466. // ParseKVLines parses output that looks like lines containing "<key>: <val>"
  3467. // and returns <val> if <key> is found. Otherwise, it returns the empty string.
  3468. func ParseKVLines(output, key string) string {
  3469. delim := ":"
  3470. key = key + delim
  3471. for _, line := range strings.Split(output, "\n") {
  3472. pieces := strings.SplitAfterN(line, delim, 2)
  3473. if len(pieces) != 2 {
  3474. continue
  3475. }
  3476. k, v := pieces[0], pieces[1]
  3477. if k == key {
  3478. return strings.TrimSpace(v)
  3479. }
  3480. }
  3481. return ""
  3482. }
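// RestartKubeProxy kills kube-proxy on the given host over SSH and waits for the kubelet to bring
// it back up from its static pod. Only gce, gke and aws are supported.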
  3483. func RestartKubeProxy(host string) error {
  3484. // TODO: Make it work for all providers.
  3485. if !ProviderIs("gce", "gke", "aws") {
  3486. return fmt.Errorf("unsupported provider: %s", TestContext.Provider)
  3487. }
  3488. // kubelet will restart the kube-proxy since it's running in a static pod
  3489. Logf("Killing kube-proxy on node %v", host)
  3490. result, err := SSH("sudo pkill kube-proxy", host, TestContext.Provider)
  3491. if err != nil || result.Code != 0 {
  3492. LogSSHResult(result)
  3493. return fmt.Errorf("couldn't restart kube-proxy: %v", err)
  3494. }
  3495. // wait for kube-proxy to come back up
  3496. sshCmd := "sudo /bin/sh -c 'pgrep kube-proxy | wc -l'"
  3497. err = wait.Poll(5*time.Second, 60*time.Second, func() (bool, error) {
  3498. Logf("Waiting for kubeproxy to come back up with %v on %v", sshCmd, host)
  3499. result, err := SSH(sshCmd, host, TestContext.Provider)
  3500. if err != nil {
  3501. return false, err
  3502. }
  3503. if result.Code != 0 {
  3504. LogSSHResult(result)
  3505. return false, fmt.Errorf("failed to run command, exited %d", result.Code)
  3506. }
  3507. if result.Stdout == "0\n" {
  3508. return false, nil
  3509. }
  3510. Logf("kube-proxy is back up.")
  3511. return true, nil
  3512. })
  3513. if err != nil {
  3514. return fmt.Errorf("kube-proxy didn't recover: %v", err)
  3515. }
  3516. return nil
  3517. }
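// RestartKubelet restarts the kubelet on the given host over SSH, choosing the restart command
// based on the provider and node OS distro.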
  3518. func RestartKubelet(host string) error {
  3519. // TODO: Make it work for all providers and distros.
  3520. supportedProviders := []string{"gce", "aws", "vsphere"}
  3521. if !ProviderIs(supportedProviders...) {
  3522. return fmt.Errorf("unsupported provider: %s, supported providers are: %v", TestContext.Provider, supportedProviders)
  3523. }
  3524. if ProviderIs("gce") && !NodeOSDistroIs("debian", "gci") {
  3525. return fmt.Errorf("unsupported node OS distro: %s", TestContext.NodeOSDistro)
  3526. }
  3527. var cmd string
  3528. if ProviderIs("gce") && NodeOSDistroIs("debian") {
  3529. cmd = "sudo /etc/init.d/kubelet restart"
  3530. } else if ProviderIs("vsphere") {
  3531. var sudoPresent bool
  3532. sshResult, err := SSH("sudo --version", host, TestContext.Provider)
  3533. if err != nil {
  3534. return fmt.Errorf("Unable to ssh to host %s with error %v", host, err)
  3535. }
  3536. if !strings.Contains(sshResult.Stderr, "command not found") {
  3537. sudoPresent = true
  3538. }
  3539. sshResult, err = SSH("systemctl --version", host, TestContext.Provider)
  3540. if !strings.Contains(sshResult.Stderr, "command not found") {
  3541. cmd = "systemctl restart kubelet"
  3542. } else {
  3543. cmd = "service kubelet restart"
  3544. }
  3545. if sudoPresent {
  3546. cmd = fmt.Sprintf("sudo %s", cmd)
  3547. }
  3548. } else {
  3549. cmd = "sudo systemctl restart kubelet"
  3550. }
  3551. Logf("Restarting kubelet via ssh on host %s with command %s", host, cmd)
  3552. result, err := SSH(cmd, host, TestContext.Provider)
  3553. if err != nil || result.Code != 0 {
  3554. LogSSHResult(result)
  3555. return fmt.Errorf("couldn't restart kubelet: %v", err)
  3556. }
  3557. return nil
  3558. }
  3559. func WaitForKubeletUp(host string) error {
  3560. cmd := "curl http://localhost:" + strconv.Itoa(ports.KubeletReadOnlyPort) + "/healthz"
  3561. for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
  3562. result, err := SSH(cmd, host, TestContext.Provider)
  3563. if err != nil || result.Code != 0 {
  3564. LogSSHResult(result)
  3565. }
  3566. if result.Stdout == "ok" {
  3567. return nil
  3568. }
  3569. }
  3570. return fmt.Errorf("waiting for kubelet timed out")
  3571. }
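// RestartApiserver restarts the kube-apiserver: over SSH on gce and aws, or via a same-version
// master upgrade on gke (which does not allow SSH access).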
  3572. func RestartApiserver(cs clientset.Interface) error {
  3573. // TODO: Make it work for all providers.
  3574. if !ProviderIs("gce", "gke", "aws") {
  3575. return fmt.Errorf("unsupported provider: %s", TestContext.Provider)
  3576. }
  3577. if ProviderIs("gce", "aws") {
  3578. initialRestartCount, err := getApiserverRestartCount(cs)
  3579. if err != nil {
  3580. return fmt.Errorf("failed to get apiserver's restart count: %v", err)
  3581. }
  3582. if err := sshRestartMaster(); err != nil {
  3583. return fmt.Errorf("failed to restart apiserver: %v", err)
  3584. }
  3585. return waitForApiserverRestarted(cs, initialRestartCount)
  3586. }
  3587. // GKE doesn't allow ssh access, so use a same-version master
  3588. // upgrade to teardown/recreate master.
  3589. v, err := cs.Discovery().ServerVersion()
  3590. if err != nil {
  3591. return err
  3592. }
  3593. return masterUpgradeGKE(v.GitVersion[1:]) // strip leading 'v'
  3594. }
  3595. func sshRestartMaster() error {
  3596. if !ProviderIs("gce", "aws") {
  3597. return fmt.Errorf("unsupported provider: %s", TestContext.Provider)
  3598. }
  3599. var command string
  3600. if ProviderIs("gce") {
  3601. command = "pidof kube-apiserver | xargs sudo kill"
  3602. } else {
  3603. command = "sudo /etc/init.d/kube-apiserver restart"
  3604. }
  3605. Logf("Restarting master via ssh, running: %v", command)
  3606. result, err := SSH(command, net.JoinHostPort(GetMasterHost(), sshPort), TestContext.Provider)
  3607. if err != nil || result.Code != 0 {
  3608. LogSSHResult(result)
  3609. return fmt.Errorf("couldn't restart apiserver: %v", err)
  3610. }
  3611. return nil
  3612. }
  3613. func WaitForApiserverUp(c clientset.Interface) error {
  3614. for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
  3615. body, err := c.CoreV1().RESTClient().Get().AbsPath("/healthz").Do().Raw()
  3616. if err == nil && string(body) == "ok" {
  3617. return nil
  3618. }
  3619. }
  3620. return fmt.Errorf("waiting for apiserver timed out")
  3621. }
  3622. // waitForApiserverRestarted waits until the apiserver's restart count increases.
  3623. func waitForApiserverRestarted(c clientset.Interface, initialRestartCount int32) error {
  3624. for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
  3625. restartCount, err := getApiserverRestartCount(c)
  3626. if err != nil {
  3627. Logf("Failed to get apiserver's restart count: %v", err)
  3628. continue
  3629. }
  3630. if restartCount > initialRestartCount {
  3631. Logf("Apiserver has restarted.")
  3632. return nil
  3633. }
  3634. Logf("Waiting for apiserver restart count to increase")
  3635. }
  3636. return fmt.Errorf("timed out waiting for apiserver to be restarted")
  3637. }
  3638. func getApiserverRestartCount(c clientset.Interface) (int32, error) {
  3639. label := labels.SelectorFromSet(labels.Set(map[string]string{"component": "kube-apiserver"}))
  3640. listOpts := metav1.ListOptions{LabelSelector: label.String()}
  3641. pods, err := c.CoreV1().Pods(metav1.NamespaceSystem).List(listOpts)
  3642. if err != nil {
  3643. return -1, err
  3644. }
  3645. if len(pods.Items) != 1 {
  3646. return -1, fmt.Errorf("unexpected number of apiserver pod: %d", len(pods.Items))
  3647. }
  3648. for _, s := range pods.Items[0].Status.ContainerStatuses {
  3649. if s.Name != "kube-apiserver" {
  3650. continue
  3651. }
  3652. return s.RestartCount, nil
  3653. }
  3654. return -1, fmt.Errorf("failed to find kube-apiserver container in pod")
  3655. }
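// RestartControllerManager kills kube-controller-manager on the master over SSH so that it gets
// restarted. Only gce and aws are supported.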
  3656. func RestartControllerManager() error {
  3657. // TODO: Make it work for all providers and distros.
  3658. if !ProviderIs("gce", "aws") {
  3659. return fmt.Errorf("unsupported provider: %s", TestContext.Provider)
  3660. }
  3661. if ProviderIs("gce") && !MasterOSDistroIs("gci") {
  3662. return fmt.Errorf("unsupported master OS distro: %s", TestContext.MasterOSDistro)
  3663. }
  3664. cmd := "pidof kube-controller-manager | xargs sudo kill"
  3665. Logf("Restarting controller-manager via ssh, running: %v", cmd)
  3666. result, err := SSH(cmd, net.JoinHostPort(GetMasterHost(), sshPort), TestContext.Provider)
  3667. if err != nil || result.Code != 0 {
  3668. LogSSHResult(result)
  3669. return fmt.Errorf("couldn't restart controller-manager: %v", err)
  3670. }
  3671. return nil
  3672. }
  3673. func WaitForControllerManagerUp() error {
  3674. cmd := "curl http://localhost:" + strconv.Itoa(ports.InsecureKubeControllerManagerPort) + "/healthz"
  3675. for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
  3676. result, err := SSH(cmd, net.JoinHostPort(GetMasterHost(), sshPort), TestContext.Provider)
  3677. if err != nil || result.Code != 0 {
  3678. LogSSHResult(result)
  3679. }
  3680. if result.Stdout == "ok" {
  3681. return nil
  3682. }
  3683. }
  3684. return fmt.Errorf("waiting for controller-manager timed out")
  3685. }
  3686. // CheckForControllerManagerHealthy checks that the controller manager does not crash within "duration"
  3687. func CheckForControllerManagerHealthy(duration time.Duration) error {
  3688. var PID string
  3689. cmd := "pidof kube-controller-manager"
  3690. for start := time.Now(); time.Since(start) < duration; time.Sleep(5 * time.Second) {
  3691. result, err := SSH(cmd, net.JoinHostPort(GetMasterHost(), sshPort), TestContext.Provider)
  3692. if err != nil {
  3693. // We don't necessarily know that it crashed, pipe could just be broken
  3694. LogSSHResult(result)
  3695. return fmt.Errorf("master unreachable after %v", time.Since(start))
  3696. } else if result.Code != 0 {
  3697. LogSSHResult(result)
  3698. return fmt.Errorf("SSH result code not 0. actually: %v after %v", result.Code, time.Since(start))
  3699. } else if result.Stdout != PID {
  3700. if PID == "" {
  3701. PID = result.Stdout
  3702. } else {
  3703. // the PID changed, so the controller manager has crashed at least once
  3704. return fmt.Errorf("controller manager crashed, old PID: %s, new PID: %s", PID, result.Stdout)
  3705. }
  3706. } else {
  3707. Logf("kube-controller-manager still healthy after %v", time.Since(start))
  3708. }
  3709. }
  3710. return nil
  3711. }
  3712. // NumberOfRegisteredNodes returns number of registered Nodes excluding Master Node.
  3713. func NumberOfRegisteredNodes(c clientset.Interface) (int, error) {
  3714. nodes, err := waitListSchedulableNodes(c)
  3715. if err != nil {
  3716. Logf("Failed to list nodes: %v", err)
  3717. return 0, err
  3718. }
  3719. return len(nodes.Items), nil
  3720. }
  3721. // NumberOfReadyNodes returns number of ready Nodes excluding Master Node.
  3722. func NumberOfReadyNodes(c clientset.Interface) (int, error) {
  3723. nodes, err := waitListSchedulableNodes(c)
  3724. if err != nil {
  3725. Logf("Failed to list nodes: %v", err)
  3726. return 0, err
  3727. }
  3728. // Filter out not-ready nodes.
  3729. FilterNodes(nodes, func(node v1.Node) bool {
  3730. return IsNodeConditionSetAsExpected(&node, v1.NodeReady, true)
  3731. })
  3732. return len(nodes.Items), nil
  3733. }
  3734. // CheckNodesReady waits up to timeout for the cluster to reach the desired size and
  3735. // for all nodes in it to be ready. By cluster size we mean the number of Nodes
  3736. // excluding the Master Node.
  3737. func CheckNodesReady(c clientset.Interface, size int, timeout time.Duration) ([]v1.Node, error) {
  3738. for start := time.Now(); time.Since(start) < timeout; time.Sleep(20 * time.Second) {
  3739. nodes, err := waitListSchedulableNodes(c)
  3740. if err != nil {
  3741. Logf("Failed to list nodes: %v", err)
  3742. continue
  3743. }
  3744. numNodes := len(nodes.Items)
  3745. // Filter out not-ready nodes.
  3746. FilterNodes(nodes, func(node v1.Node) bool {
  3747. return IsNodeConditionSetAsExpected(&node, v1.NodeReady, true)
  3748. })
  3749. numReady := len(nodes.Items)
  3750. if numNodes == size && numReady == size {
  3751. Logf("Cluster has reached the desired number of ready nodes %d", size)
  3752. return nodes.Items, nil
  3753. }
  3754. Logf("Waiting for ready nodes %d, current ready %d, not ready nodes %d", size, numReady, numNodes-numReady)
  3755. }
  3756. return nil, fmt.Errorf("timeout waiting %v for number of ready nodes to be %d", timeout, size)
  3757. }
  3758. // WaitForReadyNodes waits up to timeout for the cluster to reach the desired size and
  3759. // for all nodes in it to be ready. By cluster size we mean the number of Nodes
  3760. // excluding the Master Node.
  3761. func WaitForReadyNodes(c clientset.Interface, size int, timeout time.Duration) error {
  3762. _, err := CheckNodesReady(c, size, timeout)
  3763. return err
  3764. }
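// GenerateMasterRegexp returns a regexp that matches the master node name: the given prefix,
// optionally followed by "-" and three more characters (for HA master replicas).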
  3765. func GenerateMasterRegexp(prefix string) string {
  3766. return prefix + "(-...)?"
  3767. }
  3768. // WaitForMasters waits until the cluster has the desired number of ready masters in it.
  3769. func WaitForMasters(masterPrefix string, c clientset.Interface, size int, timeout time.Duration) error {
  3770. for start := time.Now(); time.Since(start) < timeout; time.Sleep(20 * time.Second) {
  3771. nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
  3772. if err != nil {
  3773. Logf("Failed to list nodes: %v", err)
  3774. continue
  3775. }
  3776. // Filter out nodes that are not master replicas
  3777. FilterNodes(nodes, func(node v1.Node) bool {
  3778. res, err := regexp.Match(GenerateMasterRegexp(masterPrefix), ([]byte)(node.Name))
  3779. if err != nil {
  3780. Logf("Failed to match regexp to node name: %v", err)
  3781. return false
  3782. }
  3783. return res
  3784. })
  3785. numNodes := len(nodes.Items)
  3786. // Filter out not-ready nodes.
  3787. FilterNodes(nodes, func(node v1.Node) bool {
  3788. return IsNodeConditionSetAsExpected(&node, v1.NodeReady, true)
  3789. })
  3790. numReady := len(nodes.Items)
  3791. if numNodes == size && numReady == size {
  3792. Logf("Cluster has reached the desired number of masters %d", size)
  3793. return nil
  3794. }
  3795. Logf("Waiting for the number of masters %d, current %d, not ready master nodes %d", size, numNodes, numNodes-numReady)
  3796. }
  3797. return fmt.Errorf("timeout waiting %v for the number of masters to be %d", timeout, size)
  3798. }
  3799. // GetHostExternalAddress gets the node for a pod and returns the first External
  3800. // address. Returns an error if the node the pod is on doesn't have an External
  3801. // address.
  3802. func GetHostExternalAddress(client clientset.Interface, p *v1.Pod) (externalAddress string, err error) {
  3803. node, err := client.CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{})
  3804. if err != nil {
  3805. return "", err
  3806. }
  3807. for _, address := range node.Status.Addresses {
  3808. if address.Type == v1.NodeExternalIP {
  3809. if address.Address != "" {
  3810. externalAddress = address.Address
  3811. break
  3812. }
  3813. }
  3814. }
  3815. if externalAddress == "" {
  3816. err = fmt.Errorf("No external address for pod %v on node %v",
  3817. p.Name, p.Spec.NodeName)
  3818. }
  3819. return
  3820. }
  3821. type extractRT struct {
  3822. http.Header
  3823. }
  3824. func (rt *extractRT) RoundTrip(req *http.Request) (*http.Response, error) {
  3825. rt.Header = req.Header
  3826. return &http.Response{}, nil
  3827. }
3828. // headersForConfig extracts the HTTP headers that the client wrappers for the
3829. // provided config would add to a request.
  3830. func headersForConfig(c *restclient.Config) (http.Header, error) {
  3831. extract := &extractRT{}
  3832. rt, err := restclient.HTTPWrappersForConfig(c, extract)
  3833. if err != nil {
  3834. return nil, err
  3835. }
  3836. if _, err := rt.RoundTrip(&http.Request{}); err != nil {
  3837. return nil, err
  3838. }
  3839. return extract.Header, nil
  3840. }
  3841. // OpenWebSocketForURL constructs a websocket connection to the provided URL, using the client
  3842. // config, with the specified protocols.
  3843. func OpenWebSocketForURL(url *url.URL, config *restclient.Config, protocols []string) (*websocket.Conn, error) {
  3844. tlsConfig, err := restclient.TLSConfigFor(config)
  3845. if err != nil {
  3846. return nil, fmt.Errorf("failed to create tls config: %v", err)
  3847. }
  3848. if tlsConfig != nil {
  3849. url.Scheme = "wss"
  3850. if !strings.Contains(url.Host, ":") {
  3851. url.Host += ":443"
  3852. }
  3853. } else {
  3854. url.Scheme = "ws"
  3855. if !strings.Contains(url.Host, ":") {
  3856. url.Host += ":80"
  3857. }
  3858. }
  3859. headers, err := headersForConfig(config)
  3860. if err != nil {
  3861. return nil, fmt.Errorf("failed to load http headers: %v", err)
  3862. }
  3863. cfg, err := websocket.NewConfig(url.String(), "http://localhost")
  3864. if err != nil {
  3865. return nil, fmt.Errorf("failed to create websocket config: %v", err)
  3866. }
  3867. cfg.Header = headers
  3868. cfg.TlsConfig = tlsConfig
  3869. cfg.Protocol = protocols
  3870. return websocket.DialConfig(cfg)
  3871. }
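// Example (illustrative sketch, not part of the original file): dialing a websocket
// against a URL built by the caller. The helper name is a placeholder; the
// "base64.channel.k8s.io" subprotocol is the one the existing exec/attach e2e tests
// use, assumed here purely for illustration.
func dialWebSocketExample(u *url.URL, config *restclient.Config) error {
	ws, err := OpenWebSocketForURL(u, config, []string{"base64.channel.k8s.io"})
	if err != nil {
		return err
	}
	defer ws.Close()
	// Read/write frames on ws as needed by the caller.
	return nil
}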
  3872. // Looks for the given string in the log of a specific pod container
  3873. func LookForStringInLog(ns, podName, container, expectedString string, timeout time.Duration) (result string, err error) {
  3874. return LookForString(expectedString, timeout, func() string {
  3875. return RunKubectlOrDie("logs", podName, container, fmt.Sprintf("--namespace=%v", ns))
  3876. })
  3877. }
  3878. // Looks for the given string in a file in a specific pod container
  3879. func LookForStringInFile(ns, podName, container, file, expectedString string, timeout time.Duration) (result string, err error) {
  3880. return LookForString(expectedString, timeout, func() string {
  3881. return RunKubectlOrDie("exec", podName, "-c", container, fmt.Sprintf("--namespace=%v", ns), "--", "cat", file)
  3882. })
  3883. }
  3884. // Looks for the given string in the output of a command executed in a specific pod container
  3885. func LookForStringInPodExec(ns, podName string, command []string, expectedString string, timeout time.Duration) (result string, err error) {
  3886. return LookForString(expectedString, timeout, func() string {
  3887. // use the first container
  3888. args := []string{"exec", podName, fmt.Sprintf("--namespace=%v", ns), "--"}
  3889. args = append(args, command...)
  3890. return RunKubectlOrDie(args...)
  3891. })
  3892. }
  3893. // Looks for the given string in the output of fn, repeatedly calling fn until
3894. // the timeout is reached or the string is found. Returns the last output and an
3895. // error if the string was not found.
  3896. func LookForString(expectedString string, timeout time.Duration, fn func() string) (result string, err error) {
  3897. for t := time.Now(); time.Since(t) < timeout; time.Sleep(Poll) {
  3898. result = fn()
  3899. if strings.Contains(result, expectedString) {
  3900. return
  3901. }
  3902. }
  3903. err = fmt.Errorf("Failed to find \"%s\", last result: \"%s\"", expectedString, result)
  3904. return
  3905. }
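// Example (illustrative sketch, not part of the original file): polling a pod's log
// for a marker string and failing the test if it never shows up. The names, marker
// string and 2-minute timeout are placeholders.
func waitForLogMarkerExample(ns, podName, containerName string) {
	if _, err := LookForStringInLog(ns, podName, containerName, "server started", 2*time.Minute); err != nil {
		Failf("Did not find expected log line: %v", err)
	}
}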
  3906. // getSvcNodePort returns the node port for the given service:port.
  3907. func getSvcNodePort(client clientset.Interface, ns, name string, svcPort int) (int, error) {
  3908. svc, err := client.CoreV1().Services(ns).Get(name, metav1.GetOptions{})
  3909. if err != nil {
  3910. return 0, err
  3911. }
  3912. for _, p := range svc.Spec.Ports {
  3913. if p.Port == int32(svcPort) {
  3914. if p.NodePort != 0 {
  3915. return int(p.NodePort), nil
  3916. }
  3917. }
  3918. }
  3919. return 0, fmt.Errorf(
  3920. "No node port found for service %v, port %v", name, svcPort)
  3921. }
3922. // GetNodePortURL returns the URL to a NodePort Service.
  3923. func GetNodePortURL(client clientset.Interface, ns, name string, svcPort int) (string, error) {
  3924. nodePort, err := getSvcNodePort(client, ns, name, svcPort)
  3925. if err != nil {
  3926. return "", err
  3927. }
  3928. // This list of nodes must not include the master, which is marked
  3929. // unschedulable, since the master doesn't run kube-proxy. Without
  3930. // kube-proxy NodePorts won't work.
  3931. var nodes *v1.NodeList
  3932. if wait.PollImmediate(Poll, SingleCallTimeout, func() (bool, error) {
  3933. nodes, err = client.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
  3934. "spec.unschedulable": "false",
  3935. }.AsSelector().String()})
  3936. if err != nil {
  3937. if testutils.IsRetryableAPIError(err) {
  3938. return false, nil
  3939. }
  3940. return false, err
  3941. }
  3942. return true, nil
  3943. }) != nil {
  3944. return "", err
  3945. }
  3946. if len(nodes.Items) == 0 {
  3947. return "", fmt.Errorf("Unable to list nodes in cluster.")
  3948. }
  3949. for _, node := range nodes.Items {
  3950. for _, address := range node.Status.Addresses {
  3951. if address.Type == v1.NodeExternalIP {
  3952. if address.Address != "" {
  3953. return fmt.Sprintf("http://%v:%v", address.Address, nodePort), nil
  3954. }
  3955. }
  3956. }
  3957. }
  3958. return "", fmt.Errorf("Failed to find external address for service %v", name)
  3959. }
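// Example (illustrative sketch, not part of the original file): resolving a NodePort
// URL and probing it with the standard library HTTP client. The helper name and the
// 30-second client timeout are placeholder choices.
func probeNodePortServiceExample(c clientset.Interface, ns, svcName string, svcPort int) error {
	u, err := GetNodePortURL(c, ns, svcName, svcPort)
	if err != nil {
		return err
	}
	httpClient := &http.Client{Timeout: 30 * time.Second}
	resp, err := httpClient.Get(u)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status from %s: %s", u, resp.Status)
	}
	return nil
}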
  3960. // TODO(random-liu): Change this to be a member function of the framework.
  3961. func GetPodLogs(c clientset.Interface, namespace, podName, containerName string) (string, error) {
  3962. return getPodLogsInternal(c, namespace, podName, containerName, false)
  3963. }
  3964. func getPreviousPodLogs(c clientset.Interface, namespace, podName, containerName string) (string, error) {
  3965. return getPodLogsInternal(c, namespace, podName, containerName, true)
  3966. }
3967. // getPodLogsInternal fetches the logs of the given container; also usable as a poll function for gomega Eventually.
  3968. func getPodLogsInternal(c clientset.Interface, namespace, podName, containerName string, previous bool) (string, error) {
  3969. logs, err := c.CoreV1().RESTClient().Get().
  3970. Resource("pods").
  3971. Namespace(namespace).
  3972. Name(podName).SubResource("log").
  3973. Param("container", containerName).
  3974. Param("previous", strconv.FormatBool(previous)).
  3975. Do().
  3976. Raw()
  3977. if err != nil {
  3978. return "", err
  3979. }
3980. if strings.Contains(string(logs), "Internal Error") {
3981. return "", fmt.Errorf("Fetched log contains \"Internal Error\": %q.", string(logs))
3982. }
3983. return string(logs), nil
  3984. }
  3985. func GetGCECloud() (*gcecloud.GCECloud, error) {
  3986. gceCloud, ok := TestContext.CloudConfig.Provider.(*gcecloud.GCECloud)
  3987. if !ok {
  3988. return nil, fmt.Errorf("failed to convert CloudConfig.Provider to GCECloud: %#v", TestContext.CloudConfig.Provider)
  3989. }
  3990. return gceCloud, nil
  3991. }
  3992. // EnsureLoadBalancerResourcesDeleted ensures that cloud load balancer resources that were created
  3993. // are actually cleaned up. Currently only implemented for GCE/GKE.
  3994. func EnsureLoadBalancerResourcesDeleted(ip, portRange string) error {
  3995. if TestContext.Provider == "gce" || TestContext.Provider == "gke" {
  3996. return ensureGCELoadBalancerResourcesDeleted(ip, portRange)
  3997. }
  3998. return nil
  3999. }
  4000. func ensureGCELoadBalancerResourcesDeleted(ip, portRange string) error {
  4001. gceCloud, err := GetGCECloud()
  4002. if err != nil {
  4003. return err
  4004. }
  4005. project := TestContext.CloudConfig.ProjectID
  4006. region, err := gcecloud.GetGCERegion(TestContext.CloudConfig.Zone)
  4007. if err != nil {
  4008. return fmt.Errorf("could not get region for zone %q: %v", TestContext.CloudConfig.Zone, err)
  4009. }
  4010. return wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
  4011. service := gceCloud.ComputeServices().GA
  4012. list, err := service.ForwardingRules.List(project, region).Do()
  4013. if err != nil {
  4014. return false, err
  4015. }
  4016. for _, item := range list.Items {
  4017. if item.PortRange == portRange && item.IPAddress == ip {
  4018. Logf("found a load balancer: %v", item)
  4019. return false, nil
  4020. }
  4021. }
  4022. return true, nil
  4023. })
  4024. }
  4025. // The following helper functions can block/unblock network from source
4026. // host to destination host by manipulating iptables rules.
4027. // These functions assume they can ssh to the source host.
  4028. //
  4029. // Caution:
4030. // Prefer passing IP addresses rather than hostnames. With a hostname, iptables
4031. // has to do a DNS lookup to resolve the name to an IP address, which slows down
4032. // the test and causes it to fail if DNS is absent or broken.
  4033. //
  4034. // Suggested usage pattern:
  4035. // func foo() {
  4036. // ...
  4037. // defer UnblockNetwork(from, to)
  4038. // BlockNetwork(from, to)
  4039. // ...
  4040. // }
  4041. //
  4042. func BlockNetwork(from string, to string) {
  4043. Logf("block network traffic from %s to %s", from, to)
  4044. iptablesRule := fmt.Sprintf("OUTPUT --destination %s --jump REJECT", to)
  4045. dropCmd := fmt.Sprintf("sudo iptables --insert %s", iptablesRule)
  4046. if result, err := SSH(dropCmd, from, TestContext.Provider); result.Code != 0 || err != nil {
  4047. LogSSHResult(result)
  4048. Failf("Unexpected error: %v", err)
  4049. }
  4050. }
  4051. func UnblockNetwork(from string, to string) {
  4052. Logf("Unblock network traffic from %s to %s", from, to)
  4053. iptablesRule := fmt.Sprintf("OUTPUT --destination %s --jump REJECT", to)
  4054. undropCmd := fmt.Sprintf("sudo iptables --delete %s", iptablesRule)
  4055. // Undrop command may fail if the rule has never been created.
  4056. // In such case we just lose 30 seconds, but the cluster is healthy.
  4057. // But if the rule had been created and removing it failed, the node is broken and
4058. // not coming back. Subsequent tests will run on fewer nodes (some of the tests
  4059. // may fail). Manual intervention is required in such case (recreating the
  4060. // cluster solves the problem too).
  4061. err := wait.Poll(time.Millisecond*100, time.Second*30, func() (bool, error) {
  4062. result, err := SSH(undropCmd, from, TestContext.Provider)
  4063. if result.Code == 0 && err == nil {
  4064. return true, nil
  4065. }
  4066. LogSSHResult(result)
  4067. if err != nil {
  4068. Logf("Unexpected error: %v", err)
  4069. }
  4070. return false, nil
  4071. })
  4072. if err != nil {
4073. Failf("Failed to remove the iptables REJECT rule. Manual intervention is "+
4074. "required on host %s: remove rule %s, if it exists", from, iptablesRule)
  4075. }
  4076. }
  4077. func isElementOf(podUID types.UID, pods *v1.PodList) bool {
  4078. for _, pod := range pods.Items {
  4079. if pod.UID == podUID {
  4080. return true
  4081. }
  4082. }
  4083. return false
  4084. }
  4085. // timeout for proxy requests.
  4086. const proxyTimeout = 2 * time.Minute
  4087. // NodeProxyRequest performs a get on a node proxy endpoint given the nodename and rest client.
  4088. func NodeProxyRequest(c clientset.Interface, node, endpoint string) (restclient.Result, error) {
  4089. // proxy tends to hang in some cases when Node is not ready. Add an artificial timeout for this call.
  4090. // This will leak a goroutine if proxy hangs. #22165
  4091. var result restclient.Result
  4092. finished := make(chan struct{})
  4093. go func() {
  4094. result = c.CoreV1().RESTClient().Get().
  4095. Resource("nodes").
  4096. SubResource("proxy").
  4097. Name(fmt.Sprintf("%v:%v", node, ports.KubeletPort)).
  4098. Suffix(endpoint).
  4099. Do()
  4100. finished <- struct{}{}
  4101. }()
  4102. select {
  4103. case <-finished:
  4104. return result, nil
  4105. case <-time.After(proxyTimeout):
4106. return restclient.Result{}, fmt.Errorf("proxy request to node %q timed out after %v", node, proxyTimeout)
  4107. }
  4108. }
  4109. // GetKubeletPods retrieves the list of pods on the kubelet
  4110. func GetKubeletPods(c clientset.Interface, node string) (*v1.PodList, error) {
  4111. return getKubeletPods(c, node, "pods")
  4112. }
4113. // GetKubeletRunningPods retrieves the list of running pods on the kubelet. The pods
4114. // include necessary information (e.g., UID, name, namespace for
4115. // pods/containers), but do not contain the full spec.
  4116. func GetKubeletRunningPods(c clientset.Interface, node string) (*v1.PodList, error) {
  4117. return getKubeletPods(c, node, "runningpods")
  4118. }
  4119. func getKubeletPods(c clientset.Interface, node, resource string) (*v1.PodList, error) {
  4120. result := &v1.PodList{}
  4121. client, err := NodeProxyRequest(c, node, resource)
  4122. if err != nil {
  4123. return &v1.PodList{}, err
  4124. }
  4125. if err = client.Into(result); err != nil {
  4126. return &v1.PodList{}, err
  4127. }
  4128. return result, nil
  4129. }
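// Example (illustrative sketch, not part of the original file): listing the pods a
// particular kubelet reports as running and logging their names. The helper name is
// a placeholder.
func logKubeletRunningPodsExample(c clientset.Interface, nodeName string) {
	pods, err := GetKubeletRunningPods(c, nodeName)
	if err != nil {
		Logf("Failed to get running pods from kubelet on %q: %v", nodeName, err)
		return
	}
	for _, pod := range pods.Items {
		Logf("kubelet %q reports pod %s/%s", nodeName, pod.Namespace, pod.Name)
	}
}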
  4130. // LaunchWebserverPod launches a pod serving http on port 8080 to act
4131. // as the target for networking connectivity checks. The ip:port of
4132. // the created pod will be returned if the pod is launched
4133. // successfully.
  4134. func LaunchWebserverPod(f *Framework, podName, nodeName string) (ip string) {
  4135. containerName := fmt.Sprintf("%s-container", podName)
  4136. port := 8080
  4137. pod := &v1.Pod{
  4138. ObjectMeta: metav1.ObjectMeta{
  4139. Name: podName,
  4140. },
  4141. Spec: v1.PodSpec{
  4142. Containers: []v1.Container{
  4143. {
  4144. Name: containerName,
  4145. Image: imageutils.GetE2EImage(imageutils.Porter),
  4146. Env: []v1.EnvVar{{Name: fmt.Sprintf("SERVE_PORT_%d", port), Value: "foo"}},
  4147. Ports: []v1.ContainerPort{{ContainerPort: int32(port)}},
  4148. },
  4149. },
  4150. NodeName: nodeName,
  4151. RestartPolicy: v1.RestartPolicyNever,
  4152. },
  4153. }
  4154. podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name)
  4155. _, err := podClient.Create(pod)
  4156. ExpectNoError(err)
  4157. ExpectNoError(f.WaitForPodRunning(podName))
  4158. createdPod, err := podClient.Get(podName, metav1.GetOptions{})
  4159. ExpectNoError(err)
  4160. ip = net.JoinHostPort(createdPod.Status.PodIP, strconv.Itoa(port))
  4161. Logf("Target pod IP:port is %s", ip)
  4162. return
  4163. }
  4164. type PingCommand string
  4165. const (
  4166. IPv4PingCommand PingCommand = "ping"
  4167. IPv6PingCommand PingCommand = "ping6"
  4168. )
  4169. // CheckConnectivityToHost launches a pod to test connectivity to the specified
  4170. // host. An error will be returned if the host is not reachable from the pod.
  4171. //
4172. // An empty nodeName will let the scheduler choose where the pod is executed.
  4173. func CheckConnectivityToHost(f *Framework, nodeName, podName, host string, pingCmd PingCommand, timeout int) error {
  4174. contName := fmt.Sprintf("%s-container", podName)
  4175. command := []string{
  4176. string(pingCmd),
  4177. "-c", "3", // send 3 pings
  4178. "-W", "2", // wait at most 2 seconds for a reply
  4179. "-w", strconv.Itoa(timeout),
  4180. host,
  4181. }
  4182. pod := &v1.Pod{
  4183. ObjectMeta: metav1.ObjectMeta{
  4184. Name: podName,
  4185. },
  4186. Spec: v1.PodSpec{
  4187. Containers: []v1.Container{
  4188. {
  4189. Name: contName,
  4190. Image: BusyBoxImage,
  4191. Command: command,
  4192. },
  4193. },
  4194. NodeName: nodeName,
  4195. RestartPolicy: v1.RestartPolicyNever,
  4196. },
  4197. }
  4198. podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name)
  4199. _, err := podClient.Create(pod)
  4200. if err != nil {
  4201. return err
  4202. }
  4203. err = WaitForPodSuccessInNamespace(f.ClientSet, podName, f.Namespace.Name)
  4204. if err != nil {
  4205. logs, logErr := GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, contName)
  4206. if logErr != nil {
  4207. Logf("Warning: Failed to get logs from pod %q: %v", pod.Name, logErr)
  4208. } else {
  4209. Logf("pod %s/%s logs:\n%s", f.Namespace.Name, pod.Name, logs)
  4210. }
  4211. }
  4212. return err
  4213. }
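// Example (illustrative sketch, not part of the original file): verifying that a pod
// scheduled anywhere in the cluster can reach an external IPv4 address. The pod name,
// target address and 30-second ping deadline are placeholder values.
func checkExternalConnectivityExample(f *Framework) error {
	return CheckConnectivityToHost(f, "", "connectivity-test", "8.8.8.8", IPv4PingCommand, 30)
}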
  4214. // CoreDump SSHs to the master and all nodes and dumps their logs into dir.
  4215. // It shells out to cluster/log-dump/log-dump.sh to accomplish this.
  4216. func CoreDump(dir string) {
  4217. if TestContext.DisableLogDump {
  4218. Logf("Skipping dumping logs from cluster")
  4219. return
  4220. }
  4221. var cmd *exec.Cmd
  4222. if TestContext.LogexporterGCSPath != "" {
  4223. Logf("Dumping logs from nodes to GCS directly at path: %s", TestContext.LogexporterGCSPath)
  4224. cmd = exec.Command(path.Join(TestContext.RepoRoot, "cluster", "log-dump", "log-dump.sh"), dir, TestContext.LogexporterGCSPath)
  4225. } else {
  4226. Logf("Dumping logs locally to: %s", dir)
  4227. cmd = exec.Command(path.Join(TestContext.RepoRoot, "cluster", "log-dump", "log-dump.sh"), dir)
  4228. }
  4229. cmd.Env = append(os.Environ(), fmt.Sprintf("LOG_DUMP_SYSTEMD_SERVICES=%s", parseSystemdServices(TestContext.SystemdServices)))
  4230. cmd.Stdout = os.Stdout
  4231. cmd.Stderr = os.Stderr
  4232. if err := cmd.Run(); err != nil {
  4233. Logf("Error running cluster/log-dump/log-dump.sh: %v", err)
  4234. }
  4235. }
  4236. // parseSystemdServices converts services separator from comma to space.
  4237. func parseSystemdServices(services string) string {
  4238. return strings.TrimSpace(strings.Replace(services, ",", " ", -1))
  4239. }
  4240. func UpdatePodWithRetries(client clientset.Interface, ns, name string, update func(*v1.Pod)) (*v1.Pod, error) {
  4241. for i := 0; i < 3; i++ {
  4242. pod, err := client.CoreV1().Pods(ns).Get(name, metav1.GetOptions{})
  4243. if err != nil {
  4244. return nil, fmt.Errorf("Failed to get pod %q: %v", name, err)
  4245. }
  4246. update(pod)
  4247. pod, err = client.CoreV1().Pods(ns).Update(pod)
  4248. if err == nil {
  4249. return pod, nil
  4250. }
  4251. if !apierrs.IsConflict(err) && !apierrs.IsServerTimeout(err) {
  4252. return nil, fmt.Errorf("Failed to update pod %q: %v", name, err)
  4253. }
  4254. }
  4255. return nil, fmt.Errorf("Too many retries updating Pod %q", name)
  4256. }
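// Example (illustrative sketch, not part of the original file): adding a label to a
// pod while tolerating update conflicts. The label key and value are placeholders.
func labelPodExample(c clientset.Interface, ns, name string) (*v1.Pod, error) {
	return UpdatePodWithRetries(c, ns, name, func(pod *v1.Pod) {
		if pod.Labels == nil {
			pod.Labels = map[string]string{}
		}
		pod.Labels["e2e-example"] = "true"
	})
}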
  4257. func GetPodsInNamespace(c clientset.Interface, ns string, ignoreLabels map[string]string) ([]*v1.Pod, error) {
  4258. pods, err := c.CoreV1().Pods(ns).List(metav1.ListOptions{})
  4259. if err != nil {
  4260. return []*v1.Pod{}, err
  4261. }
  4262. ignoreSelector := labels.SelectorFromSet(ignoreLabels)
  4263. filtered := []*v1.Pod{}
4264. for i := range pods.Items {
4265. if len(ignoreLabels) != 0 && ignoreSelector.Matches(labels.Set(pods.Items[i].Labels)) {
4266. continue
4267. }
4268. filtered = append(filtered, &pods.Items[i])
4269. }
  4270. return filtered, nil
  4271. }
  4272. // RunCmd runs cmd using args and returns its stdout and stderr. It also outputs
  4273. // cmd's stdout and stderr to their respective OS streams.
  4274. func RunCmd(command string, args ...string) (string, string, error) {
  4275. return RunCmdEnv(nil, command, args...)
  4276. }
  4277. // RunCmdEnv runs cmd with the provided environment and args and
  4278. // returns its stdout and stderr. It also outputs cmd's stdout and
  4279. // stderr to their respective OS streams.
  4280. func RunCmdEnv(env []string, command string, args ...string) (string, string, error) {
  4281. Logf("Running %s %v", command, args)
  4282. var bout, berr bytes.Buffer
  4283. cmd := exec.Command(command, args...)
  4284. // We also output to the OS stdout/stderr to aid in debugging in case cmd
  4285. // hangs and never returns before the test gets killed.
  4286. //
  4287. // This creates some ugly output because gcloud doesn't always provide
  4288. // newlines.
  4289. cmd.Stdout = io.MultiWriter(os.Stdout, &bout)
  4290. cmd.Stderr = io.MultiWriter(os.Stderr, &berr)
  4291. cmd.Env = env
  4292. err := cmd.Run()
  4293. stdout, stderr := bout.String(), berr.String()
  4294. if err != nil {
  4295. return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q",
  4296. command, args, err, stdout, stderr)
  4297. }
  4298. return stdout, stderr, nil
  4299. }
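// Example (illustrative sketch, not part of the original file): running an external
// command with one extra environment variable while still inheriting the test
// process environment. The variable name is a placeholder.
func runWithEnvExample() error {
	env := append(os.Environ(), "KUBE_E2E_EXAMPLE=1")
	_, _, err := RunCmdEnv(env, "kubectl", "version", "--client")
	return err
}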
  4300. // retryCmd runs cmd using args and retries it for up to SingleCallTimeout if
  4301. // it returns an error. It returns stdout and stderr.
  4302. func retryCmd(command string, args ...string) (string, string, error) {
  4303. var err error
  4304. stdout, stderr := "", ""
  4305. wait.Poll(Poll, SingleCallTimeout, func() (bool, error) {
  4306. stdout, stderr, err = RunCmd(command, args...)
  4307. if err != nil {
  4308. Logf("Got %v", err)
  4309. return false, nil
  4310. }
  4311. return true, nil
  4312. })
  4313. return stdout, stderr, err
  4314. }
4315. // GetPodsScheduled returns the currently scheduled and not-scheduled Pods, excluding those on master nodes.
  4316. func GetPodsScheduled(masterNodes sets.String, pods *v1.PodList) (scheduledPods, notScheduledPods []v1.Pod) {
  4317. for _, pod := range pods.Items {
  4318. if !masterNodes.Has(pod.Spec.NodeName) {
  4319. if pod.Spec.NodeName != "" {
  4320. _, scheduledCondition := podutil.GetPodCondition(&pod.Status, v1.PodScheduled)
  4321. Expect(scheduledCondition != nil).To(Equal(true))
  4322. Expect(scheduledCondition.Status).To(Equal(v1.ConditionTrue))
  4323. scheduledPods = append(scheduledPods, pod)
  4324. } else {
  4325. _, scheduledCondition := podutil.GetPodCondition(&pod.Status, v1.PodScheduled)
  4326. Expect(scheduledCondition != nil).To(Equal(true))
  4327. Expect(scheduledCondition.Status).To(Equal(v1.ConditionFalse))
  4328. if scheduledCondition.Reason == "Unschedulable" {
  4329. notScheduledPods = append(notScheduledPods, pod)
  4330. }
  4331. }
  4332. }
  4333. }
  4334. return
  4335. }
4336. // WaitForStableCluster waits until all existing pods are scheduled and returns their number.
  4337. func WaitForStableCluster(c clientset.Interface, masterNodes sets.String) int {
  4338. timeout := 10 * time.Minute
  4339. startTime := time.Now()
  4340. allPods, err := c.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{})
  4341. ExpectNoError(err)
4342. // The API server also returns Pods that have already succeeded or failed. We need to filter them out.
  4343. currentPods := make([]v1.Pod, 0, len(allPods.Items))
  4344. for _, pod := range allPods.Items {
  4345. if pod.Status.Phase != v1.PodSucceeded && pod.Status.Phase != v1.PodFailed {
  4346. currentPods = append(currentPods, pod)
  4347. }
  4348. }
  4349. allPods.Items = currentPods
  4350. scheduledPods, currentlyNotScheduledPods := GetPodsScheduled(masterNodes, allPods)
  4351. for len(currentlyNotScheduledPods) != 0 {
  4352. time.Sleep(2 * time.Second)
  4353. allPods, err := c.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{})
  4354. ExpectNoError(err)
  4355. scheduledPods, currentlyNotScheduledPods = GetPodsScheduled(masterNodes, allPods)
  4356. if startTime.Add(timeout).Before(time.Now()) {
  4357. Failf("Timed out after %v waiting for stable cluster.", timeout)
  4358. break
  4359. }
  4360. }
  4361. return len(scheduledPods)
  4362. }
4363. // GetMasterAndWorkerNodesOrDie returns the set of master node names and a list of schedulable worker nodes.
  4364. func GetMasterAndWorkerNodesOrDie(c clientset.Interface) (sets.String, *v1.NodeList) {
  4365. nodes := &v1.NodeList{}
  4366. masters := sets.NewString()
  4367. all, _ := c.CoreV1().Nodes().List(metav1.ListOptions{})
  4368. for _, n := range all.Items {
  4369. if system.IsMasterNode(n.Name) {
  4370. masters.Insert(n.Name)
  4371. } else if isNodeSchedulable(&n) && isNodeUntainted(&n) {
  4372. nodes.Items = append(nodes.Items, n)
  4373. }
  4374. }
  4375. return masters, nodes
  4376. }
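// Example (illustrative sketch, not part of the original file): waiting for every
// pre-existing pod to be scheduled before starting a scheduling-sensitive test.
func waitForStableClusterExample(c clientset.Interface) int {
	masters, _ := GetMasterAndWorkerNodesOrDie(c)
	return WaitForStableCluster(c, masters)
}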
  4377. func ListNamespaceEvents(c clientset.Interface, ns string) error {
  4378. ls, err := c.CoreV1().Events(ns).List(metav1.ListOptions{})
  4379. if err != nil {
  4380. return err
  4381. }
  4382. for _, event := range ls.Items {
  4383. glog.Infof("Event(%#v): type: '%v' reason: '%v' %v", event.InvolvedObject, event.Type, event.Reason, event.Message)
  4384. }
  4385. return nil
  4386. }
  4387. // E2ETestNodePreparer implements testutils.TestNodePreparer interface, which is used
  4388. // to create/modify Nodes before running a test.
  4389. type E2ETestNodePreparer struct {
  4390. client clientset.Interface
4391. // Specifies how many nodes should be modified using the given strategy.
4392. // Only one strategy can be applied to a single Node, so there needs to
4393. // be at least <sum of counts> Nodes in the cluster.
  4394. countToStrategy []testutils.CountToStrategy
  4395. nodeToAppliedStrategy map[string]testutils.PrepareNodeStrategy
  4396. }
  4397. func NewE2ETestNodePreparer(client clientset.Interface, countToStrategy []testutils.CountToStrategy) testutils.TestNodePreparer {
  4398. return &E2ETestNodePreparer{
  4399. client: client,
  4400. countToStrategy: countToStrategy,
  4401. nodeToAppliedStrategy: make(map[string]testutils.PrepareNodeStrategy),
  4402. }
  4403. }
  4404. func (p *E2ETestNodePreparer) PrepareNodes() error {
  4405. nodes := GetReadySchedulableNodesOrDie(p.client)
4406. numTemplates := 0
4407. for _, v := range p.countToStrategy {
4408. numTemplates += v.Count
4409. }
  4410. if numTemplates > len(nodes.Items) {
  4411. return fmt.Errorf("Can't prepare Nodes. Got more templates than existing Nodes.")
  4412. }
  4413. index := 0
  4414. sum := 0
  4415. for _, v := range p.countToStrategy {
  4416. sum += v.Count
  4417. for ; index < sum; index++ {
  4418. if err := testutils.DoPrepareNode(p.client, &nodes.Items[index], v.Strategy); err != nil {
  4419. glog.Errorf("Aborting node preparation: %v", err)
  4420. return err
  4421. }
  4422. p.nodeToAppliedStrategy[nodes.Items[index].Name] = v.Strategy
  4423. }
  4424. }
  4425. return nil
  4426. }
  4427. func (p *E2ETestNodePreparer) CleanupNodes() error {
  4428. var encounteredError error
  4429. nodes := GetReadySchedulableNodesOrDie(p.client)
  4430. for i := range nodes.Items {
  4431. var err error
  4432. name := nodes.Items[i].Name
  4433. strategy, found := p.nodeToAppliedStrategy[name]
  4434. if found {
  4435. if err = testutils.DoCleanupNode(p.client, name, strategy); err != nil {
  4436. glog.Errorf("Skipping cleanup of Node: failed update of %v: %v", name, err)
  4437. encounteredError = err
  4438. }
  4439. }
  4440. }
  4441. return encounteredError
  4442. }
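// Example (illustrative sketch, not part of the original file): preparing a few nodes
// before a test and cleaning them up afterwards. TrivialNodePrepareStrategy is
// assumed to be available from the imported test utils package; any
// PrepareNodeStrategy would work here.
func prepareNodesExample(c clientset.Interface) error {
	preparer := NewE2ETestNodePreparer(c, []testutils.CountToStrategy{
		{Count: 3, Strategy: &testutils.TrivialNodePrepareStrategy{}},
	})
	if err := preparer.PrepareNodes(); err != nil {
		return err
	}
	defer preparer.CleanupNodes()
	// Run the test against the prepared nodes here.
	return nil
}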
  4443. func GetClusterID(c clientset.Interface) (string, error) {
  4444. cm, err := c.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(gcecloud.UIDConfigMapName, metav1.GetOptions{})
  4445. if err != nil || cm == nil {
  4446. return "", fmt.Errorf("error getting cluster ID: %v", err)
  4447. }
  4448. clusterID, clusterIDExists := cm.Data[gcecloud.UIDCluster]
  4449. providerID, providerIDExists := cm.Data[gcecloud.UIDProvider]
  4450. if !clusterIDExists {
  4451. return "", fmt.Errorf("cluster ID not set")
  4452. }
  4453. if providerIDExists {
  4454. return providerID, nil
  4455. }
  4456. return clusterID, nil
  4457. }
  4458. // CleanupGCEResources cleans up GCE Service Type=LoadBalancer resources with
  4459. // the given name. The name is usually the UUID of the Service prefixed with an
  4460. // alpha-numeric character ('a') to work around cloudprovider rules.
  4461. func CleanupGCEResources(c clientset.Interface, loadBalancerName, region, zone string) (retErr error) {
  4462. gceCloud, err := GetGCECloud()
  4463. if err != nil {
  4464. return err
  4465. }
  4466. if region == "" {
  4467. // Attempt to parse region from zone if no region is given.
  4468. region, err = gcecloud.GetGCERegion(zone)
  4469. if err != nil {
  4470. return fmt.Errorf("error parsing GCE/GKE region from zone %q: %v", zone, err)
  4471. }
  4472. }
  4473. if err := gceCloud.DeleteFirewall(gcecloud.MakeFirewallName(loadBalancerName)); err != nil &&
  4474. !IsGoogleAPIHTTPErrorCode(err, http.StatusNotFound) {
  4475. retErr = err
  4476. }
  4477. if err := gceCloud.DeleteRegionForwardingRule(loadBalancerName, region); err != nil &&
  4478. !IsGoogleAPIHTTPErrorCode(err, http.StatusNotFound) {
  4479. retErr = fmt.Errorf("%v\n%v", retErr, err)
  4480. }
  4481. if err := gceCloud.DeleteRegionAddress(loadBalancerName, region); err != nil &&
  4482. !IsGoogleAPIHTTPErrorCode(err, http.StatusNotFound) {
  4483. retErr = fmt.Errorf("%v\n%v", retErr, err)
  4484. }
  4485. clusterID, err := GetClusterID(c)
  4486. if err != nil {
  4487. retErr = fmt.Errorf("%v\n%v", retErr, err)
  4488. return
  4489. }
  4490. hcNames := []string{gcecloud.MakeNodesHealthCheckName(clusterID)}
  4491. hc, getErr := gceCloud.GetHttpHealthCheck(loadBalancerName)
  4492. if getErr != nil && !IsGoogleAPIHTTPErrorCode(getErr, http.StatusNotFound) {
  4493. retErr = fmt.Errorf("%v\n%v", retErr, getErr)
  4494. return
  4495. }
  4496. if hc != nil {
  4497. hcNames = append(hcNames, hc.Name)
  4498. }
  4499. if err := gceCloud.DeleteExternalTargetPoolAndChecks(&v1.Service{}, loadBalancerName, region, clusterID, hcNames...); err != nil &&
  4500. !IsGoogleAPIHTTPErrorCode(err, http.StatusNotFound) {
  4501. retErr = fmt.Errorf("%v\n%v", retErr, err)
  4502. }
  4503. return
  4504. }
4505. // IsGoogleAPIHTTPErrorCode returns true if the error is a Google API
4506. // error with the given HTTP status code.
  4507. func IsGoogleAPIHTTPErrorCode(err error, code int) bool {
  4508. apiErr, ok := err.(*googleapi.Error)
  4509. return ok && apiErr.Code == code
  4510. }
  4511. // getMaster populates the externalIP, internalIP and hostname fields of the master.
  4512. // If any of these is unavailable, it is set to "".
  4513. func getMaster(c clientset.Interface) Address {
  4514. master := Address{}
  4515. // Populate the internal IP.
  4516. eps, err :=