PageRenderTime 52ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/test/e2e/daemon_restart.go

https://gitlab.com/CORP-RESELLER/kubernetes
Go | 320 lines | 235 code | 35 blank | 50 comment | 27 complexity | 2928c375141b0adb08b47282db972049 MD5 | raw file
  1. /*
  2. Copyright 2015 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package e2e
  14. import (
  15. "fmt"
  16. "strconv"
  17. "time"
  18. "k8s.io/kubernetes/pkg/api"
  19. "k8s.io/kubernetes/pkg/client/cache"
  20. client "k8s.io/kubernetes/pkg/client/unversioned"
  21. controllerframework "k8s.io/kubernetes/pkg/controller/framework"
  22. "k8s.io/kubernetes/pkg/labels"
  23. "k8s.io/kubernetes/pkg/master/ports"
  24. "k8s.io/kubernetes/pkg/runtime"
  25. "k8s.io/kubernetes/pkg/util/sets"
  26. "k8s.io/kubernetes/pkg/util/uuid"
  27. "k8s.io/kubernetes/pkg/util/wait"
  28. "k8s.io/kubernetes/pkg/watch"
  29. "k8s.io/kubernetes/test/e2e/framework"
  30. . "github.com/onsi/ginkgo"
  31. . "github.com/onsi/gomega"
  32. )
  33. // This test primarily checks 2 things:
  34. // 1. Daemons restart automatically within some sane time (10m).
  35. // 2. They don't take abnormal actions when restarted in the steady state.
  36. // - Controller manager shouldn't overshoot replicas
  37. // - Kubelet shouldn't restart containers
  38. // - Scheduler should continue assigning hosts to new pods
const (
	// restartPollInterval is the poll interval handed to NewRestartConfig by the tests;
	// it spaces out the /healthz checks performed in waitUp.
	restartPollInterval = 5 * time.Second
	// restartTimeout is the poll timeout handed to NewRestartConfig by the tests;
	// it bounds how long waitUp waits for a daemon to report healthy.
	restartTimeout = 10 * time.Minute
	// numPods is the replica count of the replication controller created for each test.
	numPods = 10
	// sshPort is the port nodeExec connects to when running commands on a node.
	sshPort = 22

	// Event type labels passed to podTracker.remember by the informer handlers.
	ADD    = "ADD"
	DEL    = "DEL"
	UPDATE = "UPDATE"
)
  48. // nodeExec execs the given cmd on node via SSH. Note that the nodeName is an sshable name,
  49. // eg: the name returned by framework.GetMasterHost(). This is also not guaranteed to work across
  50. // cloud providers since it involves ssh.
  51. func nodeExec(nodeName, cmd string) (framework.SSHResult, error) {
  52. result, err := framework.SSH(cmd, fmt.Sprintf("%v:%v", nodeName, sshPort), framework.TestContext.Provider)
  53. Expect(err).NotTo(HaveOccurred())
  54. return result, err
  55. }
// restartDaemonConfig is a config to restart a running daemon on a node, and wait till
// it comes back up. It uses ssh to send a SIGTERM to the daemon.
type restartDaemonConfig struct {
	// nodeName is the sshable name of the node the daemon runs on (passed to nodeExec).
	nodeName string
	// daemonName is the process name pattern given to pgrep when killing the daemon.
	daemonName string
	// healthzPort is the localhost port whose /healthz endpoint waitUp polls.
	healthzPort int
	// pollInterval is the interval between /healthz polls in waitUp.
	pollInterval time.Duration
	// pollTimeout is the overall time waitUp keeps polling before failing.
	pollTimeout time.Duration
}
  65. // NewRestartConfig creates a restartDaemonConfig for the given node and daemon.
  66. func NewRestartConfig(nodeName, daemonName string, healthzPort int, pollInterval, pollTimeout time.Duration) *restartDaemonConfig {
  67. if !framework.ProviderIs("gce") {
  68. framework.Logf("WARNING: SSH through the restart config might not work on %s", framework.TestContext.Provider)
  69. }
  70. return &restartDaemonConfig{
  71. nodeName: nodeName,
  72. daemonName: daemonName,
  73. healthzPort: healthzPort,
  74. pollInterval: pollInterval,
  75. pollTimeout: pollTimeout,
  76. }
  77. }
  78. func (r *restartDaemonConfig) String() string {
  79. return fmt.Sprintf("Daemon %v on node %v", r.daemonName, r.nodeName)
  80. }
  81. // waitUp polls healthz of the daemon till it returns "ok" or the polling hits the pollTimeout
  82. func (r *restartDaemonConfig) waitUp() {
  83. framework.Logf("Checking if %v is up by polling for a 200 on its /healthz endpoint", r)
  84. healthzCheck := fmt.Sprintf(
  85. "curl -s -o /dev/null -I -w \"%%{http_code}\" http://localhost:%v/healthz", r.healthzPort)
  86. err := wait.Poll(r.pollInterval, r.pollTimeout, func() (bool, error) {
  87. result, err := nodeExec(r.nodeName, healthzCheck)
  88. framework.ExpectNoError(err)
  89. if result.Code == 0 {
  90. httpCode, err := strconv.Atoi(result.Stdout)
  91. if err != nil {
  92. framework.Logf("Unable to parse healthz http return code: %v", err)
  93. } else if httpCode == 200 {
  94. return true, nil
  95. }
  96. }
  97. framework.Logf("node %v exec command, '%v' failed with exitcode %v: \n\tstdout: %v\n\tstderr: %v",
  98. r.nodeName, healthzCheck, result.Code, result.Stdout, result.Stderr)
  99. return false, nil
  100. })
  101. framework.ExpectNoError(err, "%v did not respond with a 200 via %v within %v", r, healthzCheck, r.pollTimeout)
  102. }
  103. // kill sends a SIGTERM to the daemon
  104. func (r *restartDaemonConfig) kill() {
  105. framework.Logf("Killing %v", r)
  106. nodeExec(r.nodeName, fmt.Sprintf("pgrep %v | xargs -I {} sudo kill {}", r.daemonName))
  107. }
// restart checks if the daemon is up, kills it, and waits till it comes back up.
func (r *restartDaemonConfig) restart() {
	// Wait for a healthy daemon before killing it, then wait for it to report
	// healthy again after the kill.
	r.waitUp()
	r.kill()
	r.waitUp()
}
// podTracker records a serial history of events that might have affected pods.
// Entries are stored in the embedded ThreadSafeStore, keyed by a string built
// from the event time, event type and pod name (see remember).
type podTracker struct {
	cache.ThreadSafeStore
}
  118. func (p *podTracker) remember(pod *api.Pod, eventType string) {
  119. if eventType == UPDATE && pod.Status.Phase == api.PodRunning {
  120. return
  121. }
  122. p.Add(fmt.Sprintf("[%v] %v: %v", time.Now(), eventType, pod.Name), pod)
  123. }
  124. func (p *podTracker) String() (msg string) {
  125. for _, k := range p.ListKeys() {
  126. obj, exists := p.Get(k)
  127. if !exists {
  128. continue
  129. }
  130. pod := obj.(*api.Pod)
  131. msg += fmt.Sprintf("%v Phase %v Host %v\n", k, pod.Status.Phase, pod.Spec.NodeName)
  132. }
  133. return
  134. }
  135. func newPodTracker() *podTracker {
  136. return &podTracker{cache.NewThreadSafeStore(
  137. cache.Indexers{}, cache.Indices{})}
  138. }
  139. // replacePods replaces content of the store with the given pods.
  140. func replacePods(pods []*api.Pod, store cache.Store) {
  141. found := make([]interface{}, 0, len(pods))
  142. for i := range pods {
  143. found = append(found, pods[i])
  144. }
  145. framework.ExpectNoError(store.Replace(found, "0"))
  146. }
  147. // getContainerRestarts returns the count of container restarts across all pods matching the given labelSelector,
  148. // and a list of nodenames across which these containers restarted.
  149. func getContainerRestarts(c *client.Client, ns string, labelSelector labels.Selector) (int, []string) {
  150. options := api.ListOptions{LabelSelector: labelSelector}
  151. pods, err := c.Pods(ns).List(options)
  152. framework.ExpectNoError(err)
  153. failedContainers := 0
  154. containerRestartNodes := sets.NewString()
  155. for _, p := range pods.Items {
  156. for _, v := range framework.FailedContainers(&p) {
  157. failedContainers = failedContainers + v.Restarts
  158. containerRestartNodes.Insert(p.Spec.NodeName)
  159. }
  160. }
  161. return failedContainers, containerRestartNodes.List()
  162. }
  163. var _ = framework.KubeDescribe("DaemonRestart [Disruptive]", func() {
  164. f := framework.NewDefaultFramework("daemonrestart")
  165. rcName := "daemonrestart" + strconv.Itoa(numPods) + "-" + string(uuid.NewUUID())
  166. labelSelector := labels.Set(map[string]string{"name": rcName}).AsSelector()
  167. existingPods := cache.NewStore(cache.MetaNamespaceKeyFunc)
  168. var ns string
  169. var config framework.RCConfig
  170. var controller *controllerframework.Controller
  171. var newPods cache.Store
  172. var stopCh chan struct{}
  173. var tracker *podTracker
  174. BeforeEach(func() {
  175. // These tests require SSH
  176. framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
  177. ns = f.Namespace.Name
  178. // All the restart tests need an rc and a watch on pods of the rc.
  179. // Additionally some of them might scale the rc during the test.
  180. config = framework.RCConfig{
  181. Client: f.Client,
  182. Name: rcName,
  183. Namespace: ns,
  184. Image: framework.GetPauseImageName(f.Client),
  185. Replicas: numPods,
  186. CreatedPods: &[]*api.Pod{},
  187. }
  188. Expect(framework.RunRC(config)).NotTo(HaveOccurred())
  189. replacePods(*config.CreatedPods, existingPods)
  190. stopCh = make(chan struct{})
  191. tracker = newPodTracker()
  192. newPods, controller = controllerframework.NewInformer(
  193. &cache.ListWatch{
  194. ListFunc: func(options api.ListOptions) (runtime.Object, error) {
  195. options.LabelSelector = labelSelector
  196. return f.Client.Pods(ns).List(options)
  197. },
  198. WatchFunc: func(options api.ListOptions) (watch.Interface, error) {
  199. options.LabelSelector = labelSelector
  200. return f.Client.Pods(ns).Watch(options)
  201. },
  202. },
  203. &api.Pod{},
  204. 0,
  205. controllerframework.ResourceEventHandlerFuncs{
  206. AddFunc: func(obj interface{}) {
  207. tracker.remember(obj.(*api.Pod), ADD)
  208. },
  209. UpdateFunc: func(oldObj, newObj interface{}) {
  210. tracker.remember(newObj.(*api.Pod), UPDATE)
  211. },
  212. DeleteFunc: func(obj interface{}) {
  213. tracker.remember(obj.(*api.Pod), DEL)
  214. },
  215. },
  216. )
  217. go controller.Run(stopCh)
  218. })
  219. AfterEach(func() {
  220. close(stopCh)
  221. })
  222. It("Controller Manager should not create/delete replicas across restart", func() {
  223. // Requires master ssh access.
  224. framework.SkipUnlessProviderIs("gce", "aws")
  225. restarter := NewRestartConfig(
  226. framework.GetMasterHost(), "kube-controller", ports.ControllerManagerPort, restartPollInterval, restartTimeout)
  227. restarter.restart()
  228. // The intent is to ensure the replication controller manager has observed and reported status of
  229. // the replication controller at least once since the manager restarted, so that we can determine
  230. // that it had the opportunity to create/delete pods, if it were going to do so. Scaling the RC
  231. // to the same size achieves this, because the scale operation advances the RC's sequence number
  232. // and awaits it to be observed and reported back in the RC's status.
  233. framework.ScaleRC(f.Client, ns, rcName, numPods, true)
  234. // Only check the keys, the pods can be different if the kubelet updated it.
  235. // TODO: Can it really?
  236. existingKeys := sets.NewString()
  237. newKeys := sets.NewString()
  238. for _, k := range existingPods.ListKeys() {
  239. existingKeys.Insert(k)
  240. }
  241. for _, k := range newPods.ListKeys() {
  242. newKeys.Insert(k)
  243. }
  244. if len(newKeys.List()) != len(existingKeys.List()) ||
  245. !newKeys.IsSuperset(existingKeys) {
  246. framework.Failf("RcManager created/deleted pods after restart \n\n %+v", tracker)
  247. }
  248. })
  249. It("Scheduler should continue assigning pods to nodes across restart", func() {
  250. // Requires master ssh access.
  251. framework.SkipUnlessProviderIs("gce", "aws")
  252. restarter := NewRestartConfig(
  253. framework.GetMasterHost(), "kube-scheduler", ports.SchedulerPort, restartPollInterval, restartTimeout)
  254. // Create pods while the scheduler is down and make sure the scheduler picks them up by
  255. // scaling the rc to the same size.
  256. restarter.waitUp()
  257. restarter.kill()
  258. // This is best effort to try and create pods while the scheduler is down,
  259. // since we don't know exactly when it is restarted after the kill signal.
  260. framework.ExpectNoError(framework.ScaleRC(f.Client, ns, rcName, numPods+5, false))
  261. restarter.waitUp()
  262. framework.ExpectNoError(framework.ScaleRC(f.Client, ns, rcName, numPods+5, true))
  263. })
  264. It("Kubelet should not restart containers across restart", func() {
  265. nodeIPs, err := getNodePublicIps(f.Client)
  266. framework.ExpectNoError(err)
  267. preRestarts, badNodes := getContainerRestarts(f.Client, ns, labelSelector)
  268. if preRestarts != 0 {
  269. framework.Logf("WARNING: Non-zero container restart count: %d across nodes %v", preRestarts, badNodes)
  270. }
  271. for _, ip := range nodeIPs {
  272. restarter := NewRestartConfig(
  273. ip, "kubelet", ports.KubeletReadOnlyPort, restartPollInterval, restartTimeout)
  274. restarter.restart()
  275. }
  276. postRestarts, badNodes := getContainerRestarts(f.Client, ns, labelSelector)
  277. if postRestarts != preRestarts {
  278. framework.DumpNodeDebugInfo(f.Client, badNodes)
  279. framework.Failf("Net container restart count went from %v -> %v after kubelet restart on nodes %v \n\n %+v", preRestarts, postRestarts, badNodes, tracker)
  280. }
  281. })
  282. })