
/vendor/src/github.com/docker/swarmkit/manager/scheduler/scheduler.go

https://gitlab.com/vectorci/docker-1

package scheduler

import (
	"container/heap"
	"container/list"
	"time"

	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/state"
	"github.com/docker/swarmkit/manager/state/store"
	"github.com/docker/swarmkit/protobuf/ptypes"
	"golang.org/x/net/context"
)

// schedulingDecision pairs the old version of a task with the new,
// node-assigned version produced by the scheduler.
type schedulingDecision struct {
	old *api.Task
	new *api.Task
}

// Scheduler assigns tasks to nodes.
type Scheduler struct {
	store           *store.MemoryStore
	unassignedTasks *list.List
	// preassignedTasks already have NodeID, need resource validation
	preassignedTasks map[string]*api.Task
	nodeHeap         nodeHeap
	allTasks         map[string]*api.Task
	pipeline         *Pipeline

	// stopChan signals to the state machine to stop running
	stopChan chan struct{}
	// doneChan is closed when the state machine terminates
	doneChan chan struct{}

	// This currently exists only for benchmarking. It tells the scheduler
	// to scan the whole heap instead of blindly taking the minimum-valued
	// node.
	scanAllNodes bool
}

// New creates a new scheduler.
func New(store *store.MemoryStore) *Scheduler {
	return &Scheduler{
		store:            store,
		unassignedTasks:  list.New(),
		preassignedTasks: make(map[string]*api.Task),
		allTasks:         make(map[string]*api.Task),
		stopChan:         make(chan struct{}),
		doneChan:         make(chan struct{}),
		pipeline:         NewPipeline(),
	}
}
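
// Usage sketch (illustrative, not part of the original file): a manager
// would typically construct the scheduler around its memory store, run the
// event loop on its own goroutine, and stop it during shutdown. The
// variables `ctx` and `memStore` are assumed to exist in the caller.
//
//	sched := scheduler.New(memStore)
//	go func() {
//		if err := sched.Run(ctx); err != nil {
//			log.G(ctx).WithError(err).Error("scheduler stopped with error")
//		}
//	}()
//	// ... later, during manager shutdown:
//	sched.Stop() // closes stopChan and blocks until doneChan is closed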

// setupTasksList loads all tasks from the store and primes the scheduler's
// in-memory state: unassigned tasks are queued, preassigned tasks are set
// aside for resource validation, and the node heap is built from the rest.
func (s *Scheduler) setupTasksList(tx store.ReadTx) error {
	tasks, err := store.FindTasks(tx, store.All)
	if err != nil {
		return err
	}

	tasksByNode := make(map[string]map[string]*api.Task)
	for _, t := range tasks {
		// Ignore all tasks that have not reached ALLOCATED
		// state and tasks that no longer consume resources.
		if t.Status.State < api.TaskStateAllocated || t.Status.State > api.TaskStateRunning {
			continue
		}

		s.allTasks[t.ID] = t
		if t.NodeID == "" {
			s.enqueue(t)
			continue
		}
		// Preassigned tasks need their resource requirements validated
		// against the corresponding node.
		if t.Status.State == api.TaskStateAllocated {
			s.preassignedTasks[t.ID] = t
			continue
		}

		if tasksByNode[t.NodeID] == nil {
			tasksByNode[t.NodeID] = make(map[string]*api.Task)
		}
		tasksByNode[t.NodeID][t.ID] = t
	}

	return s.buildNodeHeap(tx, tasksByNode)
}

// Run is the scheduler event loop.
func (s *Scheduler) Run(ctx context.Context) error {
	defer close(s.doneChan)

	updates, cancel, err := store.ViewAndWatch(s.store, s.setupTasksList)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("snapshot store update failed")
		return err
	}
	defer cancel()

	// Validate resources for preassigned tasks before anything else;
	// preassigned tasks, such as those belonging to global services,
	// should start before other tasks.
	s.processPreassignedTasks(ctx)

	// Queue all unassigned tasks before processing changes.
	s.tick(ctx)

	const (
		// commitDebounceGap is the amount of time to wait between
		// commit events to debounce them.
		commitDebounceGap = 50 * time.Millisecond
		// maxLatency is a time limit on the debouncing.
		maxLatency = time.Second
	)
	var (
		debouncingStarted     time.Time
		commitDebounceTimer   *time.Timer
		commitDebounceTimeout <-chan time.Time
	)

	pendingChanges := 0

	schedule := func() {
		if len(s.preassignedTasks) > 0 {
			s.processPreassignedTasks(ctx)
		}
		if pendingChanges > 0 {
			s.tick(ctx)
			pendingChanges = 0
		}
	}

	// Watch for changes.
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case state.EventCreateTask:
				pendingChanges += s.createTask(ctx, v.Task)
			case state.EventUpdateTask:
				pendingChanges += s.updateTask(ctx, v.Task)
			case state.EventDeleteTask:
				s.deleteTask(ctx, v.Task)
			case state.EventCreateNode:
				s.createOrUpdateNode(v.Node)
				pendingChanges++
			case state.EventUpdateNode:
				s.createOrUpdateNode(v.Node)
				pendingChanges++
			case state.EventDeleteNode:
				s.nodeHeap.remove(v.Node.ID)
			case state.EventCommit:
				if commitDebounceTimer != nil {
					if time.Since(debouncingStarted) > maxLatency {
						commitDebounceTimer.Stop()
						commitDebounceTimer = nil
						commitDebounceTimeout = nil
						schedule()
					} else {
						commitDebounceTimer.Reset(commitDebounceGap)
					}
				} else {
					commitDebounceTimer = time.NewTimer(commitDebounceGap)
					commitDebounceTimeout = commitDebounceTimer.C
					debouncingStarted = time.Now()
				}
			}
		case <-commitDebounceTimeout:
			schedule()
			commitDebounceTimer = nil
			commitDebounceTimeout = nil
		case <-s.stopChan:
			return nil
		}
	}
}
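
// The commit-debouncing logic above is worth isolating: scheduling fires
// only after commits have been quiet for commitDebounceGap, but never more
// than maxLatency after the first deferred commit. A minimal standalone
// sketch of the same idea (hypothetical helper, not part of this file):
//
//	func debounce(events <-chan struct{}, gap, maxLatency time.Duration, fire func()) {
//		var (
//			started time.Time
//			timer   *time.Timer
//			timeout <-chan time.Time
//		)
//		for {
//			select {
//			case <-events:
//				if timer != nil {
//					if time.Since(started) > maxLatency {
//						timer.Stop()
//						timer, timeout = nil, nil
//						fire() // latency cap reached; fire immediately
//					} else {
//						timer.Reset(gap) // still quiet enough; extend the window
//					}
//				} else {
//					timer = time.NewTimer(gap)
//					timeout = timer.C
//					started = time.Now()
//				}
//			case <-timeout:
//				fire() // the gap elapsed with no new events
//				timer, timeout = nil, nil
//			}
//		}
//	}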

// Stop causes the scheduler event loop to stop running.
func (s *Scheduler) Stop() {
	close(s.stopChan)
	<-s.doneChan
}

// enqueue queues a task for scheduling.
func (s *Scheduler) enqueue(t *api.Task) {
	s.unassignedTasks.PushBack(t)
}

// createTask reacts to a task-creation event. It returns 1 if the event
// produced a pending scheduling change (i.e. the task was queued for
// assignment), and 0 otherwise.
func (s *Scheduler) createTask(ctx context.Context, t *api.Task) int {
	// Ignore all tasks that have not reached ALLOCATED
	// state, and tasks that no longer consume resources.
	if t.Status.State < api.TaskStateAllocated || t.Status.State > api.TaskStateRunning {
		return 0
	}

	s.allTasks[t.ID] = t
	if t.NodeID == "" {
		// unassigned task
		s.enqueue(t)
		return 1
	}

	if t.Status.State == api.TaskStateAllocated {
		s.preassignedTasks[t.ID] = t
		// preassigned tasks do not contribute to running tasks count
		return 0
	}

	nodeInfo, err := s.nodeHeap.nodeInfo(t.NodeID)
	if err == nil && nodeInfo.addTask(t) {
		s.nodeHeap.updateNode(nodeInfo)
	}
	return 0
}

// updateTask reacts to a task-update event, reconciling the scheduler's
// bookkeeping with the new version of the task. It returns the number of
// pending scheduling changes the update produced.
func (s *Scheduler) updateTask(ctx context.Context, t *api.Task) int {
	// Ignore all tasks that have not reached ALLOCATED
	// state.
	if t.Status.State < api.TaskStateAllocated {
		return 0
	}

	oldTask := s.allTasks[t.ID]

	// Tasks that have progressed past RUNNING no longer consume
	// resources; drop the old version from the bookkeeping.
	if t.Status.State > api.TaskStateRunning {
		if oldTask != nil {
			s.deleteTask(ctx, oldTask)
		}
		return 1
	}

	if t.NodeID == "" {
		// unassigned task
		if oldTask != nil {
			s.deleteTask(ctx, oldTask)
		}
		s.allTasks[t.ID] = t
		s.enqueue(t)
		return 1
	}

	if t.Status.State == api.TaskStateAllocated {
		if oldTask != nil {
			s.deleteTask(ctx, oldTask)
		}
		s.allTasks[t.ID] = t
		s.preassignedTasks[t.ID] = t
		// preassigned tasks do not contribute to running tasks count
		return 0
	}

	s.allTasks[t.ID] = t
	nodeInfo, err := s.nodeHeap.nodeInfo(t.NodeID)
	if err == nil && nodeInfo.addTask(t) {
		s.nodeHeap.updateNode(nodeInfo)
	}
	return 0
}

// deleteTask removes a task from the scheduler's bookkeeping and releases
// its resources on the node it was assigned to, if any.
func (s *Scheduler) deleteTask(ctx context.Context, t *api.Task) {
	delete(s.allTasks, t.ID)
	delete(s.preassignedTasks, t.ID)
	nodeInfo, err := s.nodeHeap.nodeInfo(t.NodeID)
	if err == nil && nodeInfo.removeTask(t) {
		s.nodeHeap.updateNode(nodeInfo)
	}
}

// createOrUpdateNode refreshes the heap entry for a node, recomputing its
// available resources as the node's declared capacity minus the
// reservations of every task already placed on it.
func (s *Scheduler) createOrUpdateNode(n *api.Node) {
	nodeInfo, _ := s.nodeHeap.nodeInfo(n.ID)
	var resources api.Resources
	if n.Description != nil && n.Description.Resources != nil {
		resources = *n.Description.Resources
		// reconcile resources by looping over all tasks on this node
		for _, task := range nodeInfo.Tasks {
			reservations := taskReservations(task.Spec)
			resources.MemoryBytes -= reservations.MemoryBytes
			resources.NanoCPUs -= reservations.NanoCPUs
		}
	}
	nodeInfo.Node = n
	nodeInfo.AvailableResources = resources
	s.nodeHeap.addOrUpdateNode(nodeInfo)
}
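
// For illustration, the reconciliation above amounts to a straightforward
// subtraction over the node's tasks. A sketch of the same computation as a
// pure function (`taskReservations` is the helper this file already relies
// on; the function below is hypothetical and not part of this file):
//
//	func availableResources(capacity api.Resources, tasks map[string]*api.Task) api.Resources {
//		avail := capacity // start from the node's declared capacity
//		for _, task := range tasks {
//			r := taskReservations(task.Spec)
//			avail.MemoryBytes -= r.MemoryBytes
//			avail.NanoCPUs -= r.NanoCPUs
//		}
//		return avail
//	}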

// processPreassignedTasks validates resources for tasks that already carry
// a NodeID, committing the ones that fit their node and rolling back the
// ones that fail to commit.
func (s *Scheduler) processPreassignedTasks(ctx context.Context) {
	schedulingDecisions := make(map[string]schedulingDecision, len(s.preassignedTasks))

	for _, t := range s.preassignedTasks {
		newT := s.taskFitNode(ctx, t, t.NodeID)
		if newT == nil {
			continue
		}
		schedulingDecisions[t.ID] = schedulingDecision{old: t, new: newT}
	}

	successful, failed := s.applySchedulingDecisions(ctx, schedulingDecisions)
	for _, decision := range successful {
		delete(s.preassignedTasks, decision.old.ID)
	}
	for _, decision := range failed {
		s.allTasks[decision.old.ID] = decision.old
		nodeInfo, err := s.nodeHeap.nodeInfo(decision.new.NodeID)
		if err == nil && nodeInfo.removeTask(decision.new) {
			s.nodeHeap.updateNode(nodeInfo)
		}
	}
}

// tick attempts to schedule the queue.
func (s *Scheduler) tick(ctx context.Context) {
	schedulingDecisions := make(map[string]schedulingDecision, s.unassignedTasks.Len())

	var next *list.Element
	for e := s.unassignedTasks.Front(); e != nil; e = next {
		next = e.Next()
		id := e.Value.(*api.Task).ID
		if _, ok := schedulingDecisions[id]; ok {
			s.unassignedTasks.Remove(e)
			continue
		}
		t := s.allTasks[id]
		if t == nil || t.NodeID != "" {
			// task deleted or already assigned
			s.unassignedTasks.Remove(e)
			continue
		}
		if newT := s.scheduleTask(ctx, t); newT != nil {
			schedulingDecisions[id] = schedulingDecision{old: t, new: newT}
			s.unassignedTasks.Remove(e)
		}
	}

	_, failed := s.applySchedulingDecisions(ctx, schedulingDecisions)
	for _, decision := range failed {
		s.allTasks[decision.old.ID] = decision.old

		nodeInfo, err := s.nodeHeap.nodeInfo(decision.new.NodeID)
		if err == nil && nodeInfo.removeTask(decision.new) {
			s.nodeHeap.updateNode(nodeInfo)
		}

		// enqueue the task for the next scheduling attempt
		s.enqueue(decision.old)
	}
}

// applySchedulingDecisions writes the new versions of the scheduled tasks
// to the store in a batch, returning the decisions that committed and the
// ones that must be retried.
func (s *Scheduler) applySchedulingDecisions(ctx context.Context, schedulingDecisions map[string]schedulingDecision) (successful, failed []schedulingDecision) {
	if len(schedulingDecisions) == 0 {
		return
	}

	successful = make([]schedulingDecision, 0, len(schedulingDecisions))

	// Apply changes to the master store
	applied, err := s.store.Batch(func(batch *store.Batch) error {
		for len(schedulingDecisions) > 0 {
			err := batch.Update(func(tx store.Tx) error {
				// Update exactly one task inside this Update
				// callback.
				for taskID, decision := range schedulingDecisions {
					delete(schedulingDecisions, taskID)

					t := store.GetTask(tx, taskID)
					if t == nil {
						// Task no longer exists. Do nothing.
						failed = append(failed, decision)
						continue
					}

					if err := store.UpdateTask(tx, decision.new); err != nil {
						log.G(ctx).Debugf("scheduler failed to update task %s; will retry", taskID)
						failed = append(failed, decision)
						continue
					}
					successful = append(successful, decision)
					return nil
				}
				return nil
			})
			if err != nil {
				return err
			}
		}
		return nil
	})

	if err != nil {
		log.G(ctx).WithError(err).Error("scheduler tick transaction failed")
		failed = append(failed, successful[applied:]...)
		successful = successful[:applied]
	}
	return
}
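
// The loop above leans on a subtle idiom: each batch.Update callback drains
// at most one decision from the map (delete, then return after the first
// successful update), so a large set of writes is split into many small
// transactions that the batch can commit incrementally. A minimal sketch of
// the same drain-one-per-transaction pattern over a plain map, with a
// hypothetical `commit` callback standing in for batch.Update and a
// hypothetical `write` doing the store update:
//
//	pending := map[string]string{"a": "1", "b": "2", "c": "3"}
//	for len(pending) > 0 {
//		commit(func() {
//			for k, v := range pending {
//				delete(pending, k) // consumed even if the write fails
//				write(k, v)        // exactly one write per transaction
//				return
//			}
//		})
//	}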

// taskFitNode checks whether a node has enough resources to accommodate a
// task, and if so returns the task's new, assigned version.
func (s *Scheduler) taskFitNode(ctx context.Context, t *api.Task, nodeID string) *api.Task {
	nodeInfo, err := s.nodeHeap.nodeInfo(nodeID)
	if err != nil {
		// node does not exist in the heap (it may have been deleted)
		return nil
	}
	s.pipeline.SetTask(t)
	if !s.pipeline.Process(&nodeInfo) {
		// this node cannot accommodate this task
		return nil
	}
	newT := *t
	newT.Status = api.TaskStatus{
		State:     api.TaskStateAssigned,
		Timestamp: ptypes.MustTimestampProto(time.Now()),
		Message:   "scheduler confirmed task can run on preassigned node",
	}
	s.allTasks[t.ID] = &newT

	if nodeInfo.addTask(&newT) {
		s.nodeHeap.updateNode(nodeInfo)
	}
	return &newT
}

// scheduleTask schedules a single task.
func (s *Scheduler) scheduleTask(ctx context.Context, t *api.Task) *api.Task {
	s.pipeline.SetTask(t)
	n, _ := s.nodeHeap.findMin(s.pipeline.Process, s.scanAllNodes)
	if n == nil {
		log.G(ctx).WithField("task.id", t.ID).Debug("No suitable node available for task")
		return nil
	}

	log.G(ctx).WithField("task.id", t.ID).Debugf("Assigning to node %s", n.ID)
	newT := *t
	newT.NodeID = n.ID
	newT.Status = api.TaskStatus{
		State:     api.TaskStateAssigned,
		Timestamp: ptypes.MustTimestampProto(time.Now()),
		Message:   "scheduler assigned task to node",
	}
	s.allTasks[t.ID] = &newT

	nodeInfo, err := s.nodeHeap.nodeInfo(n.ID)
	if err == nil && nodeInfo.addTask(&newT) {
		s.nodeHeap.updateNode(nodeInfo)
	}
	return &newT
}

// buildNodeHeap constructs the node heap from the nodes in the store and
// the per-node task sets gathered by setupTasksList.
func (s *Scheduler) buildNodeHeap(tx store.ReadTx, tasksByNode map[string]map[string]*api.Task) error {
	nodes, err := store.FindNodes(tx, store.All)
	if err != nil {
		return err
	}

	s.nodeHeap.alloc(len(nodes))

	i := 0
	for _, n := range nodes {
		var resources api.Resources
		if n.Description != nil && n.Description.Resources != nil {
			resources = *n.Description.Resources
		}
		s.nodeHeap.heap = append(s.nodeHeap.heap, newNodeInfo(n, tasksByNode[n.ID], resources))
		s.nodeHeap.index[n.ID] = i
		i++
	}

	heap.Init(&s.nodeHeap)
	return nil
}
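
// heap.Init above requires nodeHeap to satisfy container/heap's
// heap.Interface; the nodeHeap type itself lives elsewhere in this package.
// Purely for illustration, a minimal heap organized the same way — a slice
// plus an id->index map kept in sync by Swap so entries can be updated in
// place — might look like this (hypothetical, simplified to an integer
// priority; `nodeEntry` is an assumed type, not part of this package):
//
//	type nodeEntry struct {
//		id   string
//		prio int
//	}
//
//	type miniHeap struct {
//		heap  []nodeEntry
//		index map[string]int // node ID -> position in heap
//	}
//
//	func (h miniHeap) Len() int           { return len(h.heap) }
//	func (h miniHeap) Less(i, j int) bool { return h.heap[i].prio < h.heap[j].prio }
//	func (h miniHeap) Swap(i, j int) {
//		h.heap[i], h.heap[j] = h.heap[j], h.heap[i]
//		h.index[h.heap[i].id] = i
//		h.index[h.heap[j].id] = j
//	}
//	func (h *miniHeap) Push(x interface{}) {
//		e := x.(nodeEntry)
//		h.index[e.id] = len(h.heap)
//		h.heap = append(h.heap, e)
//	}
//	func (h *miniHeap) Pop() interface{} {
//		e := h.heap[len(h.heap)-1]
//		h.heap = h.heap[:len(h.heap)-1]
//		delete(h.index, e.id)
//		return e
//	}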