
/vendor/src/github.com/hashicorp/serf/serf/snapshot.go

https://gitlab.com/liwh/docker
package serf

import (
	"bufio"
	"fmt"
	"log"
	"math/rand"
	"net"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/armon/go-metrics"
)
/*
Serf supports using a "snapshot" file that contains various
transactional data that is used to help Serf recover quickly
and gracefully from a failure. We append member events, as well
as the latest clock values to the file during normal operation,
and periodically checkpoint and roll over the file. During a restore,
we can replay the various member events to recall a list of known
nodes to re-join, as well as restore our clock values to avoid replaying
old events.
*/
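// To make the record formats concrete, here is a sketch of what a snapshot
// file might contain, assembled from the format strings written by the
// methods below (node names, addresses, and clock values are hypothetical):
//
//	alive: node-a 10.0.0.1:7946
//	alive: node-b 10.0.0.2:7946
//	not-alive: node-b
//	clock: 123
//	event-clock: 45
//	query-clock: 12
//	leave
//
// Lines beginning with "#" are treated as comments by replay().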
const flushInterval = 500 * time.Millisecond
const clockUpdateInterval = 500 * time.Millisecond
const tmpExt = ".compact"

// Snapshotter is responsible for ingesting events and persisting
// them to disk, and providing a recovery mechanism at start time.
type Snapshotter struct {
	aliveNodes       map[string]string
	clock            *LamportClock
	fh               *os.File
	buffered         *bufio.Writer
	inCh             <-chan Event
	lastFlush        time.Time
	lastClock        LamportTime
	lastEventClock   LamportTime
	lastQueryClock   LamportTime
	leaveCh          chan struct{}
	leaving          bool
	logger           *log.Logger
	maxSize          int64
	path             string
	offset           int64
	outCh            chan<- Event
	rejoinAfterLeave bool
	shutdownCh       <-chan struct{}
	waitCh           chan struct{}
}

// PreviousNode is used to represent the previously known alive nodes
type PreviousNode struct {
	Name string
	Addr string
}

func (p PreviousNode) String() string {
	return fmt.Sprintf("%s: %s", p.Name, p.Addr)
}
// NewSnapshotter creates a new Snapshotter that records events up to a
// max byte size before rotating the file. It can also be used to
// recover old state. Snapshotter works by reading an event channel it returns,
// passing through to an output channel, and persisting relevant events to disk.
// Setting rejoinAfterLeave makes leave not clear the state, and can be used
// if you intend to rejoin the same cluster after a leave.
func NewSnapshotter(path string,
	maxSize int,
	rejoinAfterLeave bool,
	logger *log.Logger,
	clock *LamportClock,
	outCh chan<- Event,
	shutdownCh <-chan struct{}) (chan<- Event, *Snapshotter, error) {
	inCh := make(chan Event, 1024)

	// Try to open the file
	fh, err := os.OpenFile(path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0755)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to open snapshot: %v", err)
	}

	// Determine the offset
	info, err := fh.Stat()
	if err != nil {
		fh.Close()
		return nil, nil, fmt.Errorf("failed to stat snapshot: %v", err)
	}
	offset := info.Size()

	// Create the snapshotter
	snap := &Snapshotter{
		aliveNodes:       make(map[string]string),
		clock:            clock,
		fh:               fh,
		buffered:         bufio.NewWriter(fh),
		inCh:             inCh,
		lastClock:        0,
		lastEventClock:   0,
		lastQueryClock:   0,
		leaveCh:          make(chan struct{}),
		logger:           logger,
		maxSize:          int64(maxSize),
		path:             path,
		offset:           offset,
		outCh:            outCh,
		rejoinAfterLeave: rejoinAfterLeave,
		shutdownCh:       shutdownCh,
		waitCh:           make(chan struct{}),
	}

	// Recover the last known state
	if err := snap.replay(); err != nil {
		fh.Close()
		return nil, nil, err
	}

	// Start handling new commands
	go snap.stream()
	return inCh, snap, nil
}
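// A minimal wiring sketch, assuming a caller that already has a logger, a
// LamportClock, and a downstream event channel (the path, size, and variable
// names below are illustrative, not part of this file):
//
//	outCh := make(chan Event, 64)
//	shutdownCh := make(chan struct{})
//	inCh, snap, err := NewSnapshotter("/tmp/serf-snapshot", 128*1024,
//		false, logger, clock, outCh, shutdownCh)
//	if err != nil {
//		log.Fatalf("snapshotter: %v", err)
//	}
//	// Deliver all Serf events to inCh; they pass through to outCh while
//	// relevant records are persisted. After a restart, snap.AliveNodes()
//	// yields candidate nodes to re-join.
//	_ = inCh
//	_ = snap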
// LastClock returns the last known clock time
func (s *Snapshotter) LastClock() LamportTime {
	return s.lastClock
}

// LastEventClock returns the last known event clock time
func (s *Snapshotter) LastEventClock() LamportTime {
	return s.lastEventClock
}

// LastQueryClock returns the last known query clock time
func (s *Snapshotter) LastQueryClock() LamportTime {
	return s.lastQueryClock
}

// AliveNodes returns the last known alive nodes
func (s *Snapshotter) AliveNodes() []*PreviousNode {
	// Copy the previously known
	previous := make([]*PreviousNode, 0, len(s.aliveNodes))
	for name, addr := range s.aliveNodes {
		previous = append(previous, &PreviousNode{name, addr})
	}

	// Randomize the order, prevents hot shards
	for i := range previous {
		j := rand.Intn(i + 1)
		previous[i], previous[j] = previous[j], previous[i]
	}
	return previous
}
// Wait is used to wait until the snapshotter finishes shutting down
func (s *Snapshotter) Wait() {
	<-s.waitCh
}
// Leave is used to remove known nodes to prevent a restart from
// causing a join. Otherwise nodes will re-join after leaving!
func (s *Snapshotter) Leave() {
	select {
	case s.leaveCh <- struct{}{}:
	case <-s.shutdownCh:
	}
}
// stream is a long running routine that is used to handle events
func (s *Snapshotter) stream() {
	for {
		select {
		case <-s.leaveCh:
			s.leaving = true

			// If we plan to re-join, keep our state
			if !s.rejoinAfterLeave {
				s.aliveNodes = make(map[string]string)
			}
			s.tryAppend("leave\n")
			if err := s.buffered.Flush(); err != nil {
				s.logger.Printf("[ERR] serf: failed to flush leave to snapshot: %v", err)
			}
			if err := s.fh.Sync(); err != nil {
				s.logger.Printf("[ERR] serf: failed to sync leave to snapshot: %v", err)
			}

		case e := <-s.inCh:
			// Forward the event immediately
			if s.outCh != nil {
				s.outCh <- e
			}

			// Stop recording events after a leave is issued
			if s.leaving {
				continue
			}
			switch typed := e.(type) {
			case MemberEvent:
				s.processMemberEvent(typed)
			case UserEvent:
				s.processUserEvent(typed)
			case *Query:
				s.processQuery(typed)
			default:
				s.logger.Printf("[ERR] serf: Unknown event to snapshot: %#v", e)
			}

		case <-time.After(clockUpdateInterval):
			s.updateClock()

		case <-s.shutdownCh:
			if err := s.buffered.Flush(); err != nil {
				s.logger.Printf("[ERR] serf: failed to flush snapshot: %v", err)
			}
			if err := s.fh.Sync(); err != nil {
				s.logger.Printf("[ERR] serf: failed to sync snapshot: %v", err)
			}
			s.fh.Close()
			close(s.waitCh)
			return
		}
	}
}
// processMemberEvent is used to handle a single member event
func (s *Snapshotter) processMemberEvent(e MemberEvent) {
	switch e.Type {
	case EventMemberJoin:
		for _, mem := range e.Members {
			addr := net.TCPAddr{IP: mem.Addr, Port: int(mem.Port)}
			s.aliveNodes[mem.Name] = addr.String()
			s.tryAppend(fmt.Sprintf("alive: %s %s\n", mem.Name, addr.String()))
		}

	case EventMemberLeave:
		fallthrough
	case EventMemberFailed:
		for _, mem := range e.Members {
			delete(s.aliveNodes, mem.Name)
			s.tryAppend(fmt.Sprintf("not-alive: %s\n", mem.Name))
		}
	}
	s.updateClock()
}
// updateClock is called periodically to check if we should update our
// clock value. This is done after member events but should also be done
// periodically due to race conditions with join and leave intents
func (s *Snapshotter) updateClock() {
	lastSeen := s.clock.Time() - 1
	if lastSeen > s.lastClock {
		s.lastClock = lastSeen
		s.tryAppend(fmt.Sprintf("clock: %d\n", s.lastClock))
	}
}
// processUserEvent is used to handle a single user event
func (s *Snapshotter) processUserEvent(e UserEvent) {
	// Ignore old clocks
	if e.LTime <= s.lastEventClock {
		return
	}
	s.lastEventClock = e.LTime
	s.tryAppend(fmt.Sprintf("event-clock: %d\n", e.LTime))
}

// processQuery is used to handle a single query event
func (s *Snapshotter) processQuery(q *Query) {
	// Ignore old clocks
	if q.LTime <= s.lastQueryClock {
		return
	}
	s.lastQueryClock = q.LTime
	s.tryAppend(fmt.Sprintf("query-clock: %d\n", q.LTime))
}
// tryAppend invokes appendLine, logging any failure rather than
// returning an error
func (s *Snapshotter) tryAppend(l string) {
	if err := s.appendLine(l); err != nil {
		s.logger.Printf("[ERR] serf: Failed to update snapshot: %v", err)
	}
}
// appendLine is used to append a line to the existing log
func (s *Snapshotter) appendLine(l string) error {
	defer metrics.MeasureSince([]string{"serf", "snapshot", "appendLine"}, time.Now())

	n, err := s.buffered.WriteString(l)
	if err != nil {
		return err
	}

	// Check if we should flush
	now := time.Now()
	if now.Sub(s.lastFlush) > flushInterval {
		s.lastFlush = now
		if err := s.buffered.Flush(); err != nil {
			return err
		}
	}

	// Check if a compaction is necessary
	s.offset += int64(n)
	if s.offset > s.maxSize {
		return s.compact()
	}
	return nil
}
// compact is used to compact the snapshot once it is too large
func (s *Snapshotter) compact() error {
	defer metrics.MeasureSince([]string{"serf", "snapshot", "compact"}, time.Now())

	// Try to open the new file
	newPath := s.path + tmpExt
	fh, err := os.OpenFile(newPath, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0755)
	if err != nil {
		return fmt.Errorf("failed to open new snapshot: %v", err)
	}
	// Create a buffered writer
	buf := bufio.NewWriter(fh)

	// Write out the live nodes
	var offset int64
	for name, addr := range s.aliveNodes {
		line := fmt.Sprintf("alive: %s %s\n", name, addr)
		n, err := buf.WriteString(line)
		if err != nil {
			fh.Close()
			return err
		}
		offset += int64(n)
	}

	// Write out the clocks
	line := fmt.Sprintf("clock: %d\n", s.lastClock)
	n, err := buf.WriteString(line)
	if err != nil {
		fh.Close()
		return err
	}
	offset += int64(n)

	line = fmt.Sprintf("event-clock: %d\n", s.lastEventClock)
	n, err = buf.WriteString(line)
	if err != nil {
		fh.Close()
		return err
	}
	offset += int64(n)

	line = fmt.Sprintf("query-clock: %d\n", s.lastQueryClock)
	n, err = buf.WriteString(line)
	if err != nil {
		fh.Close()
		return err
	}
	offset += int64(n)

	// Flush the new snapshot
	err = buf.Flush()
	fh.Close()
	if err != nil {
		return fmt.Errorf("failed to flush new snapshot: %v", err)
	}

	// We now need to swap the old snapshot file with the new snapshot.
	// Turns out, Windows won't let us rename the files if we have
	// open handles to them or if the destination already exists. This
	// means we are forced to close the existing handles, delete the
	// old file, move the new one in place, and then re-open the file
	// handles.

	// Flush the existing snapshot, ignoring errors since we will
	// delete it momentarily.
	s.buffered.Flush()
	s.buffered = nil

	// Close the file handle to the old snapshot
	s.fh.Close()
	s.fh = nil

	// Delete the old file
	if err := os.Remove(s.path); err != nil {
		return fmt.Errorf("failed to remove old snapshot: %v", err)
	}

	// Move the new file into place
	if err := os.Rename(newPath, s.path); err != nil {
		return fmt.Errorf("failed to install new snapshot: %v", err)
	}

	// Open the new snapshot
	fh, err = os.OpenFile(s.path, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0755)
	if err != nil {
		return fmt.Errorf("failed to open snapshot: %v", err)
	}
	buf = bufio.NewWriter(fh)

	// Rotate our handles
	s.fh = fh
	s.buffered = buf
	s.offset = offset
	s.lastFlush = time.Now()
	return nil
}
// replay is used to replay the snapshot file and reset our internal
// state. It is used at initialization time to read old state
func (s *Snapshotter) replay() error {
	// Seek to the beginning
	if _, err := s.fh.Seek(0, os.SEEK_SET); err != nil {
		return err
	}
	// Read each line
	reader := bufio.NewReader(s.fh)
	for {
		line, err := reader.ReadString('\n')
		if err != nil {
			break
		}

		// Skip the newline
		line = line[:len(line)-1]

		// Switch on the prefix
		if strings.HasPrefix(line, "alive: ") {
			info := strings.TrimPrefix(line, "alive: ")
			addrIdx := strings.LastIndex(info, " ")
			if addrIdx == -1 {
				s.logger.Printf("[WARN] serf: Failed to parse address: %v", line)
				continue
			}
			addr := info[addrIdx+1:]
			name := info[:addrIdx]
			s.aliveNodes[name] = addr

		} else if strings.HasPrefix(line, "not-alive: ") {
			name := strings.TrimPrefix(line, "not-alive: ")
			delete(s.aliveNodes, name)

		} else if strings.HasPrefix(line, "clock: ") {
			timeStr := strings.TrimPrefix(line, "clock: ")
			timeInt, err := strconv.ParseUint(timeStr, 10, 64)
			if err != nil {
				s.logger.Printf("[WARN] serf: Failed to convert clock time: %v", err)
				continue
			}
			s.lastClock = LamportTime(timeInt)

		} else if strings.HasPrefix(line, "event-clock: ") {
			timeStr := strings.TrimPrefix(line, "event-clock: ")
			timeInt, err := strconv.ParseUint(timeStr, 10, 64)
			if err != nil {
				s.logger.Printf("[WARN] serf: Failed to convert event clock time: %v", err)
				continue
			}
			s.lastEventClock = LamportTime(timeInt)

		} else if strings.HasPrefix(line, "query-clock: ") {
			timeStr := strings.TrimPrefix(line, "query-clock: ")
			timeInt, err := strconv.ParseUint(timeStr, 10, 64)
			if err != nil {
				s.logger.Printf("[WARN] serf: Failed to convert query clock time: %v", err)
				continue
			}
			s.lastQueryClock = LamportTime(timeInt)

		} else if line == "leave" {
			// Ignore a leave if we plan on re-joining
			if s.rejoinAfterLeave {
				s.logger.Printf("[INFO] serf: Ignoring previous leave in snapshot")
				continue
			}
			s.aliveNodes = make(map[string]string)
			s.lastClock = 0
			s.lastEventClock = 0
			s.lastQueryClock = 0

		} else if strings.HasPrefix(line, "#") {
			// Skip comment lines
		} else {
			s.logger.Printf("[WARN] serf: Unrecognized snapshot line: %v", line)
		}
	}

	// Seek to the end
	if _, err := s.fh.Seek(0, os.SEEK_END); err != nil {
		return err
	}
	return nil
}
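// Worked example (using the hypothetical snapshot file sketched near the top
// of this file): replaying it first records node-a and node-b as alive, then
// removes node-b via the "not-alive" record, and restores the member, event,
// and query clocks to 123, 45, and 12. The trailing "leave" record then
// clears all of that state, unless rejoinAfterLeave is set, in which case it
// is ignored and node-a remains available from AliveNodes() for a re-join.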