PageRenderTime 65ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 1ms

/vendor/github.com/hashicorp/consul/command/agent/agent.go

https://github.com/backstage/backstage
Go | 1646 lines | 1213 code | 206 blank | 227 comment | 401 complexity | 84fe9228eb64d7ab10fd75630cc4e13f MD5 | raw file
Possible License(s): Apache-2.0, MIT, BSD-3-Clause, MPL-2.0-no-copyleft-exception
  1. package agent
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "io"
  6. "io/ioutil"
  7. "log"
  8. "net"
  9. "os"
  10. "path/filepath"
  11. "reflect"
  12. "regexp"
  13. "strconv"
  14. "sync"
  15. "time"
  16. "github.com/hashicorp/consul/consul"
  17. "github.com/hashicorp/consul/consul/state"
  18. "github.com/hashicorp/consul/consul/structs"
  19. "github.com/hashicorp/consul/lib"
  20. "github.com/hashicorp/consul/types"
  21. "github.com/hashicorp/go-uuid"
  22. "github.com/hashicorp/serf/coordinate"
  23. "github.com/hashicorp/serf/serf"
  24. )
  25. const (
  26. // Path to save agent service definitions
  27. servicesDir = "services"
  28. // Path to save local agent checks
  29. checksDir = "checks"
  30. checkStateDir = "checks/state"
  31. // The ID of the faux health checks for maintenance mode
  32. serviceMaintCheckPrefix = "_service_maintenance"
  33. nodeMaintCheckID = "_node_maintenance"
  34. // Default reasons for node/service maintenance mode
  35. defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
  36. "but no reason was provided. This is a default message."
  37. defaultServiceMaintReason = "Maintenance mode is enabled for this " +
  38. "service, but no reason was provided. This is a default message."
  39. )
  40. var (
  41. // dnsNameRe checks if a name or tag is dns-compatible.
  42. dnsNameRe = regexp.MustCompile(`^[a-zA-Z0-9\-]+$`)
  43. )
  44. /*
  45. The agent is the long running process that is run on every machine.
  46. It exposes an RPC interface that is used by the CLI to control the
  47. agent. The agent runs the query interfaces like HTTP, DNS, and RPC.
  48. However, it can run in either a client, or server mode. In server
  49. mode, it runs a full Consul server. In client-only mode, it only forwards
  50. requests to other Consul servers.
  51. */
  52. type Agent struct {
  53. config *Config
  54. // Used for writing our logs
  55. logger *log.Logger
  56. // Output sink for logs
  57. logOutput io.Writer
  58. // We have one of a client or a server, depending
  59. // on our configuration
  60. server *consul.Server
  61. client *consul.Client
  62. // state stores a local representation of the node,
  63. // services and checks. Used for anti-entropy.
  64. state localState
  65. // checkReapAfter maps the check ID to a timeout after which we should
  66. // reap its associated service
  67. checkReapAfter map[types.CheckID]time.Duration
  68. // checkMonitors maps the check ID to an associated monitor
  69. checkMonitors map[types.CheckID]*CheckMonitor
  70. // checkHTTPs maps the check ID to an associated HTTP check
  71. checkHTTPs map[types.CheckID]*CheckHTTP
  72. // checkTCPs maps the check ID to an associated TCP check
  73. checkTCPs map[types.CheckID]*CheckTCP
  74. // checkTTLs maps the check ID to an associated check TTL
  75. checkTTLs map[types.CheckID]*CheckTTL
  76. // checkDockers maps the check ID to an associated Docker Exec based check
  77. checkDockers map[types.CheckID]*CheckDocker
  78. // checkLock protects updates to the check* maps
  79. checkLock sync.Mutex
  80. // eventCh is used to receive user events
  81. eventCh chan serf.UserEvent
  82. // eventBuf stores the most recent events in a ring buffer
  83. // using eventIndex as the next index to insert into. This
  84. // is guarded by eventLock. When an insert happens, the
  85. // eventNotify group is notified.
  86. eventBuf []*UserEvent
  87. eventIndex int
  88. eventLock sync.RWMutex
  89. eventNotify state.NotifyGroup
  90. shutdown bool
  91. shutdownCh chan struct{}
  92. shutdownLock sync.Mutex
  93. // endpoints lets you override RPC endpoints for testing. Not all
  94. // agent methods use this, so use with care and never override
  95. // outside of a unit test.
  96. endpoints map[string]string
  97. }
  98. // Create is used to create a new Agent. Returns
  99. // the agent or potentially an error.
  100. func Create(config *Config, logOutput io.Writer) (*Agent, error) {
  101. // Ensure we have a log sink
  102. if logOutput == nil {
  103. logOutput = os.Stderr
  104. }
  105. // Validate the config
  106. if config.Datacenter == "" {
  107. return nil, fmt.Errorf("Must configure a Datacenter")
  108. }
  109. if config.DataDir == "" && !config.DevMode {
  110. return nil, fmt.Errorf("Must configure a DataDir")
  111. }
  112. // Try to get an advertise address
  113. if config.AdvertiseAddr != "" {
  114. if ip := net.ParseIP(config.AdvertiseAddr); ip == nil {
  115. return nil, fmt.Errorf("Failed to parse advertise address: %v", config.AdvertiseAddr)
  116. }
  117. } else if config.BindAddr != "0.0.0.0" && config.BindAddr != "" && config.BindAddr != "[::]" {
  118. config.AdvertiseAddr = config.BindAddr
  119. } else {
  120. var err error
  121. var ip net.IP
  122. if config.BindAddr == "[::]" {
  123. ip, err = consul.GetPublicIPv6()
  124. } else {
  125. ip, err = consul.GetPrivateIP()
  126. }
  127. if err != nil {
  128. return nil, fmt.Errorf("Failed to get advertise address: %v", err)
  129. }
  130. config.AdvertiseAddr = ip.String()
  131. }
  132. // Try to get an advertise address for the wan
  133. if config.AdvertiseAddrWan != "" {
  134. if ip := net.ParseIP(config.AdvertiseAddrWan); ip == nil {
  135. return nil, fmt.Errorf("Failed to parse advertise address for wan: %v", config.AdvertiseAddrWan)
  136. }
  137. } else {
  138. config.AdvertiseAddrWan = config.AdvertiseAddr
  139. }
  140. // Create the default set of tagged addresses.
  141. config.TaggedAddresses = map[string]string{
  142. "lan": config.AdvertiseAddr,
  143. "wan": config.AdvertiseAddrWan,
  144. }
  145. agent := &Agent{
  146. config: config,
  147. logger: log.New(logOutput, "", log.LstdFlags),
  148. logOutput: logOutput,
  149. checkReapAfter: make(map[types.CheckID]time.Duration),
  150. checkMonitors: make(map[types.CheckID]*CheckMonitor),
  151. checkTTLs: make(map[types.CheckID]*CheckTTL),
  152. checkHTTPs: make(map[types.CheckID]*CheckHTTP),
  153. checkTCPs: make(map[types.CheckID]*CheckTCP),
  154. checkDockers: make(map[types.CheckID]*CheckDocker),
  155. eventCh: make(chan serf.UserEvent, 1024),
  156. eventBuf: make([]*UserEvent, 256),
  157. shutdownCh: make(chan struct{}),
  158. endpoints: make(map[string]string),
  159. }
  160. // Initialize the local state.
  161. agent.state.Init(config, agent.logger)
  162. // Setup either the client or the server.
  163. var err error
  164. if config.Server {
  165. err = agent.setupServer()
  166. agent.state.SetIface(agent.server)
  167. // Automatically register the "consul" service on server nodes
  168. consulService := structs.NodeService{
  169. Service: consul.ConsulServiceName,
  170. ID: consul.ConsulServiceID,
  171. Port: agent.config.Ports.Server,
  172. Tags: []string{},
  173. }
  174. agent.state.AddService(&consulService, "")
  175. } else {
  176. err = agent.setupClient()
  177. agent.state.SetIface(agent.client)
  178. }
  179. if err != nil {
  180. return nil, err
  181. }
  182. // Load checks/services.
  183. if err := agent.loadServices(config); err != nil {
  184. return nil, err
  185. }
  186. if err := agent.loadChecks(config); err != nil {
  187. return nil, err
  188. }
  189. // Start watching for critical services to deregister, based on their
  190. // checks.
  191. go agent.reapServices()
  192. // Start handling events.
  193. go agent.handleEvents()
  194. // Start sending network coordinate to the server.
  195. if !config.DisableCoordinates {
  196. go agent.sendCoordinate()
  197. }
  198. // Write out the PID file if necessary.
  199. err = agent.storePid()
  200. if err != nil {
  201. return nil, err
  202. }
  203. return agent, nil
  204. }
  205. // consulConfig is used to return a consul configuration
  206. func (a *Agent) consulConfig() *consul.Config {
  207. // Start with the provided config or default config
  208. var base *consul.Config
  209. if a.config.ConsulConfig != nil {
  210. base = a.config.ConsulConfig
  211. } else {
  212. base = consul.DefaultConfig()
  213. }
  214. // Apply dev mode
  215. base.DevMode = a.config.DevMode
  216. // Apply performance factors
  217. if a.config.Performance.RaftMultiplier > 0 {
  218. base.ScaleRaft(a.config.Performance.RaftMultiplier)
  219. }
  220. // Override with our config
  221. if a.config.Datacenter != "" {
  222. base.Datacenter = a.config.Datacenter
  223. }
  224. if a.config.DataDir != "" {
  225. base.DataDir = a.config.DataDir
  226. }
  227. if a.config.NodeName != "" {
  228. base.NodeName = a.config.NodeName
  229. }
  230. if a.config.Ports.SerfLan != 0 {
  231. base.SerfLANConfig.MemberlistConfig.BindPort = a.config.Ports.SerfLan
  232. base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.Ports.SerfLan
  233. }
  234. if a.config.Ports.SerfWan != 0 {
  235. base.SerfWANConfig.MemberlistConfig.BindPort = a.config.Ports.SerfWan
  236. base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.Ports.SerfWan
  237. }
  238. if a.config.BindAddr != "" {
  239. bindAddr := &net.TCPAddr{
  240. IP: net.ParseIP(a.config.BindAddr),
  241. Port: a.config.Ports.Server,
  242. }
  243. base.RPCAddr = bindAddr
  244. // Set the Serf configs using the old default behavior, we may
  245. // override these in the code right below.
  246. base.SerfLANConfig.MemberlistConfig.BindAddr = a.config.BindAddr
  247. base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.BindAddr
  248. }
  249. if a.config.SerfLanBindAddr != "" {
  250. base.SerfLANConfig.MemberlistConfig.BindAddr = a.config.SerfLanBindAddr
  251. }
  252. if a.config.SerfWanBindAddr != "" {
  253. base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfWanBindAddr
  254. }
  255. if a.config.AdvertiseAddr != "" {
  256. base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddr
  257. if a.config.AdvertiseAddrWan != "" {
  258. base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrWan
  259. } else {
  260. base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddr
  261. }
  262. base.RPCAdvertise = &net.TCPAddr{
  263. IP: net.ParseIP(a.config.AdvertiseAddr),
  264. Port: a.config.Ports.Server,
  265. }
  266. }
  267. if a.config.AdvertiseAddrs.SerfLan != nil {
  268. base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrs.SerfLan.IP.String()
  269. base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.AdvertiseAddrs.SerfLan.Port
  270. }
  271. if a.config.AdvertiseAddrs.SerfWan != nil {
  272. base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrs.SerfWan.IP.String()
  273. base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.AdvertiseAddrs.SerfWan.Port
  274. }
  275. if a.config.ReconnectTimeoutLan != 0 {
  276. base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLan
  277. }
  278. if a.config.ReconnectTimeoutWan != 0 {
  279. base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWan
  280. }
  281. if a.config.AdvertiseAddrs.RPC != nil {
  282. base.RPCAdvertise = a.config.AdvertiseAddrs.RPC
  283. }
  284. if a.config.Bootstrap {
  285. base.Bootstrap = true
  286. }
  287. if a.config.RejoinAfterLeave {
  288. base.RejoinAfterLeave = true
  289. }
  290. if a.config.BootstrapExpect != 0 {
  291. base.BootstrapExpect = a.config.BootstrapExpect
  292. }
  293. if a.config.Protocol > 0 {
  294. base.ProtocolVersion = uint8(a.config.Protocol)
  295. }
  296. if a.config.ACLToken != "" {
  297. base.ACLToken = a.config.ACLToken
  298. }
  299. if a.config.ACLMasterToken != "" {
  300. base.ACLMasterToken = a.config.ACLMasterToken
  301. }
  302. if a.config.ACLDatacenter != "" {
  303. base.ACLDatacenter = a.config.ACLDatacenter
  304. }
  305. if a.config.ACLTTLRaw != "" {
  306. base.ACLTTL = a.config.ACLTTL
  307. }
  308. if a.config.ACLDefaultPolicy != "" {
  309. base.ACLDefaultPolicy = a.config.ACLDefaultPolicy
  310. }
  311. if a.config.ACLDownPolicy != "" {
  312. base.ACLDownPolicy = a.config.ACLDownPolicy
  313. }
  314. if a.config.ACLReplicationToken != "" {
  315. base.ACLReplicationToken = a.config.ACLReplicationToken
  316. }
  317. if a.config.SessionTTLMinRaw != "" {
  318. base.SessionTTLMin = a.config.SessionTTLMin
  319. }
  320. // Format the build string
  321. revision := a.config.Revision
  322. if len(revision) > 8 {
  323. revision = revision[:8]
  324. }
  325. base.Build = fmt.Sprintf("%s%s:%s",
  326. a.config.Version, a.config.VersionPrerelease, revision)
  327. // Copy the TLS configuration
  328. base.VerifyIncoming = a.config.VerifyIncoming
  329. base.VerifyOutgoing = a.config.VerifyOutgoing
  330. base.VerifyServerHostname = a.config.VerifyServerHostname
  331. base.CAFile = a.config.CAFile
  332. base.CertFile = a.config.CertFile
  333. base.KeyFile = a.config.KeyFile
  334. base.ServerName = a.config.ServerName
  335. base.Domain = a.config.Domain
  336. // Setup the ServerUp callback
  337. base.ServerUp = a.state.ConsulServerUp
  338. // Setup the user event callback
  339. base.UserEventHandler = func(e serf.UserEvent) {
  340. select {
  341. case a.eventCh <- e:
  342. case <-a.shutdownCh:
  343. }
  344. }
  345. // Setup the loggers
  346. base.LogOutput = a.logOutput
  347. return base
  348. }
  349. // setupServer is used to initialize the Consul server
  350. func (a *Agent) setupServer() error {
  351. config := a.consulConfig()
  352. if err := a.setupKeyrings(config); err != nil {
  353. return fmt.Errorf("Failed to configure keyring: %v", err)
  354. }
  355. server, err := consul.NewServer(config)
  356. if err != nil {
  357. return fmt.Errorf("Failed to start Consul server: %v", err)
  358. }
  359. a.server = server
  360. return nil
  361. }
  362. // setupClient is used to initialize the Consul client
  363. func (a *Agent) setupClient() error {
  364. config := a.consulConfig()
  365. if err := a.setupKeyrings(config); err != nil {
  366. return fmt.Errorf("Failed to configure keyring: %v", err)
  367. }
  368. client, err := consul.NewClient(config)
  369. if err != nil {
  370. return fmt.Errorf("Failed to start Consul client: %v", err)
  371. }
  372. a.client = client
  373. return nil
  374. }
  375. // setupKeyrings is used to initialize and load keyrings during agent startup
  376. func (a *Agent) setupKeyrings(config *consul.Config) error {
  377. fileLAN := filepath.Join(a.config.DataDir, serfLANKeyring)
  378. fileWAN := filepath.Join(a.config.DataDir, serfWANKeyring)
  379. if a.config.EncryptKey == "" {
  380. goto LOAD
  381. }
  382. if _, err := os.Stat(fileLAN); err != nil {
  383. if err := initKeyring(fileLAN, a.config.EncryptKey); err != nil {
  384. return err
  385. }
  386. }
  387. if a.config.Server {
  388. if _, err := os.Stat(fileWAN); err != nil {
  389. if err := initKeyring(fileWAN, a.config.EncryptKey); err != nil {
  390. return err
  391. }
  392. }
  393. }
  394. LOAD:
  395. if _, err := os.Stat(fileLAN); err == nil {
  396. config.SerfLANConfig.KeyringFile = fileLAN
  397. }
  398. if err := loadKeyringFile(config.SerfLANConfig); err != nil {
  399. return err
  400. }
  401. if a.config.Server {
  402. if _, err := os.Stat(fileWAN); err == nil {
  403. config.SerfWANConfig.KeyringFile = fileWAN
  404. }
  405. if err := loadKeyringFile(config.SerfWANConfig); err != nil {
  406. return err
  407. }
  408. }
  409. // Success!
  410. return nil
  411. }
  412. // RPC is used to make an RPC call to the Consul servers
  413. // This allows the agent to implement the Consul.Interface
  414. func (a *Agent) RPC(method string, args interface{}, reply interface{}) error {
  415. if a.server != nil {
  416. return a.server.RPC(method, args, reply)
  417. }
  418. return a.client.RPC(method, args, reply)
  419. }
  420. // SnapshotRPC performs the requested snapshot RPC against the Consul server in
  421. // a streaming manner. The contents of in will be read and passed along as the
  422. // payload, and the response message will determine the error status, and any
  423. // return payload will be written to out.
  424. func (a *Agent) SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer,
  425. replyFn consul.SnapshotReplyFn) error {
  426. if a.server != nil {
  427. return a.server.SnapshotRPC(args, in, out, replyFn)
  428. }
  429. return a.client.SnapshotRPC(args, in, out, replyFn)
  430. }
  431. // Leave is used to prepare the agent for a graceful shutdown
  432. func (a *Agent) Leave() error {
  433. if a.server != nil {
  434. return a.server.Leave()
  435. } else {
  436. return a.client.Leave()
  437. }
  438. }
  439. // Shutdown is used to hard stop the agent. Should be
  440. // preceded by a call to Leave to do it gracefully.
  441. func (a *Agent) Shutdown() error {
  442. a.shutdownLock.Lock()
  443. defer a.shutdownLock.Unlock()
  444. if a.shutdown {
  445. return nil
  446. }
  447. // Stop all the checks
  448. a.checkLock.Lock()
  449. defer a.checkLock.Unlock()
  450. for _, chk := range a.checkMonitors {
  451. chk.Stop()
  452. }
  453. for _, chk := range a.checkTTLs {
  454. chk.Stop()
  455. }
  456. for _, chk := range a.checkHTTPs {
  457. chk.Stop()
  458. }
  459. for _, chk := range a.checkTCPs {
  460. chk.Stop()
  461. }
  462. a.logger.Println("[INFO] agent: requesting shutdown")
  463. var err error
  464. if a.server != nil {
  465. err = a.server.Shutdown()
  466. } else {
  467. err = a.client.Shutdown()
  468. }
  469. pidErr := a.deletePid()
  470. if pidErr != nil {
  471. a.logger.Println("[WARN] agent: could not delete pid file ", pidErr)
  472. }
  473. a.logger.Println("[INFO] agent: shutdown complete")
  474. a.shutdown = true
  475. close(a.shutdownCh)
  476. return err
  477. }
  478. // ShutdownCh is used to return a channel that can be
  479. // selected to wait for the agent to perform a shutdown.
  480. func (a *Agent) ShutdownCh() <-chan struct{} {
  481. return a.shutdownCh
  482. }
  483. // JoinLAN is used to have the agent join a LAN cluster
  484. func (a *Agent) JoinLAN(addrs []string) (n int, err error) {
  485. a.logger.Printf("[INFO] agent: (LAN) joining: %v", addrs)
  486. if a.server != nil {
  487. n, err = a.server.JoinLAN(addrs)
  488. } else {
  489. n, err = a.client.JoinLAN(addrs)
  490. }
  491. a.logger.Printf("[INFO] agent: (LAN) joined: %d Err: %v", n, err)
  492. return
  493. }
  494. // JoinWAN is used to have the agent join a WAN cluster
  495. func (a *Agent) JoinWAN(addrs []string) (n int, err error) {
  496. a.logger.Printf("[INFO] agent: (WAN) joining: %v", addrs)
  497. if a.server != nil {
  498. n, err = a.server.JoinWAN(addrs)
  499. } else {
  500. err = fmt.Errorf("Must be a server to join WAN cluster")
  501. }
  502. a.logger.Printf("[INFO] agent: (WAN) joined: %d Err: %v", n, err)
  503. return
  504. }
  505. // ForceLeave is used to remove a failed node from the cluster
  506. func (a *Agent) ForceLeave(node string) (err error) {
  507. a.logger.Printf("[INFO] Force leaving node: %v", node)
  508. if a.server != nil {
  509. err = a.server.RemoveFailedNode(node)
  510. } else {
  511. err = a.client.RemoveFailedNode(node)
  512. }
  513. if err != nil {
  514. a.logger.Printf("[WARN] Failed to remove node: %v", err)
  515. }
  516. return err
  517. }
  518. // LocalMember is used to return the local node
  519. func (a *Agent) LocalMember() serf.Member {
  520. if a.server != nil {
  521. return a.server.LocalMember()
  522. } else {
  523. return a.client.LocalMember()
  524. }
  525. }
  526. // LANMembers is used to retrieve the LAN members
  527. func (a *Agent) LANMembers() []serf.Member {
  528. if a.server != nil {
  529. return a.server.LANMembers()
  530. } else {
  531. return a.client.LANMembers()
  532. }
  533. }
  534. // WANMembers is used to retrieve the WAN members
  535. func (a *Agent) WANMembers() []serf.Member {
  536. if a.server != nil {
  537. return a.server.WANMembers()
  538. } else {
  539. return nil
  540. }
  541. }
  542. // StartSync is called once Services and Checks are registered.
  543. // This is called to prevent a race between clients and the anti-entropy routines
  544. func (a *Agent) StartSync() {
  545. // Start the anti entropy routine
  546. go a.state.antiEntropy(a.shutdownCh)
  547. }
  548. // PauseSync is used to pause anti-entropy while bulk changes are make
  549. func (a *Agent) PauseSync() {
  550. a.state.Pause()
  551. }
  552. // ResumeSync is used to unpause anti-entropy after bulk changes are make
  553. func (a *Agent) ResumeSync() {
  554. a.state.Resume()
  555. }
  556. // Returns the coordinate of this node in the local pool (assumes coordinates
  557. // are enabled, so check that before calling).
  558. func (a *Agent) GetCoordinate() (*coordinate.Coordinate, error) {
  559. if a.config.Server {
  560. return a.server.GetLANCoordinate()
  561. } else {
  562. return a.client.GetCoordinate()
  563. }
  564. }
  565. // sendCoordinate is a long-running loop that periodically sends our coordinate
  566. // to the server. Closing the agent's shutdownChannel will cause this to exit.
  567. func (a *Agent) sendCoordinate() {
  568. for {
  569. rate := a.config.SyncCoordinateRateTarget
  570. min := a.config.SyncCoordinateIntervalMin
  571. intv := lib.RateScaledInterval(rate, min, len(a.LANMembers()))
  572. intv = intv + lib.RandomStagger(intv)
  573. select {
  574. case <-time.After(intv):
  575. members := a.LANMembers()
  576. grok, err := consul.CanServersUnderstandProtocol(members, 3)
  577. if err != nil {
  578. a.logger.Printf("[ERR] agent: failed to check servers: %s", err)
  579. continue
  580. }
  581. if !grok {
  582. a.logger.Printf("[DEBUG] agent: skipping coordinate updates until servers are upgraded")
  583. continue
  584. }
  585. c, err := a.GetCoordinate()
  586. if err != nil {
  587. a.logger.Printf("[ERR] agent: failed to get coordinate: %s", err)
  588. continue
  589. }
  590. // TODO - Consider adding a distance check so we don't send
  591. // an update if the position hasn't changed by more than a
  592. // threshold.
  593. req := structs.CoordinateUpdateRequest{
  594. Datacenter: a.config.Datacenter,
  595. Node: a.config.NodeName,
  596. Coord: c,
  597. WriteRequest: structs.WriteRequest{Token: a.config.ACLToken},
  598. }
  599. var reply struct{}
  600. if err := a.RPC("Coordinate.Update", &req, &reply); err != nil {
  601. a.logger.Printf("[ERR] agent: coordinate update error: %s", err)
  602. continue
  603. }
  604. case <-a.shutdownCh:
  605. return
  606. }
  607. }
  608. }
  609. // reapServicesInternal does a single pass, looking for services to reap.
  610. func (a *Agent) reapServicesInternal() {
  611. reaped := make(map[string]struct{})
  612. for checkID, check := range a.state.CriticalChecks() {
  613. // There's nothing to do if there's no service.
  614. if check.Check.ServiceID == "" {
  615. continue
  616. }
  617. // There might be multiple checks for one service, so
  618. // we don't need to reap multiple times.
  619. serviceID := check.Check.ServiceID
  620. if _, ok := reaped[serviceID]; ok {
  621. continue
  622. }
  623. // See if there's a timeout.
  624. a.checkLock.Lock()
  625. timeout, ok := a.checkReapAfter[checkID]
  626. a.checkLock.Unlock()
  627. // Reap, if necessary. We keep track of which service
  628. // this is so that we won't try to remove it again.
  629. if ok && check.CriticalFor > timeout {
  630. reaped[serviceID] = struct{}{}
  631. a.RemoveService(serviceID, true)
  632. a.logger.Printf("[INFO] agent: Check %q for service %q has been critical for too long; deregistered service",
  633. checkID, serviceID)
  634. }
  635. }
  636. }
  637. // reapServices is a long running goroutine that looks for checks that have been
  638. // critical too long and dregisters their associated services.
  639. func (a *Agent) reapServices() {
  640. for {
  641. select {
  642. case <-time.After(a.config.CheckReapInterval):
  643. a.reapServicesInternal()
  644. case <-a.shutdownCh:
  645. return
  646. }
  647. }
  648. }
  649. // persistService saves a service definition to a JSON file in the data dir
  650. func (a *Agent) persistService(service *structs.NodeService) error {
  651. svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID))
  652. wrapped := persistedService{
  653. Token: a.state.ServiceToken(service.ID),
  654. Service: service,
  655. }
  656. encoded, err := json.Marshal(wrapped)
  657. if err != nil {
  658. return err
  659. }
  660. return writeFileAtomic(svcPath, encoded)
  661. }
  662. // purgeService removes a persisted service definition file from the data dir
  663. func (a *Agent) purgeService(serviceID string) error {
  664. svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(serviceID))
  665. if _, err := os.Stat(svcPath); err == nil {
  666. return os.Remove(svcPath)
  667. }
  668. return nil
  669. }
  670. // persistCheck saves a check definition to the local agent's state directory
  671. func (a *Agent) persistCheck(check *structs.HealthCheck, chkType *CheckType) error {
  672. checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(check.CheckID))
  673. // Create the persisted check
  674. wrapped := persistedCheck{
  675. Check: check,
  676. ChkType: chkType,
  677. Token: a.state.CheckToken(check.CheckID),
  678. }
  679. encoded, err := json.Marshal(wrapped)
  680. if err != nil {
  681. return err
  682. }
  683. return writeFileAtomic(checkPath, encoded)
  684. }
  685. // purgeCheck removes a persisted check definition file from the data dir
  686. func (a *Agent) purgeCheck(checkID types.CheckID) error {
  687. checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(checkID))
  688. if _, err := os.Stat(checkPath); err == nil {
  689. return os.Remove(checkPath)
  690. }
  691. return nil
  692. }
  693. // writeFileAtomic writes the given contents to a temporary file in the same
  694. // directory, does an fsync and then renames the file to its real path
  695. func writeFileAtomic(path string, contents []byte) error {
  696. uuid, err := uuid.GenerateUUID()
  697. if err != nil {
  698. return err
  699. }
  700. tempPath := fmt.Sprintf("%s-%s.tmp", path, uuid)
  701. if err := os.MkdirAll(filepath.Dir(path), 0700); err != nil {
  702. return err
  703. }
  704. fh, err := os.OpenFile(tempPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0600)
  705. if err != nil {
  706. return err
  707. }
  708. if _, err := fh.Write(contents); err != nil {
  709. return err
  710. }
  711. if err := fh.Sync(); err != nil {
  712. return err
  713. }
  714. if err := fh.Close(); err != nil {
  715. return err
  716. }
  717. return os.Rename(tempPath, path)
  718. }
  719. // AddService is used to add a service entry.
  720. // This entry is persistent and the agent will make a best effort to
  721. // ensure it is registered
  722. func (a *Agent) AddService(service *structs.NodeService, chkTypes CheckTypes, persist bool, token string) error {
  723. if service.Service == "" {
  724. return fmt.Errorf("Service name missing")
  725. }
  726. if service.ID == "" && service.Service != "" {
  727. service.ID = service.Service
  728. }
  729. for _, check := range chkTypes {
  730. if !check.Valid() {
  731. return fmt.Errorf("Check type is not valid")
  732. }
  733. }
  734. // Warn if the service name is incompatible with DNS
  735. if !dnsNameRe.MatchString(service.Service) {
  736. a.logger.Printf("[WARN] Service name %q will not be discoverable "+
  737. "via DNS due to invalid characters. Valid characters include "+
  738. "all alpha-numerics and dashes.", service.Service)
  739. }
  740. // Warn if any tags are incompatible with DNS
  741. for _, tag := range service.Tags {
  742. if !dnsNameRe.MatchString(tag) {
  743. a.logger.Printf("[DEBUG] Service tag %q will not be discoverable "+
  744. "via DNS due to invalid characters. Valid characters include "+
  745. "all alpha-numerics and dashes.", tag)
  746. }
  747. }
  748. // Pause the service syncs during modification
  749. a.PauseSync()
  750. defer a.ResumeSync()
  751. // Take a snapshot of the current state of checks (if any), and
  752. // restore them before resuming anti-entropy.
  753. snap := a.snapshotCheckState()
  754. defer a.restoreCheckState(snap)
  755. // Add the service
  756. a.state.AddService(service, token)
  757. // Persist the service to a file
  758. if persist && !a.config.DevMode {
  759. if err := a.persistService(service); err != nil {
  760. return err
  761. }
  762. }
  763. // Create an associated health check
  764. for i, chkType := range chkTypes {
  765. checkID := fmt.Sprintf("service:%s", service.ID)
  766. if len(chkTypes) > 1 {
  767. checkID += fmt.Sprintf(":%d", i+1)
  768. }
  769. check := &structs.HealthCheck{
  770. Node: a.config.NodeName,
  771. CheckID: types.CheckID(checkID),
  772. Name: fmt.Sprintf("Service '%s' check", service.Service),
  773. Status: structs.HealthCritical,
  774. Notes: chkType.Notes,
  775. ServiceID: service.ID,
  776. ServiceName: service.Service,
  777. }
  778. if chkType.Status != "" {
  779. check.Status = chkType.Status
  780. }
  781. if err := a.AddCheck(check, chkType, persist, token); err != nil {
  782. return err
  783. }
  784. }
  785. return nil
  786. }
  787. // RemoveService is used to remove a service entry.
  788. // The agent will make a best effort to ensure it is deregistered
  789. func (a *Agent) RemoveService(serviceID string, persist bool) error {
  790. // Protect "consul" service from deletion by a user
  791. if a.server != nil && serviceID == consul.ConsulServiceID {
  792. return fmt.Errorf(
  793. "Deregistering the %s service is not allowed",
  794. consul.ConsulServiceID)
  795. }
  796. // Validate ServiceID
  797. if serviceID == "" {
  798. return fmt.Errorf("ServiceID missing")
  799. }
  800. // Remove service immediately
  801. err := a.state.RemoveService(serviceID)
  802. // TODO: Return the error instead of just logging here in Consul 0.8
  803. // For now, keep the current idempotent behavior on deleting a nonexistent service
  804. if err != nil {
  805. a.logger.Printf("[WARN] agent: Failed to deregister service %q: %s", serviceID, err)
  806. return nil
  807. }
  808. // Remove the service from the data dir
  809. if persist {
  810. if err := a.purgeService(serviceID); err != nil {
  811. return err
  812. }
  813. }
  814. // Deregister any associated health checks
  815. for checkID, health := range a.state.Checks() {
  816. if health.ServiceID != serviceID {
  817. continue
  818. }
  819. if err := a.RemoveCheck(checkID, persist); err != nil {
  820. return err
  821. }
  822. }
  823. log.Printf("[DEBUG] agent: removed service %q", serviceID)
  824. return nil
  825. }
  826. // AddCheck is used to add a health check to the agent.
  827. // This entry is persistent and the agent will make a best effort to
  828. // ensure it is registered. The Check may include a CheckType which
  829. // is used to automatically update the check status
  830. func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist bool, token string) error {
  831. if check.CheckID == "" {
  832. return fmt.Errorf("CheckID missing")
  833. }
  834. if chkType != nil && !chkType.Valid() {
  835. return fmt.Errorf("Check type is not valid")
  836. }
  837. if check.ServiceID != "" {
  838. svc, ok := a.state.Services()[check.ServiceID]
  839. if !ok {
  840. return fmt.Errorf("ServiceID %q does not exist", check.ServiceID)
  841. }
  842. check.ServiceName = svc.Service
  843. }
  844. a.checkLock.Lock()
  845. defer a.checkLock.Unlock()
  846. // Check if already registered
  847. if chkType != nil {
  848. if chkType.IsTTL() {
  849. if existing, ok := a.checkTTLs[check.CheckID]; ok {
  850. existing.Stop()
  851. }
  852. ttl := &CheckTTL{
  853. Notify: &a.state,
  854. CheckID: check.CheckID,
  855. TTL: chkType.TTL,
  856. Logger: a.logger,
  857. }
  858. // Restore persisted state, if any
  859. if err := a.loadCheckState(check); err != nil {
  860. a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
  861. check.CheckID, err)
  862. }
  863. ttl.Start()
  864. a.checkTTLs[check.CheckID] = ttl
  865. } else if chkType.IsHTTP() {
  866. if existing, ok := a.checkHTTPs[check.CheckID]; ok {
  867. existing.Stop()
  868. }
  869. if chkType.Interval < MinInterval {
  870. a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  871. check.CheckID, MinInterval))
  872. chkType.Interval = MinInterval
  873. }
  874. http := &CheckHTTP{
  875. Notify: &a.state,
  876. CheckID: check.CheckID,
  877. HTTP: chkType.HTTP,
  878. Interval: chkType.Interval,
  879. Timeout: chkType.Timeout,
  880. Logger: a.logger,
  881. TLSSkipVerify: chkType.TLSSkipVerify,
  882. }
  883. http.Start()
  884. a.checkHTTPs[check.CheckID] = http
  885. } else if chkType.IsTCP() {
  886. if existing, ok := a.checkTCPs[check.CheckID]; ok {
  887. existing.Stop()
  888. }
  889. if chkType.Interval < MinInterval {
  890. a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  891. check.CheckID, MinInterval))
  892. chkType.Interval = MinInterval
  893. }
  894. tcp := &CheckTCP{
  895. Notify: &a.state,
  896. CheckID: check.CheckID,
  897. TCP: chkType.TCP,
  898. Interval: chkType.Interval,
  899. Timeout: chkType.Timeout,
  900. Logger: a.logger,
  901. }
  902. tcp.Start()
  903. a.checkTCPs[check.CheckID] = tcp
  904. } else if chkType.IsDocker() {
  905. if existing, ok := a.checkDockers[check.CheckID]; ok {
  906. existing.Stop()
  907. }
  908. if chkType.Interval < MinInterval {
  909. a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  910. check.CheckID, MinInterval))
  911. chkType.Interval = MinInterval
  912. }
  913. dockerCheck := &CheckDocker{
  914. Notify: &a.state,
  915. CheckID: check.CheckID,
  916. DockerContainerID: chkType.DockerContainerID,
  917. Shell: chkType.Shell,
  918. Script: chkType.Script,
  919. Interval: chkType.Interval,
  920. Logger: a.logger,
  921. }
  922. if err := dockerCheck.Init(); err != nil {
  923. return err
  924. }
  925. dockerCheck.Start()
  926. a.checkDockers[check.CheckID] = dockerCheck
  927. } else if chkType.IsMonitor() {
  928. if existing, ok := a.checkMonitors[check.CheckID]; ok {
  929. existing.Stop()
  930. }
  931. if chkType.Interval < MinInterval {
  932. a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  933. check.CheckID, MinInterval))
  934. chkType.Interval = MinInterval
  935. }
  936. monitor := &CheckMonitor{
  937. Notify: &a.state,
  938. CheckID: check.CheckID,
  939. Script: chkType.Script,
  940. Interval: chkType.Interval,
  941. Timeout: chkType.Timeout,
  942. Logger: a.logger,
  943. }
  944. monitor.Start()
  945. a.checkMonitors[check.CheckID] = monitor
  946. } else {
  947. return fmt.Errorf("Check type is not valid")
  948. }
  949. if chkType.DeregisterCriticalServiceAfter > 0 {
  950. timeout := chkType.DeregisterCriticalServiceAfter
  951. if timeout < a.config.CheckDeregisterIntervalMin {
  952. timeout = a.config.CheckDeregisterIntervalMin
  953. a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has deregister interval below minimum of %v",
  954. check.CheckID, a.config.CheckDeregisterIntervalMin))
  955. }
  956. a.checkReapAfter[check.CheckID] = timeout
  957. } else {
  958. delete(a.checkReapAfter, check.CheckID)
  959. }
  960. }
  961. // Add to the local state for anti-entropy
  962. a.state.AddCheck(check, token)
  963. // Persist the check
  964. if persist && !a.config.DevMode {
  965. return a.persistCheck(check, chkType)
  966. }
  967. return nil
  968. }
  969. // RemoveCheck is used to remove a health check.
  970. // The agent will make a best effort to ensure it is deregistered
  971. func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error {
  972. // Validate CheckID
  973. if checkID == "" {
  974. return fmt.Errorf("CheckID missing")
  975. }
  976. // Add to the local state for anti-entropy
  977. a.state.RemoveCheck(checkID)
  978. a.checkLock.Lock()
  979. defer a.checkLock.Unlock()
  980. // Stop any monitors
  981. delete(a.checkReapAfter, checkID)
  982. if check, ok := a.checkMonitors[checkID]; ok {
  983. check.Stop()
  984. delete(a.checkMonitors, checkID)
  985. }
  986. if check, ok := a.checkHTTPs[checkID]; ok {
  987. check.Stop()
  988. delete(a.checkHTTPs, checkID)
  989. }
  990. if check, ok := a.checkTCPs[checkID]; ok {
  991. check.Stop()
  992. delete(a.checkTCPs, checkID)
  993. }
  994. if check, ok := a.checkTTLs[checkID]; ok {
  995. check.Stop()
  996. delete(a.checkTTLs, checkID)
  997. }
  998. if persist {
  999. if err := a.purgeCheck(checkID); err != nil {
  1000. return err
  1001. }
  1002. if err := a.purgeCheckState(checkID); err != nil {
  1003. return err
  1004. }
  1005. }
  1006. log.Printf("[DEBUG] agent: removed check %q", checkID)
  1007. return nil
  1008. }
  1009. // updateTTLCheck is used to update the status of a TTL check via the Agent API.
  1010. func (a *Agent) updateTTLCheck(checkID types.CheckID, status, output string) error {
  1011. a.checkLock.Lock()
  1012. defer a.checkLock.Unlock()
  1013. // Grab the TTL check.
  1014. check, ok := a.checkTTLs[checkID]
  1015. if !ok {
  1016. return fmt.Errorf("CheckID %q does not have associated TTL", checkID)
  1017. }
  1018. // Set the status through CheckTTL to reset the TTL.
  1019. check.SetStatus(status, output)
  1020. // We don't write any files in dev mode so bail here.
  1021. if a.config.DevMode {
  1022. return nil
  1023. }
  1024. // Persist the state so the TTL check can come up in a good state after
  1025. // an agent restart, especially with long TTL values.
  1026. if err := a.persistCheckState(check, status, output); err != nil {
  1027. return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
  1028. }
  1029. return nil
  1030. }
  1031. // persistCheckState is used to record the check status into the data dir.
  1032. // This allows the state to be restored on a later agent start. Currently
  1033. // only useful for TTL based checks.
  1034. func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error {
  1035. // Create the persisted state
  1036. state := persistedCheckState{
  1037. CheckID: check.CheckID,
  1038. Status: status,
  1039. Output: output,
  1040. Expires: time.Now().Add(check.TTL).Unix(),
  1041. }
  1042. // Encode the state
  1043. buf, err := json.Marshal(state)
  1044. if err != nil {
  1045. return err
  1046. }
  1047. // Create the state dir if it doesn't exist
  1048. dir := filepath.Join(a.config.DataDir, checkStateDir)
  1049. if err := os.MkdirAll(dir, 0700); err != nil {
  1050. return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
  1051. }
  1052. // Write the state to the file
  1053. file := filepath.Join(dir, checkIDHash(check.CheckID))
  1054. // Create temp file in same dir, to make more likely atomic
  1055. tempFile := file + ".tmp"
  1056. // persistCheckState is called frequently, so don't use writeFileAtomic to avoid calling fsync here
  1057. if err := ioutil.WriteFile(tempFile, buf, 0600); err != nil {
  1058. return fmt.Errorf("failed writing temp file %q: %s", tempFile, err)
  1059. }
  1060. if err := os.Rename(tempFile, file); err != nil {
  1061. return fmt.Errorf("failed to rename temp file from %q to %q: %s", tempFile, file, err)
  1062. }
  1063. return nil
  1064. }
  1065. // loadCheckState is used to restore the persisted state of a check.
  1066. func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
  1067. // Try to read the persisted state for this check
  1068. file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(check.CheckID))
  1069. buf, err := ioutil.ReadFile(file)
  1070. if err != nil {
  1071. if os.IsNotExist(err) {
  1072. return nil
  1073. }
  1074. return fmt.Errorf("failed reading file %q: %s", file, err)
  1075. }
  1076. // Decode the state data
  1077. var p persistedCheckState
  1078. if err := json.Unmarshal(buf, &p); err != nil {
  1079. a.logger.Printf("[ERROR] agent: failed decoding check state: %s", err)
  1080. return a.purgeCheckState(check.CheckID)
  1081. }
  1082. // Check if the state has expired
  1083. if time.Now().Unix() >= p.Expires {
  1084. a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
  1085. return a.purgeCheckState(check.CheckID)
  1086. }
  1087. // Restore the fields from the state
  1088. check.Output = p.Output
  1089. check.Status = p.Status
  1090. return nil
  1091. }
  1092. // purgeCheckState is used to purge the state of a check from the data dir
  1093. func (a *Agent) purgeCheckState(checkID types.CheckID) error {
  1094. file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(checkID))
  1095. err := os.Remove(file)
  1096. if os.IsNotExist(err) {
  1097. return nil
  1098. }
  1099. return err
  1100. }
  1101. // Stats is used to get various debugging state from the sub-systems
  1102. func (a *Agent) Stats() map[string]map[string]string {
  1103. toString := func(v uint64) string {
  1104. return strconv.FormatUint(v, 10)
  1105. }
  1106. var stats map[string]map[string]string
  1107. if a.server != nil {
  1108. stats = a.server.Stats()
  1109. } else {
  1110. stats = a.client.Stats()
  1111. }
  1112. stats["agent"] = map[string]string{
  1113. "check_monitors": toString(uint64(len(a.checkMonitors))),
  1114. "check_ttls": toString(uint64(len(a.checkTTLs))),
  1115. "checks": toString(uint64(len(a.state.checks))),
  1116. "services": toString(uint64(len(a.state.services))),
  1117. }
  1118. revision := a.config.Revision
  1119. if len(revision) > 8 {
  1120. revision = revision[:8]
  1121. }
  1122. stats["build"] = map[string]string{
  1123. "revision": revision,
  1124. "version": a.config.Version,
  1125. "prerelease": a.config.VersionPrerelease,
  1126. }
  1127. return stats
  1128. }
  1129. // storePid is used to write out our PID to a file if necessary
  1130. func (a *Agent) storePid() error {
  1131. // Quit fast if no pidfile
  1132. pidPath := a.config.PidFile
  1133. if pidPath == "" {
  1134. return nil
  1135. }
  1136. // Open the PID file
  1137. pidFile, err := os.OpenFile(pidPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666)
  1138. if err != nil {
  1139. return fmt.Errorf("Could not open pid file: %v", err)
  1140. }
  1141. defer pidFile.Close()
  1142. // Write out the PID
  1143. pid := os.Getpid()
  1144. _, err = pidFile.WriteString(fmt.Sprintf("%d", pid))
  1145. if err != nil {
  1146. return fmt.Errorf("Could not write to pid file: %s", err)
  1147. }
  1148. return nil
  1149. }
  1150. // deletePid is used to delete our PID on exit
  1151. func (a *Agent) deletePid() error {
  1152. // Quit fast if no pidfile
  1153. pidPath := a.config.PidFile
  1154. if pidPath == "" {
  1155. return nil
  1156. }
  1157. stat, err := os.Stat(pidPath)
  1158. if err != nil {
  1159. return fmt.Errorf("Could not remove pid file: %s", err)
  1160. }
  1161. if stat.IsDir() {
  1162. return fmt.Errorf("Specified pid file path is directory")
  1163. }
  1164. err = os.Remove(pidPath)
  1165. if err != nil {
  1166. return fmt.Errorf("Could not remove pid file: %s", err)
  1167. }
  1168. return nil
  1169. }
  1170. // loadServices will load service definitions from configuration and persisted
  1171. // definitions on disk, and load them into the local agent.
  1172. func (a *Agent) loadServices(conf *Config) error {
  1173. // Register the services from config
  1174. for _, service := range conf.Services {
  1175. ns := service.NodeService()
  1176. chkTypes := service.CheckTypes()
  1177. if err := a.AddService(ns, chkTypes, false, service.Token); err != nil {
  1178. return fmt.Errorf("Failed to register service '%s': %v", service.ID, err)
  1179. }
  1180. }
  1181. // Load any persisted services
  1182. svcDir := filepath.Join(a.config.DataDir, servicesDir)
  1183. files, err := ioutil.ReadDir(svcDir)
  1184. if err != nil {
  1185. if os.IsNotExist(err) {
  1186. return nil
  1187. }
  1188. return fmt.Errorf("Failed reading services dir %q: %s", svcDir, err)
  1189. }
  1190. for _, fi := range files {
  1191. // Skip all dirs
  1192. if fi.IsDir() {
  1193. continue
  1194. }
  1195. // Open the file for reading
  1196. file := filepath.Join(svcDir, fi.Name())
  1197. fh, err := os.Open(file)
  1198. if err != nil {
  1199. return fmt.Errorf("failed opening service file %q: %s", file, err)
  1200. }
  1201. // Read the contents into a buffer
  1202. buf, err := ioutil.ReadAll(fh)
  1203. fh.Close()
  1204. if err != nil {
  1205. return fmt.Errorf("failed reading service file %q: %s", file, err)
  1206. }
  1207. // Try decoding the service definition
  1208. var p persistedService
  1209. if err := json.Unmarshal(buf, &p); err != nil {
  1210. // Backwards-compatibility for pre-0.5.1 persisted services
  1211. if err := json.Unmarshal(buf, &p.Service); err != nil {
  1212. return fmt.Errorf("failed decoding service file %q: %s", file, err)
  1213. }
  1214. }
  1215. serviceID := p.Service.ID
  1216. if _, ok := a.state.services[serviceID]; ok {
  1217. // Purge previously persisted service. This allows config to be
  1218. // preferred over services persisted from the API.
  1219. a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q",
  1220. serviceID, file)
  1221. if err := a.purgeService(serviceID); err != nil {
  1222. return fmt.Errorf("failed purging service %q: %s", serviceID, err)
  1223. }
  1224. } else {
  1225. a.logger.Printf("[DEBUG] agent: restored service definition %q from %q",
  1226. serviceID, file)
  1227. if err := a.AddService(p.Service, nil, false, p.Token); err != nil {
  1228. return fmt.Errorf("failed adding service %q: %s", serviceID, err)
  1229. }
  1230. }
  1231. }
  1232. return nil
  1233. }
  1234. // unloadServices will deregister all services other than the 'consul' service
  1235. // known to the local agent.
  1236. func (a *Agent) unloadServices() error {
  1237. for _, service := range a.state.Services() {
  1238. if service.ID == consul.ConsulServiceID {
  1239. continue
  1240. }
  1241. if err := a.RemoveService(service.ID, false); err != nil {
  1242. return fmt.Errorf("Failed deregistering service '%s': %v", service.ID, err)
  1243. }
  1244. }
  1245. return nil
  1246. }
  1247. // loadChecks loads check definitions and/or persisted check definitions from
  1248. // disk and re-registers them with the local agent.
  1249. func (a *Agent) loadChecks(conf *Config) error {
  1250. // Register the checks from config
  1251. for _, check := range conf.Checks {
  1252. health := check.HealthCheck(conf.NodeName)
  1253. chkType := &check.CheckType
  1254. if err := a.AddCheck(health, chkType, false, check.Token); err != nil {
  1255. return fmt.Errorf("Failed to register check '%s': %v %v", check.Name, err, check)
  1256. }
  1257. }
  1258. // Load any persisted checks
  1259. checkDir := filepath.Join(a.config.DataDir, checksDir)
  1260. files, err := ioutil.ReadDir(checkDir)
  1261. if err != nil {
  1262. if os.IsNotExist(err) {
  1263. return nil
  1264. }
  1265. return fmt.Errorf("Failed reading checks dir %q: %s", checkDir, err)
  1266. }
  1267. for _, fi := range files {
  1268. // Ignore dirs - we only care about the check definition files
  1269. if fi.IsDir() {
  1270. continue
  1271. }
  1272. // Open the file for reading
  1273. file := filepath.Join(checkDir, fi.Name())
  1274. fh, err := os.Open(file)
  1275. if err != nil {
  1276. return fmt.Errorf("Failed opening check file %q: %s", file, err)
  1277. }
  1278. // Read the contents into a buffer
  1279. buf, err := ioutil.ReadAll(fh)
  1280. fh.Close()
  1281. if err != nil {
  1282. return fmt.Errorf("failed reading check file %q: %s", file, err)
  1283. }
  1284. // Decode the check
  1285. var p persistedCheck
  1286. if err := json.Unmarshal(buf, &p); err != nil {
  1287. return fmt.Errorf("Failed decoding check file %q: %s", file, err)
  1288. }
  1289. checkID := p.Check.CheckID
  1290. if _, ok := a.state.checks[checkID]; ok {
  1291. // Purge previously persisted check. This allows config to be
  1292. // preferred over persisted checks from the API.
  1293. a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q",
  1294. checkID, file)
  1295. if err := a.purgeCheck(checkID); err != nil {
  1296. return fmt.Errorf("Failed purging check %q: %s", checkID, err)
  1297. }
  1298. } else {
  1299. // Default check to critical to avoid placing potentially unhealthy
  1300. // services into the active pool
  1301. p.Check.Status = structs.HealthCritical
  1302. if err := a.AddCheck(p.Check, p.ChkType, false, p.Token); err != nil {
  1303. // Purge the check if it is unable to be restored.
  1304. a.logger.Printf("[WARN] agent: Failed to restore check %q: %s",
  1305. checkID, err)
  1306. if err := a.purgeCheck(checkID); err != nil {
  1307. return fmt.Errorf("Failed purging check %q: %s", checkID, err)
  1308. }
  1309. }
  1310. a.logger.Printf("[DEBUG] agent: restored health check %q from %q",
  1311. p.Check.CheckID, file)
  1312. }
  1313. }
  1314. return nil
  1315. }
  1316. // unloadChecks will deregister all checks known to the local agent.
  1317. func (a *Agent) unloadChecks() error {
  1318. for _, check := range a.state.Checks() {
  1319. if err := a.RemoveCheck(check.CheckID, false); err != nil {
  1320. return fmt.Errorf("Failed deregistering check '%s': %s", check.CheckID, err)
  1321. }
  1322. }
  1323. return nil
  1324. }
  1325. // snapshotCheckState is used to snapshot the current state of the health
  1326. // checks. This is done before we reload our checks, so that we can properly
  1327. // restore into the same state.
  1328. func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck {
  1329. return a.state.Checks()
  1330. }
  1331. // restoreCheckState is used to reset the health state based on a snapshot.
  1332. // This is done after we finish the reload to avoid any unnecessary flaps
  1333. // in health state and potential session invalidations.
  1334. func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
  1335. for id, check := range snap {
  1336. a.state.UpdateCheck(id, check.Status, check.Output)
  1337. }
  1338. }
  1339. // serviceMaintCheckID returns the ID of a given service's maintenance check
  1340. func serviceMaintCheckID(serviceID string) types.CheckID {
  1341. return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
  1342. }
  1343. // EnableServiceMaintenance will register a false health check against the given
  1344. // service ID with critical status. This will exclude the service from queries.
  1345. func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error {
  1346. service, ok := a.state.Services()[serviceID]
  1347. if !ok {
  1348. return fmt.Errorf("No service registered with ID %q", serviceID)
  1349. }
  1350. // Check if maintenance mode is not already enabled
  1351. checkID := serviceMaintCheckID(serviceID)
  1352. if _, ok := a.state.Checks()[checkID]; ok {
  1353. return nil
  1354. }
  1355. // Use default notes if no reason provided
  1356. if reason == "" {
  1357. reason = defaultServiceMaintReason
  1358. }
  1359. // Create and register the critical health check
  1360. check := &structs.HealthCheck{
  1361. Node: a.config.NodeName,
  1362. CheckID: checkID,
  1363. Name: "Service Maintenance Mode",
  1364. Notes: reason,
  1365. ServiceID: service.ID,
  1366. ServiceName: service.Service,
  1367. Status: structs.HealthCritical,
  1368. }
  1369. a.AddCheck(check, nil, true, token)
  1370. a.logger.Printf("[INFO] agent: Service %q entered maintenance mode", serviceID)
  1371. return nil
  1372. }
  1373. // DisableServiceMaintenance will deregister the fake maintenance mode check
  1374. // if the service has been marked as in maintenance.
  1375. func (a *Agent) DisableServiceMaintenance(serviceID string) error {
  1376. if _, ok := a.state.Services()[serviceID]; !ok {
  1377. return fmt.Errorf("No service registered with ID %q", serviceID)
  1378. }
  1379. // Check if maintenance mode is enabled
  1380. checkID := serviceMaintCheckID(serviceID)
  1381. if _, ok := a.state.Checks()[checkID]; !ok {
  1382. return nil
  1383. }
  1384. // Deregister the maintenance check
  1385. a.RemoveCheck(checkID, true)
  1386. a.logger.Printf("[INFO] agent: Service %q left maintenance mode", serviceID)
  1387. return nil
  1388. }
  1389. // EnableNodeMaintenance places a node into maintenance mode.
  1390. func (a *Agent) EnableNodeMaintenance(reason, token string) {
  1391. // Ensure node maintenance is not already enabled
  1392. if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
  1393. return
  1394. }
  1395. // Use a default notes value
  1396. if reason == "" {
  1397. reason = defaultNodeMaintReason
  1398. }
  1399. // Create and register the node maintenance check
  1400. check := &structs.HealthCheck{
  1401. Node: a.config.NodeName,
  1402. CheckID: nodeMaintCheckID,
  1403. Name: "Node Maintenance Mode",
  1404. Notes: reason,
  1405. Status: structs.HealthCritical,
  1406. }
  1407. a.AddCheck(check, nil, true, token)
  1408. a.logger.Printf("[INFO] agent: Node entered maintenance mode")
  1409. }
  1410. // DisableNodeMaintenance removes a node from maintenance mode
  1411. func (a *Agent) DisableNodeMaintenance() {
  1412. if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
  1413. return
  1414. }
  1415. a.RemoveCheck(nodeMaintCheckID, true)
  1416. a.logger.Printf("[INFO] agent: Node left maintenance mode")
  1417. }
  1418. // InjectEndpoint overrides the given endpoint with a substitute one. Note
  1419. // that not all agent methods use this mechanism, and that is should only
  1420. // be used for testing.
  1421. func (a *Agent) InjectEndpoint(endpoint string, handler interface{}) error {
  1422. if a.server == nil {
  1423. return fmt.Errorf("agent must be a server")
  1424. }
  1425. if err := a.server.InjectEndpoint(handler); err != nil {
  1426. return err
  1427. }
  1428. name := reflect.Indirect(reflect.ValueOf(handler)).Type().Name()
  1429. a.endpoints[endpoint] = name
  1430. a.logger.Printf("[WARN] agent: endpoint injected; this should only be used for testing")
  1431. return nil
  1432. }
  1433. // getEndpoint returns the endpoint name to use for the given endpoint,
  1434. // which may be overridden.
  1435. func (a *Agent) getEndpoint(endpoint string) string {
  1436. if override, ok := a.endpoints[endpoint]; ok {
  1437. return override
  1438. }
  1439. return endpoint
  1440. }