PageRenderTime 45ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/components/builder-jobsrv/src/server/mod.rs

https://gitlab.com/admin-github-cloud/habitat
Rust | 421 lines | 354 code | 42 blank | 25 comment | 22 complexity | dfd422b594eeb965f22590c08b4997cb MD5 | raw file
  1. // Copyright (c) 2016 Chef Software Inc. and/or applicable contributors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. pub mod handlers;
  15. use std::ops::Deref;
  16. use std::sync::{mpsc, Arc, RwLock};
  17. use std::time::{Duration, Instant};
  18. use std::thread::{self, JoinHandle};
  19. use dbcache::InstaSet;
  20. use linked_hash_map::LinkedHashMap;
  21. use hab_net::{Application, Dispatcher, Supervisor};
  22. use hab_net::config::ToAddrString;
  23. use hab_net::server::{Envelope, NetIdent, RouteConn, Service, ZMQ_CONTEXT};
  24. use protobuf::{parse_from_bytes, Message};
  25. use protocol::net;
  26. use protocol::jobsrv;
  27. use zmq;
  28. use config::Config;
  29. use data_store::DataStore;
  30. use error::{Error, Result};
  31. const BE_LISTEN_ADDR: &'static str = "inproc://backend";
  32. const WORKER_MGR_ADDR: &'static str = "inproc://work-manager";
  33. const WORKER_TIMEOUT_MS: u64 = 33_000;
  34. pub struct ServerState {
  35. pub work_manager: WorkManager,
  36. datastore: Option<DataStore>,
  37. }
  38. impl ServerState {
  39. pub fn datastore(&mut self) -> &mut DataStore {
  40. self.datastore.as_mut().unwrap()
  41. }
  42. }
  43. impl Default for ServerState {
  44. fn default() -> Self {
  45. let work_manager = WorkManager::default();
  46. ServerState {
  47. datastore: None,
  48. work_manager: work_manager,
  49. }
  50. }
  51. }
  52. pub struct WorkManager {
  53. socket: zmq::Socket,
  54. }
  55. impl WorkManager {
  56. pub fn connect(&mut self) -> Result<()> {
  57. try!(self.socket.connect(WORKER_MGR_ADDR));
  58. Ok(())
  59. }
  60. pub fn notify_work(&mut self) -> Result<()> {
  61. try!(self.socket.send(&[1], 0));
  62. Ok(())
  63. }
  64. }
  65. impl Default for WorkManager {
  66. fn default() -> WorkManager {
  67. let socket = (**ZMQ_CONTEXT).as_mut().socket(zmq::DEALER).unwrap();
  68. socket.set_sndhwm(1).unwrap();
  69. socket.set_linger(0).unwrap();
  70. socket.set_immediate(true).unwrap();
  71. WorkManager { socket: socket }
  72. }
  73. }
  74. pub struct Worker {
  75. config: Arc<RwLock<Config>>,
  76. state: ServerState,
  77. }
  78. impl Worker {
  79. fn try_connect_datastore(&mut self) {
  80. loop {
  81. let result = {
  82. let cfg = self.config.read().unwrap();
  83. DataStore::open(cfg.deref())
  84. };
  85. match result {
  86. Ok(datastore) => {
  87. self.state.datastore = Some(datastore);
  88. break;
  89. }
  90. Err(e) => {
  91. error!("{}", e);
  92. thread::sleep(Duration::from_millis(5000));
  93. }
  94. }
  95. }
  96. }
  97. }
  98. impl Dispatcher for Worker {
  99. type Config = Config;
  100. type Error = Error;
  101. type State = ServerState;
  102. fn message_queue() -> &'static str {
  103. BE_LISTEN_ADDR
  104. }
  105. fn dispatch(message: &mut Envelope,
  106. sock: &mut zmq::Socket,
  107. state: &mut ServerState)
  108. -> Result<()> {
  109. match message.message_id() {
  110. "JobCreate" => handlers::job_create(message, sock, state),
  111. "JobGet" => handlers::job_get(message, sock, state),
  112. _ => panic!("unexpected message: {:?}", message.message_id()),
  113. }
  114. }
  115. fn context(&mut self) -> &mut zmq::Context {
  116. (**ZMQ_CONTEXT).as_mut()
  117. }
  118. fn new(config: Arc<RwLock<Config>>) -> Self {
  119. let state = ServerState::default();
  120. Worker {
  121. config: config,
  122. state: state,
  123. }
  124. }
  125. fn init(&mut self) -> Result<()> {
  126. try!(self.state.work_manager.connect());
  127. self.try_connect_datastore();
  128. Ok(())
  129. }
  130. fn state(&mut self) -> &mut ServerState {
  131. &mut self.state
  132. }
  133. }
  134. pub struct Server {
  135. config: Arc<RwLock<Config>>,
  136. router: RouteConn,
  137. be_sock: zmq::Socket,
  138. }
  139. impl Server {
  140. pub fn new(config: Config) -> Result<Self> {
  141. let router = try!(RouteConn::new(Self::net_ident(), (**ZMQ_CONTEXT).as_mut()));
  142. let be = try!((**ZMQ_CONTEXT).as_mut().socket(zmq::DEALER));
  143. Ok(Server {
  144. config: Arc::new(RwLock::new(config)),
  145. router: router,
  146. be_sock: be,
  147. })
  148. }
  149. pub fn reconfigure(&self, config: Config) -> Result<()> {
  150. {
  151. let mut cfg = self.config.write().unwrap();
  152. *cfg = config;
  153. }
  154. // * disconnect from removed routers
  155. // * notify remaining routers of any shard hosting changes
  156. // * connect to new shard servers
  157. Ok(())
  158. }
  159. }
  160. impl Application for Server {
  161. type Error = Error;
  162. fn run(&mut self) -> Result<()> {
  163. try!(self.be_sock.bind(BE_LISTEN_ADDR));
  164. let cfg1 = self.config.clone();
  165. let cfg2 = self.config.clone();
  166. let sup: Supervisor<Worker> = Supervisor::new(cfg1);
  167. let work_mgr = try!(WorkerManager::start(cfg2));
  168. try!(sup.start());
  169. try!(self.connect());
  170. try!(zmq::proxy(&mut self.router.socket, &mut self.be_sock));
  171. work_mgr.join().unwrap();
  172. Ok(())
  173. }
  174. }
  175. impl Service for Server {
  176. type Application = Self;
  177. type Config = Config;
  178. type Error = Error;
  179. fn protocol() -> net::Protocol {
  180. net::Protocol::JobSrv
  181. }
  182. fn config(&self) -> &Arc<RwLock<Self::Config>> {
  183. &self.config
  184. }
  185. fn conn(&self) -> &RouteConn {
  186. &self.router
  187. }
  188. fn conn_mut(&mut self) -> &mut RouteConn {
  189. &mut self.router
  190. }
  191. }
  192. impl NetIdent for Server {}
  193. struct WorkerManager {
  194. config: Arc<RwLock<Config>>,
  195. datastore: DataStore,
  196. hb_sock: zmq::Socket,
  197. rq_sock: zmq::Socket,
  198. work_mgr_sock: zmq::Socket,
  199. msg: zmq::Message,
  200. workers: LinkedHashMap<String, Instant>,
  201. }
  202. impl WorkerManager {
  203. pub fn new(config: Arc<RwLock<Config>>) -> Result<Self> {
  204. let datastore = {
  205. let cfg = config.read().unwrap();
  206. try!(DataStore::open(cfg.deref()))
  207. };
  208. let hb_sock = try!((**ZMQ_CONTEXT).as_mut().socket(zmq::SUB));
  209. let rq_sock = try!((**ZMQ_CONTEXT).as_mut().socket(zmq::ROUTER));
  210. let work_mgr_sock = try!((**ZMQ_CONTEXT).as_mut().socket(zmq::DEALER));
  211. try!(rq_sock.set_router_mandatory(true));
  212. try!(hb_sock.set_subscribe(&[]));
  213. try!(work_mgr_sock.set_rcvhwm(1));
  214. try!(work_mgr_sock.set_linger(0));
  215. try!(work_mgr_sock.set_immediate(true));
  216. let msg = try!(zmq::Message::new());
  217. Ok(WorkerManager {
  218. config: config,
  219. datastore: datastore,
  220. hb_sock: hb_sock,
  221. rq_sock: rq_sock,
  222. work_mgr_sock: work_mgr_sock,
  223. msg: msg,
  224. workers: LinkedHashMap::new(),
  225. })
  226. }
  227. pub fn start(config: Arc<RwLock<Config>>) -> Result<JoinHandle<()>> {
  228. let (tx, rx) = mpsc::sync_channel(1);
  229. let handle = thread::Builder::new()
  230. .name("worker-manager".to_string())
  231. .spawn(move || {
  232. let mut manager = Self::new(config).unwrap();
  233. manager.run(tx).unwrap();
  234. })
  235. .unwrap();
  236. match rx.recv() {
  237. Ok(()) => Ok(handle),
  238. Err(e) => panic!("worker-manager thread startup error, err={}", e),
  239. }
  240. }
  241. fn run(&mut self, rz: mpsc::SyncSender<()>) -> Result<()> {
  242. try!(self.work_mgr_sock.bind(WORKER_MGR_ADDR));
  243. {
  244. let cfg = self.config.read().unwrap();
  245. println!("Listening for commands on {}",
  246. cfg.worker_command_addr.to_addr_string());
  247. try!(self.rq_sock.bind(&cfg.worker_command_addr.to_addr_string()));
  248. println!("Listening for heartbeats on {}",
  249. cfg.worker_heartbeat_addr.to_addr_string());
  250. try!(self.hb_sock.bind(&cfg.worker_heartbeat_addr.to_addr_string()));
  251. }
  252. let mut hb_sock = false;
  253. let mut rq_sock = false;
  254. let mut work_mgr_sock = false;
  255. rz.send(()).unwrap();
  256. loop {
  257. {
  258. let timeout = self.poll_timeout();
  259. let mut items = [self.hb_sock.as_poll_item(1),
  260. self.rq_sock.as_poll_item(1),
  261. self.work_mgr_sock.as_poll_item(1)];
  262. // Poll until timeout or message is received. Checking for the zmq::POLLIN flag on
  263. // a poll item's revents will let you know if you have received a message or not
  264. // on that socket.
  265. try!(zmq::poll(&mut items, timeout));
  266. if (items[0].get_revents() & zmq::POLLIN) > 0 {
  267. hb_sock = true;
  268. }
  269. if (items[1].get_revents() & zmq::POLLIN) > 0 {
  270. rq_sock = true;
  271. }
  272. if (items[2].get_revents() & zmq::POLLIN) > 0 {
  273. work_mgr_sock = true;
  274. }
  275. }
  276. if hb_sock {
  277. try!(self.process_heartbeat());
  278. hb_sock = false;
  279. }
  280. self.expire_workers();
  281. if rq_sock {
  282. try!(self.process_job_status());
  283. rq_sock = false;
  284. }
  285. if work_mgr_sock {
  286. try!(self.distribute_work());
  287. }
  288. }
  289. Ok(())
  290. }
  291. fn poll_timeout(&self) -> i64 {
  292. if let Some((_, expiry)) = self.workers.front() {
  293. let timeout = *expiry - Instant::now();
  294. (timeout.as_secs() as i64 * 1000) + (timeout.subsec_nanos() as i64 / 1000 / 1000)
  295. } else {
  296. -1
  297. }
  298. }
  299. fn distribute_work(&mut self) -> Result<()> {
  300. loop {
  301. let job = match self.datastore.job_queue.peek() {
  302. Ok(Some(job)) => job,
  303. Ok(None) => break,
  304. Err(e) => return Err(e),
  305. };
  306. match self.workers.pop_front() {
  307. Some((worker, _)) => {
  308. debug!("sending work, worker={:?}, job={:?}", worker, job);
  309. if self.rq_sock.send_str(&worker, zmq::SNDMORE).is_err() {
  310. debug!("failed to send, worker went away, worker={:?}", worker);
  311. continue;
  312. }
  313. if self.rq_sock.send(&[], zmq::SNDMORE).is_err() {
  314. debug!("failed to send, worker went away, worker={:?}", worker);
  315. continue;
  316. }
  317. if self.rq_sock.send(&job.write_to_bytes().unwrap(), 0).is_err() {
  318. debug!("failed to send, worker went away, worker={:?}", worker);
  319. continue;
  320. }
  321. // JW TODO: Wait for response back to ensure we can dequeue this. If state
  322. // returned is not processing then we move onto next worker and assume this
  323. // worker is no longer valid. Put work back on queue.
  324. try!(self.datastore.job_queue.dequeue());
  325. // Consume the to-do work notification if the queue is empty.
  326. if try!(self.datastore.job_queue.peek()).is_none() {
  327. try!(self.work_mgr_sock.recv(&mut self.msg, 0));
  328. }
  329. break;
  330. }
  331. None => break,
  332. }
  333. }
  334. Ok(())
  335. }
  336. fn expire_workers(&mut self) {
  337. let now = Instant::now();
  338. loop {
  339. if let Some((_, expiry)) = self.workers.front() {
  340. if expiry >= &now {
  341. break;
  342. }
  343. } else {
  344. break;
  345. }
  346. let worker = self.workers.pop_front();
  347. debug!("expiring worker due to inactivity, worker={:?}", worker);
  348. }
  349. }
  350. fn process_heartbeat(&mut self) -> Result<()> {
  351. try!(self.hb_sock.recv(&mut self.msg, 0));
  352. let heartbeat: jobsrv::Heartbeat = try!(parse_from_bytes(&self.msg));
  353. debug!("heartbeat={:?}", heartbeat);
  354. match heartbeat.get_state() {
  355. jobsrv::WorkerState::Ready => {
  356. let now = Instant::now();
  357. let expiry = now + Duration::from_millis(WORKER_TIMEOUT_MS);
  358. self.workers.insert(heartbeat.get_endpoint().to_string(), expiry);
  359. }
  360. jobsrv::WorkerState::Busy => {
  361. self.workers.remove(heartbeat.get_endpoint());
  362. }
  363. }
  364. Ok(())
  365. }
  366. fn process_job_status(&mut self) -> Result<()> {
  367. // Pop message delimiter
  368. try!(self.rq_sock.recv(&mut self.msg, 0));
  369. // Pop message body
  370. try!(self.rq_sock.recv(&mut self.msg, 0));
  371. let job: jobsrv::Job = try!(parse_from_bytes(&self.msg));
  372. debug!("job_status={:?}", job);
  373. try!(self.datastore.jobs.update(&job));
  374. Ok(())
  375. }
  376. }
  377. pub fn run(config: Config) -> Result<()> {
  378. try!(Server::new(config)).run()
  379. }