/src/support/z_supervisor.erl

https://code.google.com/p/zotonic/ · Erlang · 603 lines · 387 code · 90 blank · 126 comment · 7 complexity · 92dd6256a12fb8d556c67e70f7d96227 MD5 · raw file

  1. %% @author Marc Worrell <marc@worrell.nl>
  2. %% @copyright 2010 Marc Worrell
  3. %% @doc Supervisor with a one_for_one strategy and disabling of too-often-crashing resources.
  4. %% All children of this supervisor should be gen_server/supervisor processes.
  5. %% Copyright 2010 Marc Worrell
  6. %%
  7. %% Licensed under the Apache License, Version 2.0 (the "License");
  8. %% you may not use this file except in compliance with the License.
  9. %% You may obtain a copy of the License at
  10. %%
  11. %% http://www.apache.org/licenses/LICENSE-2.0
  12. %%
  13. %% Unless required by applicable law or agreed to in writing, software
  14. %% distributed under the License is distributed on an "AS IS" BASIS,
  15. %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. %% See the License for the specific language governing permissions and
  17. %% limitations under the License.
  18. -module(z_supervisor).
  19. -author("Marc Worrell <marc@worrell.nl>").
  20. -behaviour(gen_server).
  21. % gen_server exports
  22. -export([
  23. init/1,
  24. handle_call/3,
  25. handle_cast/2,
  26. handle_info/2,
  27. terminate/2,
  28. code_change/3
  29. ]).
  30. % z_supervisor API
  31. -export([
  32. start_link/1,
  33. start_link/2,
  34. add_child/2,
  35. delete_child/2,
  36. start_child/2,
  37. stop_child/2,
  38. restart_child/2,
  39. which_children/1,
  40. running_children/1,
  41. check_children/1,
  42. set_manager_pid/2
  43. ]).
  44. % -record(supervised, {name, mfa, status, pid, crashes = 5, period = 60, period_retry = 1800, period_retries=10}).
  45. -define(INTERVAL, 1000).
  46. -record(state, {waiting=[], running=[], retrying=[], failed=[], stopped=[], timer_ref, manager_pid}).
  47. -record(child_state, {name, pid,
  48. state=waiting, time,
  49. crash_time, crashes=0,
  50. retry_time, retries=0,
  51. fail_time,
  52. child}).
  53. -include_lib("zotonic.hrl").
  54. %%% ---------------------------------------------------
  55. %%% This is a general process supervisor built upon gen_server.erl.
  56. %%% Servers/processes should/could also be built using gen_server.erl.
  57. %%% SupName = {local, atom()} | {global, atom()}.
  58. %%% ---------------------------------------------------
  %% @doc Start an unregistered z_supervisor process linked to the caller.
  %% Args is handed to init/1, which treats it as the list of initial
  %% #child_spec{} records.
  start_link(Args) ->
      Options = [],
      gen_server:start_link(?MODULE, Args, Options).
  %% @doc Start a z_supervisor process registered as SupName and linked to
  %% the caller. SupName = {local, atom()} | {global, atom()}.
  %% Args is handed to init/1 as the list of initial children.
  start_link(SupName, Args) ->
      Options = [],
      gen_server:start_link(SupName, ?MODULE, Args, Options).
  %% @doc Register a new child with the supervisor (synchronous).
  %% Replies 'ok', or {error, duplicate_name} when a child with the same name
  %% is already known. The handler puts the new child on the 'stopped' list;
  %% use start_child/2 or restart_child/2 to actually start it.
  add_child(Pid, ChildSpec) ->
      Request = {add_child, ChildSpec},
      gen_server:call(Pid, Request).
  %% @doc Asynchronously delete the child called Name: it is terminated when
  %% it has a running process and then forgotten by the supervisor.
  %% Unknown names are silently ignored.
  delete_child(Pid, Name) ->
      Request = {delete_child, Name},
      gen_server:cast(Pid, Request).
  %% @doc Start the child called Name when it is not already running
  %% (synchronous). Replies 'ok', or {error, unknown_child} when the
  %% supervisor does not know the name.
  start_child(Pid, Name) ->
      Request = {start_child, Name},
      gen_server:call(Pid, Request).
  %% @doc Asynchronously stop the child called Name; it is terminated and
  %% kept on the supervisor's 'stopped' list. Unknown names are ignored.
  stop_child(Pid, Name) ->
      Request = {stop_child, Name},
      gen_server:cast(Pid, Request).
  %% @doc Terminate the child called Name (when it is running) and start it
  %% again (synchronous). Replies 'ok' or {error, unknown_child}.
  restart_child(Pid, Name) ->
      Request = {restart_child, Name},
      gen_server:call(Pid, Request).
  %% @doc Fetch all children grouped per run state. The reply is a list of
  %% {waiting | running | retrying | failed | stopped, ChildInfoList}.
  which_children(Pid) ->
      Request = which_children,
      gen_server:call(Pid, Request).
  %% @doc Fetch the names of all children that are currently on the
  %% supervisor's 'running' list.
  running_children(Pid) ->
      Request = running_children,
      gen_server:call(Pid, Request).
  %% @doc Asynchronously ask the supervisor to start its waiting children and
  %% to re-try the children on the 'retrying' and 'failed' lists that are due.
  %% init/1 schedules this call every ?INTERVAL milliseconds.
  check_children(Pid) ->
      Request = check_children,
      gen_server:cast(Pid, Request).
  %% @doc Asynchronously set the manager process of this supervisor. The
  %% manager receives 'supervisor_child_started' and 'supervisor_child_stopped'
  %% casts for the supervised children.
  set_manager_pid(Pid, ManagerPid) ->
      Request = {set_manager_pid, ManagerPid},
      gen_server:cast(Pid, Request).
  90. %%====================================================================
  91. %% gen_server callbacks
  92. %%====================================================================
  %% @spec init(InitialChildren) -> {ok, State}
  %% @doc Initiate the supervisor. Exits are trapped so that child exits
  %% arrive as 'EXIT' messages, a periodic check_children/1 call is scheduled
  %% every ?INTERVAL milliseconds, and every initial #child_spec{} is queued
  %% on the 'waiting' list (it is started by the first check).
  init(InitialChildren) ->
      process_flag(trap_exit, true),
      {ok, TimerRef} = timer:apply_interval(?INTERVAL, ?MODULE, check_children, [self()]),
      ToWaiting = fun(Spec) ->
                      #child_state{
                          name=Spec#child_spec.name,
                          child=Spec,
                          state=starting,
                          time=erlang:localtime()
                      }
                  end,
      InitialState = #state{
          waiting=lists:map(ToWaiting, InitialChildren),
          timer_ref=TimerRef
      },
      {ok, InitialState}.
  %% @spec handle_call(Request, From, State) -> {reply, Reply, State} |
  %%                                            {stop, Reason, State}
  %% @doc Handle the synchronous API requests:
  %%   {add_child, ChildSpec}  - remember a new child on the 'stopped' list
  %%   {start_child, Name}     - start a child that is not running
  %%   {restart_child, Name}   - terminate (when running) and start a child
  %%   which_children          - all children, grouped per queue
  %%   running_children        - the names of the running children
  %% Any other request stops the supervisor with {unknown_call, Request}.

  %% Add a child in the stopped state; names must be unique over all queues.
  handle_call({add_child, ChildSpec}, _From, State) ->
      Name = ChildSpec#child_spec.name,
      case exists(Name, State) of
          false ->
              CS = #child_state{name=Name, child=ChildSpec, state=starting, time=erlang:localtime()},
              {reply, ok, State#state{stopped=[CS|State#state.stopped]}};
          true ->
              {reply, {error, duplicate_name}, State}
      end;

  %% Start the child when it is not running already.
  handle_call({start_child, Name}, _From, State) ->
      case is_running(Name, State) of
          true ->
              {reply, ok, State};
          false ->
              case do_remove_child(Name, State) of
                  {CS, State1} -> {reply, ok, do_start_child(CS, State1)};
                  error -> {reply, {error, unknown_child}, State}
              end
      end;

  %% Restart or start a child.
  handle_call({restart_child, Name}, _From, State) ->
      case do_remove_child(Name, State) of
          {CS, State1} ->
              %% Use the state without the child, like the stop/delete casts do.
              shutdown_child(CS, State1),
              {reply, ok, do_start_child(CS, State1)};
          error ->
              %% Unknown child
              {reply, {error, unknown_child}, State}
      end;

  %% Return a full list of all children as {Name, ChildSpec, Pid, Time} tuples.
  handle_call(which_children, _From, State) ->
      F = fun(C) ->
              %% The second element must be the child spec of C; a bare
              %% `#child_state.child' would be the index of the record field.
              {C#child_state.name, C#child_state.child, C#child_state.pid, C#child_state.time}
          end,
      {reply, [
          {waiting,  [ F(C) || C <- State#state.waiting ]},
          {running,  [ F(C) || C <- State#state.running ]},
          {retrying, [ F(C) || C <- State#state.retrying ]},
          {failed,   [ F(C) || C <- State#state.failed ]},
          {stopped,  [ F(C) || C <- State#state.stopped ]}
      ], State};

  %% Return the list of the names of the running children.
  handle_call(running_children, _From, State) ->
      {reply, [ C#child_state.name || C <- State#state.running ], State};

  %% Trap unknown calls: log the request and stop the supervisor.
  handle_call(Message, _From, State) ->
      ?DEBUG({unknown_call, Message}),
      {stop, {unknown_call, Message}, State}.
  %% @doc Handle the asynchronous API requests:
  %%   {stop_child, Name}      - terminate a child and park it on 'stopped'
  %%   {delete_child, Name}    - terminate a child and forget about it
  %%   check_children          - start waiting children, re-try due children
  %%   {set_manager_pid, Pid}  - remember the manager to notify
  %% Any other message stops the supervisor with {unknown_cast, Message}.

  %% Stop a child process and add it to the stopped list.
  handle_cast({stop_child, Name}, State) ->
      case do_remove_child(Name, State) of
          error ->
              %% Unknown child
              {noreply, State};
          {CS, State1} ->
              shutdown_child(CS, State1),
              Stopped = CS#child_state{state=stopped, time=erlang:localtime(), pid=undefined},
              {noreply, State1#state{stopped=[Stopped|State1#state.stopped]}}
      end;

  %% Delete a child: remove it from whatever queue it is on and terminate
  %% its process when it has one.
  handle_cast({delete_child, Name}, State) ->
      case do_remove_child(Name, State) of
          error ->
              %% Unknown child
              {noreply, State};
          {CS, State1} ->
              shutdown_child(CS, State1),
              {noreply, State1}
      end;

  %% Start any children that are waiting or up for a retry. Check requests
  %% that queued up in the mailbox in the meantime are flushed afterwards.
  handle_cast(check_children, State) ->
      Waited = handle_waiting_children(State),
      Retried = handle_retrying_children(Waited),
      Checked = handle_failed_children(Retried),
      z_utils:flush_message({'$gen_cast', check_children}),
      {noreply, Checked};

  %% Set the manager pid of this supervisor.
  handle_cast({set_manager_pid, Pid}, State) ->
      {noreply, State#state{manager_pid=Pid}};

  %% Trap unknown casts: log the message and stop the supervisor.
  handle_cast(Message, State) ->
      ?DEBUG({unknown_cast, Message}),
      {stop, {unknown_cast, Message}, State}.
  %% @spec handle_info(Info, State) -> {noreply, State}
  %% @doc Handle all messages that are not calls or casts. 'EXIT' messages
  %% (the supervisor traps exits) are passed on to handle_exit/3; any other
  %% message is logged and dropped.
  handle_info({'EXIT', Pid, Reason}, State) ->
      NewState = handle_exit(Pid, Reason, State),
      {noreply, NewState};
  handle_info(Info, State) ->
      ?DEBUG({unknown_info, Info}),
      {noreply, State}.
  %% @spec terminate(Reason, State) -> ok
  %% @doc Called by gen_server when the supervisor is about to terminate.
  %% No cleanup is performed here; the return value is ignored by gen_server.
  terminate(_Reason, _State) ->
      ok.
  %% @spec code_change(OldVsn, State, Extra) -> {ok, NewState}
  %% @doc Convert the process state on a code change. The state is kept as is.
  code_change(_OldVsn, State, _Extra) ->
      {ok, State}.
  218. %%====================================================================
  219. %% support functions
  220. %%====================================================================
  %% @doc Try to start every child on the 'waiting' list. The list is emptied
  %% first; do_start_child/2 then files each child on the queue that matches
  %% the outcome of its start.
  handle_waiting_children(#state{waiting=[]} = State) ->
      State;
  handle_waiting_children(#state{waiting=Waiting} = State) ->
      Emptied = State#state{waiting=[]},
      lists:foldl(fun do_start_child/2, Emptied, Waiting).
  %% @doc Try to start the children on the 'retrying' list whose retry period
  %% has elapsed (see is_ready_for_retry/2). Children that are not yet due stay
  %% on the list. This is a single pass; the periodic check_children cast
  %% takes care of later attempts.
  handle_retrying_children(#state{retrying=[]} = State) ->
      State;
  handle_retrying_children(#state{retrying=Retrying} = State) ->
      Now = z_utils:now(),
      {Start, Wait} = lists:partition(fun(CS) -> is_ready_for_retry(CS, Now) end, Retrying),
      lists:foldl(fun(CS, S) -> retry_child(CS, Now, S) end, State#state{retrying=Wait}, Start).

  %% Start one child from the retry queue and file it on the queue that
  %% matches the outcome of the start.
  retry_child(#child_state{child=Child} = CS, Now, S) ->
      case start_child_mfa(Child#child_spec.mfa) of
          {ok, Pid} ->
              CS1 = CS#child_state{state=running_from_retry, pid=Pid, time=erlang:localtime()},
              %% The manager was told about the exit of this child, so it must
              %% also hear about the restart (do_start_child/2 does the same).
              notify_start(CS1, S),
              S#state{running=[CS1|S#state.running]};
          {error, _What} ->
              %% Move the child to the failed state when it crashed too often
              case CS#child_state.retries >= Child#child_spec.period_retries of
                  true ->
                      CS1 = CS#child_state{state=failed, time=erlang:localtime(), fail_time=Now},
                      S#state{failed=[CS1|S#state.failed]};
                  false ->
                      CS1 = CS#child_state{retries=CS#child_state.retries+1, retry_time=Now},
                      S#state{retrying=[CS1|S#state.retrying]}
              end
      end.
  %% @doc True when more than the child's period_retry has passed between the
  %% retry_time of the child and Now (both on the z_utils:now() clock).
  is_ready_for_retry(#child_state{retry_time=RetryTime, child=Child}, Now) ->
      Elapsed = Now - RetryTime,
      Elapsed > Child#child_spec.period_retry.
  %% @doc Periodic check of the 'failed' list: children whose eternal_retry
  %% period has elapsed (see is_ready_for_unfail/2) get another start attempt,
  %% the others stay on the list.
  handle_failed_children(#state{failed=[]} = State) ->
      State;
  handle_failed_children(#state{failed=Failed} = State) ->
      Now = z_utils:now(),
      {Start, Fail} = lists:partition(fun(CS) -> is_ready_for_unfail(CS, Now) end, Failed),
      lists:foldl(fun(CS, S) -> unfail_child(CS, Now, S) end, State#state{failed=Fail}, Start).

  %% Start one child from the failed queue; when the start fails the child is
  %% put back on the failed queue with a fresh fail_time.
  unfail_child(#child_state{child=Child} = CS, Now, S) ->
      case start_child_mfa(Child#child_spec.mfa) of
          {ok, Pid} ->
              CS1 = CS#child_state{state=running_from_failed, pid=Pid, time=erlang:localtime()},
              %% The manager was told about the exit of this child, so it must
              %% also hear about the restart (do_start_child/2 does the same).
              notify_start(CS1, S),
              S#state{running=[CS1|S#state.running]};
          {error, _What} ->
              CS1 = CS#child_state{state=failed, time=erlang:localtime(), fail_time=Now},
              S#state{failed=[CS1|S#state.failed]}
      end.
  %% @doc True when more than the child's eternal_retry has passed between the
  %% fail_time of the child and Now (both on the z_utils:now() clock).
  is_ready_for_unfail(#child_state{fail_time=FailTime, child=Child}, Now) ->
      Elapsed = Now - FailTime,
      Elapsed > Child#child_spec.eternal_retry.
  %% @doc Handle an 'EXIT' message for Pid. A running child that exited with
  %% 'normal' or 'shutdown' is moved to the 'stopped' list; any other reason
  %% goes through the restart logic. Exits of pids that are not on the
  %% 'running' list are ignored.
  handle_exit(Pid, Reason, State) ->
      case remove_running_pid(Pid, State) of
          error ->
              State;
          {CS, State1} when Reason =:= normal; Reason =:= shutdown ->
              append_stopped(CS, State1);
          {CS, State1} ->
              do_maybe_restart(CS, State1)
      end.
  %% @doc Start a single child. The caller must already have removed the child
  %% from its queue. On success the child is put on the 'running' list and the
  %% manager is notified; on failure the restart logic decides what happens.
  do_start_child(#child_state{child=Child} = CS, State) ->
      #child_spec{mfa=MFA} = Child,
      case start_child_mfa(MFA) of
          {error, _What} ->
              do_maybe_restart(CS, State);
          {ok, Pid} ->
              Started = CS#child_state{state=running, pid=Pid, time=erlang:localtime()},
              notify_start(Started, State),
              State#state{running=[Started|State#state.running]}
      end.
  %% @doc Decide what to do with a child that crashed or failed to start and
  %% carry out the decision of may_restart/1: restart it at once, queue it for
  %% a retry, or mark it as failed.
  do_maybe_restart(CS, State) ->
      apply_restart_decision(may_restart(CS), CS, State).

  %% First crash in a new period: begin counting crashes, restart at once.
  apply_restart_decision(first_restart, CS, State) ->
      Crashed = CS#child_state{state=crashed, time=erlang:localtime(),
                               crashes=1, crash_time=z_utils:now(), retries=0},
      do_start_child(Crashed, State);
  %% Another crash within the period: count it and restart at once.
  apply_restart_decision(restart, CS, State) ->
      Crashed = CS#child_state{state=crashed, time=erlang:localtime(),
                               crashes=CS#child_state.crashes+1, retries=0},
      do_start_child(Crashed, State);
  %% Crashed too often: queue for the first retry.
  apply_restart_decision(first_retry, CS, State) ->
      Queued = CS#child_state{state=retrying, time=erlang:localtime(),
                              retries=1, retry_time=z_utils:now()},
      State#state{retrying=[Queued|State#state.retrying]};
  %% Crashed again after a retry: queue for the next retry.
  apply_restart_decision(retry, CS, State) ->
      Queued = CS#child_state{state=retrying, time=erlang:localtime(),
                              retries=CS#child_state.retries+1, retry_time=z_utils:now()},
      State#state{retrying=[Queued|State#state.retrying]};
  %% Give up for now: park the child on the failed list.
  apply_restart_decision(fail, CS, State) ->
      Failed = CS#child_state{state=failed, time=erlang:localtime(), fail_time=z_utils:now()},
      State#state{failed=[Failed|State#state.failed]}.
  %% @doc Remove the child with Name from the first queue that contains it.
  %% The queues are searched in the order waiting, running, retrying, failed,
  %% stopped. Returns {ChildState, NewState}, or 'error' for an unknown name.
  do_remove_child(Name, State) ->
      QueueFields = [#state.waiting, #state.running, #state.retrying,
                     #state.failed, #state.stopped],
      remove_from_queues(Name, QueueFields, State).

  %% Walk over the record field indices of the queues until the child is found.
  remove_from_queues(_Name, [], _State) ->
      error;
  remove_from_queues(Name, [Field|Fields], State) ->
      case remove_by_name(Name, element(Field, State), []) of
          {CS, Remaining} -> {CS, setelement(Field, State, Remaining)};
          error -> remove_from_queues(Name, Fields, State)
      end.
  %% @doc Start a single child by applying its {Module, Function, Args}.
  %% Returns {ok, Pid} when the start function returned {ok, Pid} with a pid,
  %% {ok, undefined} when it returned 'ignore', and {error, What} for any other
  %% return value. Exceptions are reported as {error, {Class, Reason}}.
  %% A try expression is used instead of the old-style `catch' so that a thrown
  %% value such as 'ignore' or {ok, Pid} is never mistaken for a successful
  %% return of the start function.
  start_child_mfa({M,F,A}) ->
      try apply(M, F, A) of
          {ok, Pid} when is_pid(Pid) ->
              {ok, Pid};
          ignore ->
              {ok, undefined};
          {error, What} ->
              {error, What};
          What ->
              {error, What}
      catch
          Class:Reason ->
              {error, {Class, Reason}}
      end.
  %% @doc Check if a child is not crashing or retrying too often and tell what
  %% should happen with it after a crash or a failed start.
  %% @spec may_restart(#child_state{}) -> first_restart | restart | first_retry | retry | fail
  %%
  %% NOTE(review): handle_retrying_children/1 and handle_failed_children/1 only
  %% start a child after this same period has passed since retry_time/fail_time
  %% and they do not refresh those fields on a successful start. Unless the
  %% fields are updated elsewhere, the `false' branches of the first two
  %% clauses look unreachable - TODO confirm the intended bookkeeping.

  %% The child was started from the 'failed' queue. The state atom must be the
  %% one handle_failed_children/1 sets (running_from_failed), otherwise this
  %% clause can never match.
  may_restart(#child_state{state=running_from_failed, fail_time=FailTime, child=Child} = CS) ->
      #child_spec{eternal_retry=Period} = Child,
      case z_utils:now() - FailTime > Period of
          true ->
              %% Did run for longer than a (normal) retry period, reset to normal restarting behaviour
              may_restart(CS#child_state{state=running});
          false ->
              %% A restart within the retry/fail period, maybe queue for another eternal retry
              fail
      end;
  %% The child was started from the 'retrying' queue.
  may_restart(#child_state{state=running_from_retry, retry_time=RetryTime, retries=Retries, child=Child} = CS) ->
      #child_spec{period_retries=MaxRetries, period_retry=Period} = Child,
      case z_utils:now() - RetryTime > Period of
          true ->
              %% Did run for longer than a retry period, reset to new restarting behaviour
              may_restart(CS#child_state{state=running});
          false ->
              %% A restart within the retry period, maybe queue for another retry
              case Retries > MaxRetries of
                  true -> fail;
                  false -> retry
              end
      end;
  %% The child never crashed before.
  may_restart(#child_state{crash_time=undefined}) ->
      first_restart;
  %% Normal restarting behaviour: allow at most MaxCrashes crashes per Period.
  may_restart(#child_state{crash_time=CrashTime, crashes=Crashes, child=Child}) ->
      #child_spec{period=Period, crashes=MaxCrashes} = Child,
      case z_utils:now() - CrashTime > Period of
          true ->
              first_restart;
          false ->
              case Crashes > MaxCrashes of
                  true -> first_retry;
                  false -> restart
              end
      end.
  %% @doc Mark the child as stopped (with the current local time) and put it
  %% at the head of the 'stopped' list.
  append_stopped(CS, #state{stopped=Stopped} = State) ->
      Parked = CS#child_state{state=stopped, time=erlang:localtime()},
      State#state{stopped=[Parked|Stopped]}.
  %% @doc Remove the child with process Pid from the 'running' list and notify
  %% the manager of its exit. Returns {ChildState, NewState} with the pid of
  %% the child state cleared, or 'error' when no running child has this pid.
  remove_running_pid(Pid, State) ->
      case remove_by_pid(Pid, State#state.running, []) of
          error ->
              error;
          {CS, StillRunning} ->
              %% Notify with the original child state, so the pid is included.
              notify_exit(CS, State),
              {CS#child_state{pid=undefined}, State#state{running=StillRunning}}
      end.
  %% @doc Take the first child called Name out of a list of child states.
  %% Returns {ChildState, OtherChildren}, or 'error' when the name is not in
  %% the list. Acc collects the children that were skipped, in reverse order.
  remove_by_name(Name, [Candidate|Rest], Acc) ->
      case Candidate of
          #child_state{name=Name} -> {Candidate, Rest++Acc};
          _Other -> remove_by_name(Name, Rest, [Candidate|Acc])
      end;
  remove_by_name(_Name, [], _Acc) ->
      error.
  %% @doc Take the first child with process Pid out of a list of child states.
  %% Returns {ChildState, OtherChildren}, or 'error' when the pid is not in
  %% the list. Acc collects the children that were skipped, in reverse order.
  remove_by_pid(Pid, [Candidate|Rest], Acc) ->
      case Candidate of
          #child_state{pid=Pid} -> {Candidate, Rest++Acc};
          _Other -> remove_by_pid(Pid, Rest, [Candidate|Acc])
      end;
  remove_by_pid(_Pid, [], _Acc) ->
      error.
  %% @doc Check if the child called Name is on the 'running' list.
  is_running(Name, State) ->
      Running = State#state.running,
      is_member(Name, Running).
  %% @doc Check if a child called Name is known on any of the queues.
  exists(Name, State) ->
      Queues = [State#state.waiting,
                State#state.running,
                State#state.retrying,
                State#state.failed,
                State#state.stopped],
      lists:any(fun(Queue) -> is_member(Name, Queue) end, Queues).
  %% @doc Check if a list of child states contains a child called Name.
  is_member(Name, Children) ->
      HasName = fun(#child_state{name=N}) -> N =:= Name;
                   (_Other) -> false
                end,
      lists:any(HasName, Children).
  %% @doc Terminate the process of a child, when it has one. The manager is
  %% notified first, then the process is shut down according to the 'shutdown'
  %% setting of its child spec. Returns the result of shutdown/2, or
  %% {error, no_process} when the child has no pid.
  shutdown_child(#child_state{pid=Pid, child=Child} = CS, State) when is_pid(Pid) ->
      notify_exit(CS, State),
      How = Child#child_spec.shutdown,
      shutdown(Pid, How);
  shutdown_child(_CS, _State) ->
      {error, no_process}.
  %% @doc Tell the manager, when one is set, that a child has stopped. The
  %% manager receives the cast {supervisor_child_stopped, ChildSpec, Pid}.
  notify_exit(ChildState, #state{manager_pid=ManagerPid}) ->
      case ManagerPid of
          undefined ->
              nop;
          _ ->
              Notification = {supervisor_child_stopped,
                              ChildState#child_state.child,
                              ChildState#child_state.pid},
              gen_server:cast(ManagerPid, Notification)
      end.
  %% @doc Tell the manager, when one is set, that a child has started. The
  %% manager receives the cast {supervisor_child_started, ChildSpec, Pid}.
  notify_start(ChildState, #state{manager_pid=ManagerPid}) ->
      case ManagerPid of
          undefined ->
              nop;
          _ ->
              Notification = {supervisor_child_started,
                              ChildState#child_state.child,
                              ChildState#child_state.pid},
              gen_server:cast(ManagerPid, Notification)
      end.
  %%-----------------------------------------------------------------
  %% shutdown/2 and monitor_child/1 are from supervisor.erl
  %% Copyright Ericsson AB 1996-2010. All Rights Reserved.
  %%
  %% Shuts down a child. The exit reason of the child must be checked,
  %% because it might have died with another reason than the wanted one;
  %% in that case the error is returned so that it can be reported.
  %% A monitor is put on the child and the 'DOWN' message is awaited
  %% instead of the 'EXIT' message, because a "naughty" child that did
  %% unlink(Sup) could otherwise hang the supervisor.
  %%
  %% The second argument is 'brutal_kill' (kill at once) or the time to
  %% wait for a graceful shutdown before the child is killed.
  %% Returns: ok | {error, OtherReason} (this should be reported)
  %%-----------------------------------------------------------------
  shutdown(Pid, brutal_kill) ->
      case monitor_child(Pid) of
          {error, Reason} ->
              {error, Reason};
          ok ->
              exit(Pid, kill),
              receive
                  {'DOWN', _, process, Pid, killed} ->
                      ok;
                  {'DOWN', _, process, Pid, Unexpected} ->
                      {error, Unexpected}
              end
      end;
  shutdown(Pid, Time) ->
      case monitor_child(Pid) of
          {error, Reason} ->
              {error, Reason};
          ok ->
              %% Try to shutdown gracefully
              exit(Pid, shutdown),
              receive
                  {'DOWN', _, process, Pid, shutdown} ->
                      ok;
                  {'DOWN', _, process, Pid, Unexpected} ->
                      {error, Unexpected}
              after Time ->
                  %% Force termination.
                  exit(Pid, kill),
                  receive
                      {'DOWN', _, process, Pid, KillReason} ->
                          {error, KillReason}
                  end
              end
      end.
  %% Help function for shutdown/2: switches from the link to the monitor
  %% approach. Returns 'ok' when the child is now monitored and unlinked, or
  %% {error, Reason} when the child had already exited with Reason.
  monitor_child(Pid) ->
      %% Do the monitor operation first, so that if the child dies before
      %% the monitoring is done (causing a 'DOWN' message with reason
      %% noproc) the real reason is still available in the 'EXIT' message,
      %% unless a naughty child has already done an unlink...
      erlang:monitor(process, Pid),
      unlink(Pid),
      receive
          %% If the child died before the unlink, the mailbox must be
          %% emptied of both the 'EXIT' message and the 'DOWN' message.
          {'EXIT', Pid, ExitReason} ->
              receive
                  {'DOWN', _, process, Pid, _} ->
                      {error, ExitReason}
              end
      after 0 ->
          %% If a naughty child did unlink and the child dies before the
          %% monitor, the result will be that shutdown/2 receives a
          %% 'DOWN' message with reason noproc.
          %% If the child dies after the unlink there will be a 'DOWN'
          %% message with a correct reason that is handled in shutdown/2.
          ok
      end.