PageRenderTime 72ms CodeModel.GetById 16ms app.highlight 49ms RepoModel.GetById 1ms app.codeStats 1ms

/src/support/z_supervisor.erl

https://code.google.com/p/zotonic/
Erlang | 603 lines | 387 code | 90 blank | 126 comment | 7 complexity | 92dd6256a12fb8d556c67e70f7d96227 MD5 | raw file
  1%% @author Marc Worrell <marc@worrell.nl>
  2%% @copyright 2010 Marc Worrell
  3%% @doc Supervisor with a one_for_one strategy and disabling of too-often-crashing resources.
  4%% All children of this supervisor should be gen_server/supervisor processes.
  5
  6%% Copyright 2010 Marc Worrell
  7%%
  8%% Licensed under the Apache License, Version 2.0 (the "License");
  9%% you may not use this file except in compliance with the License.
 10%% You may obtain a copy of the License at
 11%% 
 12%%     http://www.apache.org/licenses/LICENSE-2.0
 13%% 
 14%% Unless required by applicable law or agreed to in writing, software
 15%% distributed under the License is distributed on an "AS IS" BASIS,
 16%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17%% See the License for the specific language governing permissions and
 18%% limitations under the License.
 19
 20
 21-module(z_supervisor).
 22-author("Marc Worrell <marc@worrell.nl>").
 23-behaviour(gen_server).
 24
 25% gen_server exports
 26-export([
 27    init/1,
 28    handle_call/3,
 29    handle_cast/2,
 30    handle_info/2,
 31    terminate/2,
 32    code_change/3
 33]).
 34
 35% z_supervisor API
 36-export([
 37    start_link/1,
 38    start_link/2,
 39    add_child/2,
 40    delete_child/2,
 41    start_child/2,
 42    stop_child/2,
 43    restart_child/2,
 44    which_children/1,
 45    running_children/1,
 46    check_children/1,
 47    set_manager_pid/2
 48]).
 49
 50% -record(supervised, {name, mfa, status, pid, crashes = 5, period = 60, period_retry = 1800, period_retries=10}).
 51
 52-define(INTERVAL, 1000).
 53-record(state, {waiting=[], running=[], retrying=[], failed=[], stopped=[], timer_ref, manager_pid}).
 54
 55-record(child_state, {name, pid,
 56                      state=waiting, time,
 57                      crash_time, crashes=0, 
 58                      retry_time, retries=0, 
 59                      fail_time,
 60                      child}).
 61
 62-include_lib("zotonic.hrl").
 63
 64%%% ---------------------------------------------------
 65%%% This is a general process supervisor built upon gen_server.erl.
 66%%% Servers/processes should/could also be built using gen_server.erl.
 67%%% SupName = {local, atom()} | {global, atom()}.
 68%%% ---------------------------------------------------
 69start_link(Args) ->
 70    gen_server:start_link(?MODULE, Args, []).
 71
 72start_link(SupName, Args) ->
 73    gen_server:start_link(SupName, ?MODULE, Args, []).
 74
 75
 76%% @doc Add a child, the child will be added and started.
 77add_child(Pid, ChildSpec) ->
 78    gen_server:call(Pid, {add_child, ChildSpec}).
 79
 80%% @doc Delete a child, the child will be terminated and removed.
 81delete_child(Pid, Name) ->
 82    gen_server:cast(Pid, {delete_child, Name}).
 83
 84%% @doc Start a child when it is not running (either failed or stopped)
 85start_child(Pid, Name) ->
 86    gen_server:call(Pid, {start_child, Name}).
 87
 88%% @doc Stop a child, the child will be terminated and put in "stopped" state
 89stop_child(Pid, Name) ->
 90    gen_server:cast(Pid, {stop_child, Name}).
 91
 92%% @doc Terminate and restart a child.
 93restart_child(Pid, Name) ->
 94    gen_server:call(Pid, {restart_child, Name}).
 95
 96%% @doc Return the list of all children and their run state.
 97which_children(Pid) ->
 98    gen_server:call(Pid, which_children).
 99
%% @doc Fetch the names of the children that are currently running.
running_children(Supervisor) ->
    gen_server:call(Supervisor, running_children).
103
%% @doc Trigger a check round: waiting children are started and children in the
%% retrying or failed queues are restarted when their wait period has passed.
%% Asynchronous: returns ok immediately.
check_children(Supervisor) ->
    gen_server:cast(Supervisor, check_children).
107
108
%% @doc Tell the supervisor which process must receive the child started/stopped
%% notifications. Asynchronous: returns ok immediately.
set_manager_pid(Supervisor, Manager) ->
    gen_server:cast(Supervisor, {set_manager_pid, Manager}).
112
113%%====================================================================
114%% gen_server callbacks
115%%====================================================================
116
%% @spec init(InitialChildren) -> {ok, State}
%% @doc Initiates the server. Traps exits, installs an interval timer that calls
%% check_children/1 for this process every ?INTERVAL milliseconds, and queues
%% all initial child specs as waiting. The waiting children are started when the
%% first check_children cast is handled.
init(InitialChildren) ->
    process_flag(trap_exit, true),
    {ok, TimerRef} = timer:apply_interval(?INTERVAL, ?MODULE, check_children, [self()]),
    ToWaiting = fun(Spec) ->
        #child_state{
            name = Spec#child_spec.name,
            child = Spec,
            state = starting,
            time = erlang:localtime()
        }
    end,
    Waiting = lists:map(ToWaiting, InitialChildren),
    {ok, #state{waiting = Waiting, timer_ref = TimerRef}}.
131
%% @spec handle_call(Request, From, State) -> {reply, Reply, State} |
%%                                      {reply, Reply, State, Timeout} |
%%                                      {noreply, State} |
%%                                      {noreply, State, Timeout} |
%%                                      {stop, Reason, Reply, State} |
%%                                      {stop, Reason, State}
%% @doc Handle the synchronous requests of the supervisor API.
%%
%% {add_child, ChildSpec}: add a child in the stopped queue; replies
%% {error, duplicate_name} when a child with that name is already known.
handle_call({add_child, ChildSpec}, _From, State) ->
    case exists(ChildSpec#child_spec.name, State) of
        false ->
            CS = #child_state{name=ChildSpec#child_spec.name, child=ChildSpec, state=starting, time=erlang:localtime()},
            {reply, ok, State#state{stopped=[CS|State#state.stopped]}};
        true ->
            {reply, {error, duplicate_name}, State}
    end;

%% {start_child, Name}: start the child when it is not running already.
handle_call({start_child, Name}, _From, State) ->
    case is_running(Name, State) of
        true ->
            {reply, ok, State};
        false ->
            case do_remove_child(Name, State) of
                {CS, State1} -> {reply, ok, do_start_child(CS, State1)};
                error -> {reply, {error, unknown_child}, State}
            end
    end;

%% {restart_child, Name}: terminate the child (when it has a process) and start it again.
handle_call({restart_child, Name}, _From, State) ->
    case do_remove_child(Name, State) of
        {CS, State1} ->
            shutdown_child(CS, State1),
            %% The old process is gone: forget its pid, otherwise a failing
            %% start would park the child in the retrying/failed queue with
            %% the pid of a dead process.
            {reply, ok, do_start_child(CS#child_state{pid=undefined}, State1)};
        error ->
            %% Unknown child
            {reply, {error, unknown_child}, State}
    end;

%% which_children: return all children, grouped per run state, as
%% {Name, ChildSpec, Pid, Time} tuples.
handle_call(which_children, _From, State) ->
    F = fun(C) ->
        %% C#child_state.child is the child spec; a bare #child_state.child
        %% would only be the index of the field in the record tuple.
        {C#child_state.name, C#child_state.child, C#child_state.pid, C#child_state.time}
    end,
    {reply, [
            {waiting, [ F(C) || C <- State#state.waiting]},
            {running, [ F(C) || C <- State#state.running]},
            {retrying, [ F(C) || C <- State#state.retrying]},
            {failed, [ F(C) || C <- State#state.failed]},
            {stopped, [ F(C) || C <- State#state.stopped]}
        ], State};

%% running_children: return the names of the running children.
handle_call(running_children, _From, State) ->
    {reply, [ C#child_state.name || C <- State#state.running ], State};

%% Trap unknown calls: log the request and stop the supervisor.
handle_call(Message, _From, State) ->
    ?DEBUG({unknown_call, Message}),
    {stop, {unknown_call, Message}, State}.
192
193
%% @doc Handle the asynchronous requests of the supervisor API.
%%
%% {stop_child, Name}: terminate the child process and park the child in the
%% stopped queue. Unknown names are ignored.
handle_cast({stop_child, Name}, State) ->
    NewState = case do_remove_child(Name, State) of
        error ->
            %% Unknown child
            State;
        {Child, Rest} ->
            shutdown_child(Child, Rest),
            Stopped = Child#child_state{state=stopped, time=erlang:localtime(), pid=undefined},
            Rest#state{stopped=[Stopped|Rest#state.stopped]}
    end,
    {noreply, NewState};

%% {delete_child, Name}: remove the child from whatever queue it is in and
%% terminate its process when it has one. Unknown names are ignored.
handle_cast({delete_child, Name}, State) ->
    NewState = case do_remove_child(Name, State) of
        error ->
            %% Unknown child
            State;
        {Child, Rest} ->
            shutdown_child(Child, Rest),
            Rest
    end,
    {noreply, NewState};

%% check_children: start the waiting children, then the children that are up
%% for a retry, then the failed children that are up for another attempt.
%% Afterwards z_utils:flush_message/1 is called for further check_children casts.
handle_cast(check_children, State) ->
    Steps = [
        fun handle_waiting_children/1,
        fun handle_retrying_children/1,
        fun handle_failed_children/1
    ],
    Checked = lists:foldl(fun(Step, S) -> Step(S) end, State, Steps),
    z_utils:flush_message({'$gen_cast', check_children}),
    {noreply, Checked};

%% {set_manager_pid, Pid}: remember the process that receives the notifications.
handle_cast({set_manager_pid, Pid}, State) ->
    {noreply, State#state{manager_pid=Pid}};

%% Trap unknown casts: log the message and stop the supervisor.
handle_cast(Message, State) ->
    ?DEBUG({unknown_cast, Message}),
    {stop, {unknown_cast, Message}, State}.
233
234
%% @spec handle_info(Info, State) -> {noreply, State} |
%%                                       {noreply, State, Timeout} |
%%                                       {stop, Reason, State}
%% @doc Handle all messages that are not a call or a cast.
%% An 'EXIT' message is passed to handle_exit/3; any other message is logged
%% and dropped.
handle_info({'EXIT', Pid, Reason}, State) ->
    NewState = handle_exit(Pid, Reason, State),
    {noreply, NewState};
handle_info(Info, State) ->
    ?DEBUG({unknown_info, Info}),
    {noreply, State}.
246
247
%% @spec terminate(Reason, State) -> void()
%% @doc This function is called by a gen_server when it is about to
%% terminate. It is the opposite of init/1: the interval timer installed by
%% init/1 is cancelled so that no further check_children casts are scheduled
%% for this process. When it returns, the gen_server terminates with Reason.
%% The return value is ignored.
terminate(_Reason, #state{timer_ref=TimerRef}) when TimerRef =/= undefined ->
    %% The result of the cancel is not interesting, we are terminating anyway.
    timer:cancel(TimerRef),
    ok;
terminate(_Reason, _State) ->
    ok.
255
%% @spec code_change(OldVsn, State, Extra) -> {ok, NewState}
%% @doc Convert the process state when the code is changed.
%% No conversion is done: the state is returned as it was.
code_change(_FromVersion, CurrentState, _ExtraArgs) ->
    {ok, CurrentState}.
261
262
263%%====================================================================
264%% support functions
265%%====================================================================
266
267
%% @doc Empty the waiting queue and try to start every child that was in it.
%% do_start_child/2 decides in which queue each child ends up.
handle_waiting_children(#state{waiting=Waiting} = State) ->
    %% With an empty queue the fold returns the state unchanged.
    lists:foldl(fun do_start_child/2, State#state{waiting=[]}, Waiting).
273
%% @doc Try to start the retrying children whose retry period has passed.
%% A started child moves to the running queue (state running_from_retry).
%% When the start fails the child moves to the failed queue once its retries
%% reached the period_retries of its spec, otherwise it is queued for another retry.
handle_retrying_children(#state{retrying=[]} = State) ->
    State;
handle_retrying_children(#state{retrying=Queue} = State) ->
    Now = z_utils:now(),
    {Due, NotYet} = lists:partition(fun(CS) -> is_ready_for_retry(CS, Now) end, Queue),
    Retry = fun(#child_state{child=Spec} = CS, Acc) ->
        case start_child_mfa(Spec#child_spec.mfa) of
            {ok, Pid} ->
                Running = CS#child_state{state=running_from_retry, pid=Pid, time=erlang:localtime()},
                Acc#state{running=[Running|Acc#state.running]};
            {error, _What} when CS#child_state.retries >= Spec#child_spec.period_retries ->
                %% Crashed too often, give up on the quick retries
                Failed = CS#child_state{state=failed, time=erlang:localtime(), fail_time=Now},
                Acc#state{failed=[Failed|Acc#state.failed]};
            {error, _What} ->
                Again = CS#child_state{retries=CS#child_state.retries+1, retry_time=Now},
                Acc#state{retrying=[Again|Acc#state.retrying]}
        end
    end,
    lists:foldl(Retry, State#state{retrying=NotYet}, Due).
299
%% @doc Check if more than period_retry (of the child spec) has passed
%% between the last retry of the child and Now.
is_ready_for_retry(#child_state{retry_time=Since, child=Spec}, Now) ->
    Elapsed = Now - Since,
    Elapsed > Spec#child_spec.period_retry.
302
%% @doc Periodic check if any failed children are up for another start attempt.
%% A started child moves to the running queue (state running_from_failed);
%% when the start fails the child stays failed with a fresh fail_time.
handle_failed_children(#state{failed=[]} = State) ->
    State;
handle_failed_children(#state{failed=Queue} = State) ->
    Now = z_utils:now(),
    {Due, StillFailed} = lists:partition(fun(CS) -> is_ready_for_unfail(CS, Now) end, Queue),
    Revive = fun(#child_state{child=Spec} = CS, Acc) ->
        case start_child_mfa(Spec#child_spec.mfa) of
            {error, _What} ->
                Failed = CS#child_state{state=failed, time=erlang:localtime(), fail_time=Now},
                Acc#state{failed=[Failed|Acc#state.failed]};
            {ok, Pid} ->
                Running = CS#child_state{state=running_from_failed, pid=Pid, time=erlang:localtime()},
                Acc#state{running=[Running|Acc#state.running]}
        end
    end,
    lists:foldl(Revive, State#state{failed=StillFailed}, Due).
320
%% @doc Check if more than eternal_retry (of the child spec) has passed
%% between the moment the child failed and Now.
is_ready_for_unfail(#child_state{fail_time=Since, child=Spec}, Now) ->
    Elapsed = Now - Since,
    Elapsed > Spec#child_spec.eternal_retry.
323
324
%% @doc Handle an 'EXIT' message for a Pid.
%% Exits of pids that are not in the running queue are ignored. A child that
%% exits with reason normal or shutdown is moved to the stopped queue; for any
%% other reason do_maybe_restart/2 decides what happens with the child.
handle_exit(Pid, Reason, State) ->
    case remove_running_pid(Pid, State) of
        error ->
            %% No child with this pid in the running list, ignore the exit
            State;
        {CS, State1} when Reason =:= normal; Reason =:= shutdown ->
            append_stopped(CS, State1);
        {CS, State1} ->
            do_maybe_restart(CS, State1)
    end.
338
339
%% @doc Start a single child. The child is not removed from any queue, the
%% caller must have done that. On success the manager is notified and the child
%% is added to the running queue; on failure do_maybe_restart/2 takes over.
do_start_child(#child_state{child=Child} = CS, State) ->
    case start_child_mfa(Child#child_spec.mfa) of
        {error, _What} ->
            do_maybe_restart(CS, State);
        {ok, Pid} ->
            Started = CS#child_state{state=running, pid=Pid, time=erlang:localtime()},
            notify_start(Started, State),
            State#state{running=[Started|State#state.running]}
    end.
351
352
%% @doc Act on the verdict of may_restart/1 for a child that crashed or could
%% not be started: start it again directly (first_restart, restart), queue it
%% for a later retry (first_retry, retry) or move it to the failed queue (fail).
do_maybe_restart(CS, State) ->
    Verdict = may_restart(CS),
    Stamped = CS#child_state{time=erlang:localtime()},
    case Verdict of
        first_restart ->
            %% Start of a new crash period
            Crashed = Stamped#child_state{state=crashed, crashes=1,
                                          crash_time=z_utils:now(), retries=0},
            do_start_child(Crashed, State);
        restart ->
            Crashed = Stamped#child_state{state=crashed,
                                          crashes=CS#child_state.crashes+1, retries=0},
            do_start_child(Crashed, State);
        first_retry ->
            Retrying = Stamped#child_state{state=retrying, retries=1,
                                           retry_time=z_utils:now()},
            State#state{retrying=[Retrying|State#state.retrying]};
        retry ->
            Retrying = Stamped#child_state{state=retrying,
                                           retries=CS#child_state.retries+1,
                                           retry_time=z_utils:now()},
            State#state{retrying=[Retrying|State#state.retrying]};
        fail ->
            Failed = Stamped#child_state{state=failed, fail_time=z_utils:now()},
            State#state{failed=[Failed|State#state.failed]}
    end.
375
376
%% @doc Remove the child with Name from the first queue it is found in.
%% The queues are searched in the order waiting, running, retrying, failed,
%% stopped. Returns {ChildState, NewState}, or error when the name is unknown.
do_remove_child(Name, State) ->
    %% Positions of the queue fields in the #state{} tuple
    Queues = [#state.waiting, #state.running, #state.retrying, #state.failed, #state.stopped],
    lists:foldl(
        fun(Field, error) ->
                case remove_by_name(Name, element(Field, State), []) of
                    {CS, Remaining} -> {CS, setelement(Field, State, Remaining)};
                    error -> error
                end;
           (_Field, Found) ->
                %% Already found in an earlier queue
                Found
        end,
        error,
        Queues).
399
400
%% @doc Start a single child by applying its {Module, Function, Args}.
%% Returns {ok, Pid} when the start function returned {ok, Pid}, and
%% {ok, undefined} when it returned ignore. {error, What} is returned as is,
%% any other return value What gives {error, What}.
%% An exception raised by the start function gives {error, {Class, Reason}}.
%% Only the call itself is protected, so a thrown value can never be mistaken
%% for a return value of the start function.
%% @spec start_child_mfa({atom(), atom(), list()}) -> {ok, pid() | undefined} | {error, term()}
start_child_mfa({M,F,A}) ->
    try apply(M, F, A) of
        {ok, Pid} when is_pid(Pid) ->
            {ok, Pid};
        ignore ->
            {ok, undefined};
        {error, What} ->
            {error, What};
        What ->
            {error, What}
    catch
        Class:Reason ->
            {error, {Class, Reason}}
    end.
413
414
%% @doc Check if a child is not crashing or retrying too often.
%% The verdict depends on the state the child was running in:
%% running_from_failed and running_from_retry children that ran longer than
%% their eternal_retry/period_retry period are judged as normally running
%% children; within that period they go back to the failed or retry queue.
%% Other children are restarted until they crashed more than 'crashes' times
%% within 'period', after which they are queued for a retry.
%% @spec may_restart(#child_state{}) -> first_restart | restart | first_retry | retry | fail
may_restart(#child_state{state=running_from_failed, fail_time=FailTime, child=Child} = CS) ->
    %% The state name must be the one set by handle_failed_children/1.
    #child_spec{eternal_retry=Period} = Child,
    case z_utils:now() - FailTime > Period of
        true ->
            %% Did run for longer than a (normal) retry period, reset to normal restarting behaviour
            may_restart(CS#child_state{state=running});
        false ->
            %% A restart within the retry/fail period, maybe queue for another eternal retry
            fail
    end;
may_restart(#child_state{state=running_from_retry, retry_time=RetryTime, retries=Retries, child=Child} = CS) ->
    #child_spec{period_retries=MaxRetries, period_retry=Period} = Child,
    case z_utils:now() - RetryTime > Period of
        true ->
            %% Did run for longer than a retry period, reset to new restarting behaviour
            may_restart(CS#child_state{state=running});
        false ->
            %% A restart within the retry period, maybe queue for another retry
            case Retries > MaxRetries of
                true -> fail;
                false -> retry
            end
    end;
may_restart(#child_state{crash_time=undefined}) ->
    %% Never crashed before
    first_restart;
may_restart(#child_state{crash_time=CrashTime, crashes=Crashes, child=Child}) ->
    #child_spec{period=Period, crashes=MaxCrashes} = Child,
    case z_utils:now() - CrashTime  > Period of
        true ->
            %% The previous crash period is over, start counting again
            first_restart;
        false ->
            case Crashes > MaxCrashes of
                true -> first_retry;
                false -> restart
            end
    end.
453
454
%% @doc Mark the child as stopped (with a fresh timestamp) and put it at the
%% head of the stopped queue.
append_stopped(CS, #state{stopped=Stopped} = State) ->
    State#state{stopped=[CS#child_state{state=stopped, time=erlang:localtime()} | Stopped]}.
459
460
%% @doc Remove the child with the given pid from the running queue.
%% The manager is notified of the exit. Returns the child state with its pid
%% reset to undefined and the new supervisor state, or error when no running
%% child has this pid.
remove_running_pid(Pid, #state{running=Running} = State) ->
    case remove_by_pid(Pid, Running, []) of
        error ->
            error;
        {CS, StillRunning} ->
            notify_exit(CS, State),
            {CS#child_state{pid=undefined}, State#state{running=StillRunning}}
    end.
470
471
%% @doc Take the first child with the given name out of a list of children.
%% Returns {ChildState, OtherChildren} or error when there is no such child.
%% The children in front of the found child are appended, in reversed order and
%% followed by Acc, to the children behind it.
remove_by_name(Name, Children, Acc) ->
    IsOther = fun(#child_state{name=N}) when N =:= Name -> false;
                 (_) -> true
              end,
    case lists:splitwith(IsOther, Children) of
        {_All, []} ->
            error;
        {Before, [CS|After]} ->
            {CS, After ++ lists:reverse(Before, Acc)}
    end.
478
479
%% @doc Take the first child with the given pid out of a list of children.
%% Returns {ChildState, OtherChildren} or error when there is no such child.
%% The children in front of the found child are appended, in reversed order and
%% followed by Acc, to the children behind it.
remove_by_pid(Pid, Children, Acc) ->
    IsOther = fun(#child_state{pid=P}) when P =:= Pid -> false;
                 (_) -> true
              end,
    case lists:splitwith(IsOther, Children) of
        {_All, []} ->
            error;
        {Before, [CS|After]} ->
            {CS, After ++ lists:reverse(Before, Acc)}
    end.
486
487
%% @doc Check if a child with the given name is in the running queue.
is_running(Name, #state{running=Running}) ->
    is_member(Name, Running).
491
%% @doc Check if a child with the given name is present in any of the queues.
exists(Name, #state{waiting=Waiting, running=Running, retrying=Retrying,
                    failed=Failed, stopped=Stopped}) ->
    lists:any(fun(Queue) -> is_member(Name, Queue) end,
              [Waiting, Running, Retrying, Failed, Stopped]).
499
500
%% @doc Check if a list of children contains a child with the given name.
is_member(Name, Children) ->
    lists:any(fun(#child_state{name=N}) -> N =:= Name;
                 (_) -> false
              end,
              Children).
504
505
506
%% @doc Terminate the process of a child when it has one. The manager is
%% notified before the process is shut down using the shutdown setting of the
%% child spec. Returns the result of shutdown/2, or {error, no_process} when
%% the child has no pid.
shutdown_child(ChildState, State) ->
    case ChildState of
        #child_state{pid=Pid, child=Spec} when is_pid(Pid) ->
            notify_exit(ChildState, State),
            shutdown(Pid, Spec#child_spec.shutdown);
        _NotRunning ->
            {error, no_process}
    end.
513
514
%% @doc Cast {supervisor_child_stopped, ChildSpec, Pid} to the manager process.
%% Does nothing (returns nop) when no manager pid is set.
notify_exit(ChildState, #state{manager_pid=Manager}) ->
    case Manager of
        undefined ->
            nop;
        _ ->
            Stopped = {supervisor_child_stopped, ChildState#child_state.child, ChildState#child_state.pid},
            gen_server:cast(Manager, Stopped)
    end.
520
521
%% @doc Cast {supervisor_child_started, ChildSpec, Pid} to the manager process.
%% Does nothing (returns nop) when no manager pid is set.
notify_start(ChildState, #state{manager_pid=Manager}) ->
    case Manager of
        undefined ->
            nop;
        _ ->
            Started = {supervisor_child_started, ChildState#child_state.child, ChildState#child_state.pid},
            gen_server:cast(Manager, Started)
    end.
527
528
%%-----------------------------------------------------------------
%% shutdown/2 and monitor_child/1 are from supervisor.erl
%% Copyright Ericsson AB 1996-2010. All Rights Reserved.
%%
%% Shuts down a child process. The exit reason of the child is checked,
%% because the child might have died for another reason than the one we
%% asked for; in that case the error is returned so it can be reported.
%% The link to the child is replaced by a monitor and we wait for the
%% 'DOWN' message instead of the 'EXIT' message: a "naughty" child that
%% did unlink(Sup) would otherwise hang the supervisor.
%%
%% shutdown(Pid, brutal_kill) kills the child unconditionally.
%% shutdown(Pid, Time) first sends a 'shutdown' exit signal and kills the
%% child when it is still alive after Time.
%% Returns: ok | {error, OtherReason}  (this should be reported)
%%-----------------------------------------------------------------
shutdown(Pid, brutal_kill) ->
    case monitor_child(Pid) of
        {error, Reason} ->
            {error, Reason};
        ok ->
            exit(Pid, kill),
            receive
                {'DOWN', _MRef, process, Pid, DownReason} ->
                    case DownReason of
                        killed -> ok;
                        _ -> {error, DownReason}
                    end
            end
    end;
shutdown(Pid, Time) ->
    case monitor_child(Pid) of
        {error, Reason} ->
            {error, Reason};
        ok ->
            %% Try to shutdown gracefully
            exit(Pid, shutdown),
            receive
                {'DOWN', _MRef, process, Pid, DownReason} ->
                    case DownReason of
                        shutdown -> ok;
                        _ -> {error, DownReason}
                    end
            after Time ->
                %% Force termination.
                exit(Pid, kill),
                receive
                    {'DOWN', _MRef2, process, Pid, KillReason} ->
                        {error, KillReason}
                end
            end
    end.
574
575
%% Help function for shutdown/2: switches from the link to the monitor approach.
%% Returns ok when the child can be shut down by the caller, or {error, Reason}
%% when the child exited (with Reason) before the link was removed.
monitor_child(Pid) ->
    %% The monitor is set before the unlink: when the child dies before the
    %% monitor is in place the 'DOWN' message has reason noproc, but the real
    %% reason is still delivered in the 'EXIT' message (unless a naughty
    %% child has already done unlink...).
    erlang:monitor(process, Pid),
    unlink(Pid),
    receive
        {'EXIT', Pid, ExitReason} ->
            %% The child died before the unlink: empty the mailbox of both
            %% the 'EXIT' message and the 'DOWN' message.
            receive
                {'DOWN', _, process, Pid, _} ->
                    {error, ExitReason}
            end
    after 0 ->
        %% No 'EXIT' message is waiting. When a naughty child did unlink and
        %% died before the monitor, shutdown/2 receives a 'DOWN' message
        %% with reason noproc. When the child dies after the unlink there
        %% will be a 'DOWN' message with the correct reason, which is
        %% handled in shutdown/2.
        ok
    end.
603