How nice should I be on exit?

Tue Mar 4 14:04:53 CET 2003

On Mon, 3 Mar 2003, Vance Shipley wrote:

>Following the Erlang philosophy we let crashes occur when
>there are things going wrong.  A process should die when it
>is abused, this is normal.  It takes some getting used to
>though as we tend to want to program defensively.

Yes, and yes, and yes.  (:

>Now I'm asking myself whether I should worry about cleaning
>up when a process is going to die normally.  In this case I
>have set up monitoring of a number of other processes and
>stored the references.  It's a small pain to go through the
>list, pull out the references and run erlang:demonitor/1 on
>them.  I'm thinking that in all likelihood it would be more
>effecient (oops!) to let the run time system clean up after
>me.  The same could apply to ets tables, ???, ...

Anything that the system cleans up automatically should be
left to the system to clean up. As a general rule (in my
opinion), you should introduce as little code as possible to
react to software errors. For every cleanup or fault
containment procedure you add, chances are that you'll
compound the error by making further mistakes.

You should start with the simplest possible mechanism (let
it crash, and rely on the system/infrastructure for cleanup
and recovery). Then test your system through fault insertion
in order to see what happens as processes are killed. If
your simple recovery procedures are not sufficient in
practice, put some energy into fine-tuning them where it
matters.

I've attached a small program that runs through an OTP
supervision tree killing processes and checking to see if
they bounce back, then proceeding up to the next higher
level. Perhaps it might be of some help to you.

My two cents.

/Uffe
-- 
Ulf Wiger, Senior Specialist,
   / / /   Architecture & Design of Carrier-Class Software
  / / /    Strategic Product & System Management
 / / /     Ericsson AB, Connectivity and Control Nodes
-------------- next part --------------
%%%
%%% The contents of this file are subject to the Erlang Public License,
%%% Version 1.0, (the "License"); you may not use this file except in
%%% compliance with the License. You may obtain a copy of the License at
%%% http://www.erlang.org/license/EPL1_0.txt
%%%
%%% Software distributed under the License is distributed on an "AS IS"
%%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%%% the License for the specific language governing rights and limitations
%%% under the License.
%%%
%%% The Original Code is ape_test-1.0
%%%
%%% The Initial Developer of the Original Code is Ericsson Telecom
%%% AB. Portions created by Ericsson are Copyright (C), 1998, Ericsson
%%% Telecom AB. All Rights Reserved.
%%%
%%% Contributor(s): ______________________________________.

%%%----------------------------------------------------------------------
%%% #0.    BASIC INFORMATION
%%%----------------------------------------------------------------------
%%% File:       aptest.erl
%%% Author       : Ulf Wiger <ulf.wiger@REDACTED>
%%% Description  : Fault insertion testing for OTP supervision trees
%%% 
%%% Modules used : application, application_controller, ets, error_logger
%%%                lists, supervisor
%%%----------------------------------------------------------------------

-module(aptest).
-date('97-12-13').
-author('ulf.wiger@REDACTED').

-export([start/0, start/2, start/3]).
-export([find_pids/0]).

-compile(export_all).

-import(error_logger, [info_msg/2, error_msg/2]).

-define(TAB, apTestProcs).

%% -----------------------------------------------------------------------
%%
%% start(Interval, Reason, Applications) ->
%%
%% Interval     ::= integer()
%% Reason       ::= term()
%% Applications ::= [atom()]
%%
%% Starts issuing exit(Pid, Reason) - one every Interval milliseconds -
%% for all registered and supervised processes in Applications.
%% The killing starts at the bottom and goes through one level at a 
%% time across Applications. After finishing one level, it checks
%% to see if the processes have been restarted. It then moves on to
%% the next higher level.
%%
%% Progress is reported via error_logger:info_msg/2.
%% Unexpected events are reported via error_logger:error_msg/2.
%% -----------------------------------------------------------------------
%% Entry point for a chosen set of applications.
%% find_pids/1 indexes the registered, supervised processes of Apps and
%% yields the deepest level found; kill_pids/3 then works from that level
%% upwards. Interval ends up as the timeout of sleep/1, i.e. milliseconds.
start(Interval, Reason, Apps) ->
    kill_pids(find_pids(Apps), Interval, Reason).

%% -----------------------------------------------------------------------
%% start(Interval, Reason)
%%
%% Same as above, but for all applications.
%% -----------------------------------------------------------------------
%% Runs start/3 against every application currently reported by
%% application:which_applications/0.
start(Interval, Reason) ->
    Running = application:which_applications(),
    start(Interval, Reason, [Name || {Name, _, _} <- Running]).

%% -----------------------------------------------------------------------
%% start()
%%
%% Interval = 30 secs
%% Reason   = aptest
%% Apps     = <all applications>
%% -----------------------------------------------------------------------
%% Runs start/2 with the defaults: 30000 ms between kills and the
%% exit reason 'aptest'.
start() ->
    DefaultInterval = 30000,
    DefaultReason = aptest,
    start(DefaultInterval, DefaultReason).

%%% ---------------------------------------------------------
%%% Internal functions
%%% ---------------------------------------------------------

%% -----------------------------------------------------------------------
%% find_pids(Apps) -> MaxLevel
%%
%% finds all registered processes in the supervision tree for Apps
%% and stores them in an ets table.
%% Returns the maximum depth of the processed supervision tree.
%% -----------------------------------------------------------------------
%% find_pids() -> MaxLevel
%%
%% Same as find_pids/1, for all applications reported by
%% application:which_applications/0.
find_pids() ->
    %% No new_ets() call here: find_pids/1 resets the table itself, so
    %% creating it first only produced a table that was deleted at once.
    Apps = [A || {A, _, _} <- application:which_applications()],
    find_pids(Apps).

%% find_pids(Apps) -> MaxLevel
%%
%% Resets the ets table, records the registered processes found in the
%% supervision trees of Apps, and returns the deepest level visited.
%% Returns 0 when no level was visited at all (Apps is empty, or none of
%% the applications has a supervision tree with children); kill_pids/3
%% treats level 0 as "nothing to do".
find_pids(Apps) ->
    new_ets(),
    Levels = find_pids1(Apps),
    %% The leading 0 keeps lists:max/1 from failing on an empty list.
    lists:max([0 | lists:flatten(Levels)]).

%% find_pids1(Apps) -> [DeepListOfLevels]
%%
%% For each application whose application master is a pid, walks the
%% supervision tree below the master's child (starting at level 1) and
%% contributes the nested list of levels returned by get_procs/3.
%% Applications for which get_master/1 returns anything else are skipped.
find_pids1([App|T]) ->
    case application_controller:get_master(App) of
        %% is_pid/1 replaces the obsolete pid/1 guard test.
        Pid when is_pid(Pid) ->
            {Root, _} = application_master:get_child(Pid),
            [get_procs(supervisor:which_children(Root), App, 1)|
             find_pids1(T)];
        _ ->
            find_pids1(T)
    end;
find_pids1([]) ->
    [].

%% get_procs(Children, App, Level) -> [] | [Level | SubLevels]
%%
%% Children is a supervisor:which_children/1 result. Every child at this
%% level is offered to maybe_register/3, then each supervisor child whose
%% entry holds a pid is descended into at Level+1. The result is a deep
%% list of the level numbers visited; an empty child list yields [].
get_procs([], _App, _Level) ->
    [];
get_procs(Children, App, Level) ->
    {Workers, Supers} = check_children(Children),
    %% Registration is done purely for its side effect on the ets table.
    lists:foreach(fun(P) -> maybe_register(P, App, Level) end,
                  Workers ++ Supers),
    %% Supervisor entries that do not hold a pid are not descended into.
    [Level | [get_procs(supervisor:which_children(SPid), App, Level + 1)
              || {_, SPid} <- Supers, is_pid(SPid)]].

%% check_children(Children) -> {Workers, Supervisors}
%%
%% Splits a supervisor:which_children/1 style list of
%% {Name, Pid, Type, Modules} tuples into two lists of {Name, Pid}
%% pairs, one for Type = worker and one for Type = supervisor.
%% The original order is preserved within each list.
check_children(Ch) ->
    check_children(Ch, [], []).

%% Accumulators are built in reverse with cons and reversed once at the
%% end, instead of appending with ++ on every step (quadratic).
check_children([{Name, Pid, worker, _}|Ch], Ws, Ss) ->
    check_children(Ch, [{Name, Pid}|Ws], Ss);
check_children([{Name, Pid, supervisor, _}|Ch], Ws, Ss) ->
    check_children(Ch, Ws, [{Name, Pid}|Ss]);
check_children([], Ws, Ss) ->
    {lists:reverse(Ws), lists:reverse(Ss)}.

%% -----------------------------------------------------------------------
%% maybe_register({Name, Pid}, AppName, Level) -> true | false
%%
%% Store process in ets, only if it has a registered name
%% The reason is that we can't keep up with anonymous pids anyway,
%% since pids will change as the processes restart,
%% so it gets too complicated to include them in the killing spree.
%% -----------------------------------------------------------------------
%% Inserts {RegisteredName, App, Level} into the ets table when Pid is a
%% pid that currently has a registered name, and returns true.
%% Returns false for anything else: a child entry without a pid, a
%% process without a registered name, or a process_info/2 result other
%% than {registered_name, N}.
%% The child's supervisor-local name is not used; only the registered
%% name reported by process_info/2 is stored.
maybe_register({_Name, Pid}, App, Level) when is_pid(Pid) ->
    %% is_pid/1 replaces the obsolete pid/1 guard test.
    case process_info(Pid, registered_name) of
        {registered_name, N} ->
            ets:insert(?TAB, {N, App, Level}),
            true;
        _ ->
            false
    end;
maybe_register(_, _, _) ->
    false.

%% kill_pids(Level, Interval, Reason) -> done
%%
%% Handles one level of the recorded tree per call, from Level down to 1:
%% fetches the processes stored for that level, kills them through
%% do_kill_pids/3, then compares the registered names and their pids
%% before and after and reports the outcome via info_msg/2.
kill_pids(Level, Interval, Reason) when Level > 0 ->
    Victims = ets:match_object(?TAB, {'_','_',Level}),
    info_msg("Procs to kill at level ~p: ~p.~n", [Level, Victims]),
    %% Snapshot of every registered name and its pid before the kills.
    NamesBefore = registered(),
    PidsBefore = [{Name, whereis(Name)} || Name <- NamesBefore],
    do_kill_pids(Victims, Interval, Reason),
    NamesAfter = registered(),
    Disappeared = NamesBefore -- NamesAfter,
    Unchanged = same_pids(PidsBefore),
    %% Targets at this level whose name still maps to the same pid.
    StayedUp = [Name || {Name, _App, _Lvl} <- Victims,
                        lists:member(Name, Unchanged)],
    %% Every name registered now that does not map to its earlier pid.
    Restarted = NamesAfter -- Unchanged,
    info_msg("Done killing procs at level ~p.~n"
             "  Stayed up:   ~p~n"
             "  Restarted:   ~p~n"
             "  Disappeared: ~p~n",
             [Level, StayedUp, Restarted, Disappeared]),
    kill_pids(Level - 1, Interval, Reason);
kill_pids(_, _, _) ->
    done.

%% do_kill_pids(Procs, Interval, Reason) -> done
%%
%% Goes through the {Name, App, Level} entries in order and sends an
%% exit signal with Reason to each name that is still registered,
%% sleeping Interval after every signal sent.
do_kill_pids([], _Interval, _Reason) ->
    done;
do_kill_pids([{Name, App, _Level}|Rest], Interval, Reason) ->
    kill_registered(Name, whereis(Name), App, Interval, Reason),
    do_kill_pids(Rest, Interval, Reason).

%% Handles a single entry. A name that no longer resolves to a process
%% is only reported through error_msg/2; no sleep happens in that case.
kill_registered(Name, undefined, _App, _Interval, _Reason) ->
    error_msg("~p no longer registered.~n",
              [Name]);
kill_registered(Name, Pid, App, Interval, Reason) ->
    info_msg("Killing ~p (~p, app = ~p), Reason = ~p.~n",
             [Name, Pid, App, Reason]),
    exit(Pid, Reason),
    sleep(Interval).

%% same_pids([{Name, Pid}]) -> [Name]
%%
%% Keeps, in order, the names for which whereis/1 still returns a value
%% equal to the Pid recorded alongside the name.
same_pids(NamePids) ->
    lists:filtermap(
      fun({Name, Pid}) ->
              case whereis(Name) == Pid of
                  true  -> {true, Name};
                  false -> false
              end
      end,
      NamePids).

%% Creates a fresh, empty ?TAB table (set, public, named), first deleting
%% any table that already exists under that name.
%% Returns the result of ets:new/2.
new_ets() ->
    drop_existing(ets:info(?TAB)),
    ets:new(?TAB, [set, public, named_table]).

%% ets:info/1 returning undefined means there is no such table to delete.
drop_existing(undefined) ->
    ok;
drop_existing(_Info) ->
    ets:delete(?TAB).

%% sleep(Millis) -> ok
%%
%% Suspends the calling process for Millis milliseconds. The receive has
%% no message clauses, so nothing is taken out of the mailbox.
sleep(Millis) ->
    receive after Millis -> ok end.